From 2adb6ef7cb3b87e7f4bbbb6d218c2a289d03ff29 Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Fri, 26 Jan 2024 22:05:55 -0800 Subject: [PATCH 001/116] refactoring to suppport different executor backends + tokio backend --- Cargo.toml | 4 +- .../async_comparison.rs | 8 +- examples/array_examples/array_put_get.rs | 2 +- examples/array_examples/global_lock_array.rs | 6 +- .../global_lock_atomic_array_put_bw.rs | 3 +- .../local_lock_atomic_array_put_bw.rs | 4 +- .../bandwidths/task_group_futures_am_bw.rs | 2 +- examples/darc_examples/darc.rs | 6 +- examples/darc_examples/string_darc.rs | 29 +- examples/kernels/dft_proxy.rs | 12 +- .../safe_parallel_blocked_array_gemm.rs | 2 +- src/active_messaging.rs | 90 ++- src/active_messaging/batching.rs | 76 +-- .../batching/simple_batcher.rs | 188 +++--- .../batching/team_am_batcher.rs | 313 +++++----- .../registered_active_message.rs | 100 ++- src/array.rs | 36 +- src/array/atomic.rs | 10 +- src/array/generic_atomic.rs | 15 +- src/array/global_lock_atomic.rs | 362 +++++------ src/array/global_lock_atomic/iteration.rs | 15 +- .../distributed_iterator/consumer/count.rs | 4 +- .../distributed_iterator/consumer/reduce.rs | 2 +- .../iterator/one_sided_iterator/buffered.rs | 4 - src/array/local_lock_atomic.rs | 354 ++++------- src/array/local_lock_atomic/iteration.rs | 21 +- src/array/native_atomic.rs | 15 +- src/array/operations.rs | 32 +- src/array/read_only.rs | 15 +- src/array/unsafe.rs | 21 +- src/array/unsafe/operations.rs | 142 ++--- src/barrier.rs | 29 +- src/darc.rs | 28 +- src/darc/global_rw_darc.rs | 407 ++++++------- src/darc/local_rw_darc.rs | 337 +++++------ src/lamellae/command_queues.rs | 5 +- src/lamellae/rofi_lamellae.rs | 2 +- src/lamellae/shmem_lamellae.rs | 2 +- src/lamellar_request.rs | 2 +- src/lamellar_task_group.rs | 7 +- src/lamellar_team.rs | 12 +- src/lamellar_world.rs | 65 +- src/lib.rs | 2 +- src/scheduler.rs | 340 ++++++++--- src/scheduler/numa_work_stealing.rs | 7 +- src/scheduler/numa_work_stealing2.rs | 2 +- src/scheduler/tokio.rs | 88 +++ src/scheduler/work_stealing.rs | 567 ++---------------- tests/array/arithmetic_ops/add_test.rs | 4 +- tests/array/arithmetic_ops/fetch_add_test.rs | 4 +- 50 files changed, 1684 insertions(+), 2119 deletions(-) create mode 100644 src/scheduler/tokio.rs diff --git a/Cargo.toml b/Cargo.toml index 4401c70f..022c8fb1 100755 --- a/Cargo.toml +++ b/Cargo.toml @@ -53,6 +53,7 @@ itertools = "0.10.5" serde_with = "3.0.0" pin-weak = "1.1.0" async-lock = "2.8.0" +tokio = { version = "1.35.1", features = ["full"] , optional = true} [dev-dependencies] @@ -74,13 +75,14 @@ members = ["impl"] #features are strictly additive.... 
can't have mutual exclusitivity [features] enable-rofi=["rofisys", "libc"] +tokio-executor=["tokio"] slurm-test=[] default=[] [profile.release] opt-level=3 -lto=true +lto=false codegen-units=1 debug = true diff --git a/examples/active_message_examples/async_comparison.rs b/examples/active_message_examples/async_comparison.rs index cd97397b..510e68cc 100644 --- a/examples/active_message_examples/async_comparison.rs +++ b/examples/active_message_examples/async_comparison.rs @@ -93,7 +93,9 @@ fn main() { for _i in 0..10 { std_am_group.add_am_all(std_am.clone()); //launch multiple tasks asyncronously } - world.block_on(std_am_group.exec()); + world.block_on(async move { + std_am_group.exec().await; + }); println!( "time for std sleep tasks: {:?}", timer.elapsed().as_secs_f64() @@ -106,7 +108,9 @@ fn main() { for _i in 0..10 { async_am_group.add_am_all(async_am.clone()); //launch multiple tasks asyncronously } - world.block_on(async_am_group.exec()); + world.block_on(async move { + async_am_group.exec().await; + }); println!( "time for async sleep tasks: {:?}", timer.elapsed().as_secs_f64() diff --git a/examples/array_examples/array_put_get.rs b/examples/array_examples/array_put_get.rs index fceb7ec5..d162d171 100644 --- a/examples/array_examples/array_put_get.rs +++ b/examples/array_examples/array_put_get.rs @@ -20,7 +20,7 @@ fn initialize_mem_region(memregion: &LamellarMemoryRegion) { fn main() { let args: Vec = std::env::args().collect(); let world = lamellar::LamellarWorldBuilder::new().build(); - world.block_on(async { + world.clone().block_on(async move { let _num_pes = world.num_pes(); let my_pe = world.my_pe(); let total_len = args diff --git a/examples/array_examples/global_lock_array.rs b/examples/array_examples/global_lock_array.rs index 81c0420c..8b904396 100644 --- a/examples/array_examples/global_lock_array.rs +++ b/examples/array_examples/global_lock_array.rs @@ -9,7 +9,7 @@ fn main() { let array = GlobalLockArray::::new(&world, 100, Distribution::Block); let s = Instant::now(); - let local_data = array.block_on(array.read_local_data()); + let local_data = array.blocking_read_local_data(); println!( "PE{my_pe} time: {:?} {:?}", s.elapsed().as_secs_f64(), @@ -19,7 +19,7 @@ fn main() { drop(local_data); //release the lock world.barrier(); - let mut local_data = array.block_on(array.write_local_data()); + let mut local_data = array.blocking_write_local_data(); println!( "PE{my_pe} time: {:?} got write lock", s.elapsed().as_secs_f64() @@ -31,7 +31,7 @@ fn main() { array.print(); println!("PE{my_pe} time: {:?} done", s.elapsed().as_secs_f64()); - let mut local_data = array.block_on(array.collective_write_local_data()); + let mut local_data = array.blocking_collective_write_local_data(); println!( "PE{my_pe} time: {:?} got collective write lock", s.elapsed().as_secs_f64() diff --git a/examples/bandwidths/global_lock_atomic_array_put_bw.rs b/examples/bandwidths/global_lock_atomic_array_put_bw.rs index 7c185123..fa3f257e 100644 --- a/examples/bandwidths/global_lock_atomic_array_put_bw.rs +++ b/examples/bandwidths/global_lock_atomic_array_put_bw.rs @@ -66,9 +66,8 @@ fn main() { array.barrier(); let cur_t = timer.elapsed().as_secs_f64(); if my_pe == 0 { - // let array_slice = unsafe { array.read_local_data() }; //unlike for unsafe arrays, accessing the local data captures a read lock, this would prevent any writes from happening. 
for j in (0..2_u64.pow(exp) as usize).step_by(num_bytes as usize) { - let local_data = array.block_on(array.read_local_data()); + let local_data = array.blocking_read_local_data(); while *(&local_data[(j + num_bytes as usize) - 1]) == 255 as u8 { println!( "this should not happen {:?}", diff --git a/examples/bandwidths/local_lock_atomic_array_put_bw.rs b/examples/bandwidths/local_lock_atomic_array_put_bw.rs index 3ebefdfb..18fa1078 100644 --- a/examples/bandwidths/local_lock_atomic_array_put_bw.rs +++ b/examples/bandwidths/local_lock_atomic_array_put_bw.rs @@ -66,9 +66,9 @@ fn main() { array.barrier(); let cur_t = timer.elapsed().as_secs_f64(); if my_pe == num_pes - 1 { - // let array_slice = unsafe { array.read_local_data() }; //unlike for unsafe arrays, accessing the local data captures a read lock, this would prevent any writes from happening. for j in (0..2_u64.pow(exp) as usize).step_by(num_bytes as usize) { - let local_data = array.block_on(array.read_local_data()); + let array_clone = array.clone(); + let local_data = array.blocking_read_local_data(); while *(&local_data[(j + num_bytes as usize) - 1]) == 255 as u8 { println!( "this should not happen {:?}", diff --git a/examples/bandwidths/task_group_futures_am_bw.rs b/examples/bandwidths/task_group_futures_am_bw.rs index 2dce157e..d07d0abc 100644 --- a/examples/bandwidths/task_group_futures_am_bw.rs +++ b/examples/bandwidths/task_group_futures_am_bw.rs @@ -64,7 +64,7 @@ fn main() { cnt += 1; } println!("issue time: {:?}", timer.elapsed().as_secs_f64() - sub_time); - world.block_on(task_group.exec()); + world.block_on(async move { task_group.exec().await }); } world.barrier(); diff --git a/examples/darc_examples/darc.rs b/examples/darc_examples/darc.rs index 6f7a981a..75bc18f3 100644 --- a/examples/darc_examples/darc.rs +++ b/examples/darc_examples/darc.rs @@ -61,10 +61,10 @@ fn main() { let global_darc = GlobalRwDarc::new(world.team(), 0).unwrap(); println!("here 2"); - let read_lock = world.block_on(global_darc.read()); + let read_lock = global_darc.blocking_read(); println!("I have the read lock!!!! {:?}", my_pe); drop(read_lock); - let write_lock = world.block_on(global_darc.write()); + let write_lock = global_darc.blocking_write(); println!("I have the write lock!!!! 
{:?}", my_pe); std::thread::sleep(std::time::Duration::from_secs(1)); drop(write_lock); @@ -112,7 +112,7 @@ fn main() { println!("here 8"); } else { // println!("here"); - *(*world.block_on(local_darc.write())) += 1; + *local_darc.blocking_write() += 1; } } // -------- diff --git a/examples/darc_examples/string_darc.rs b/examples/darc_examples/string_darc.rs index 84cc74c2..37bf7cbb 100644 --- a/examples/darc_examples/string_darc.rs +++ b/examples/darc_examples/string_darc.rs @@ -19,19 +19,22 @@ impl LamellarAm for StringDarcAm { fn main() { let world = lamellar::LamellarWorldBuilder::new().build(); let my_pe = world.my_pe(); - let string_data = LocalRwDarc::new(&world, format!("Orig String on PE: {}", my_pe)).unwrap(); + world.clone().block_on(async move { + let string_data = + LocalRwDarc::new(&world, format!("Orig String on PE: {}", my_pe)).unwrap(); - println!("[PE: {}] {}", my_pe, world.block_on(string_data.read())); + println!("[PE: {}] {}", my_pe, string_data.read().await); - if my_pe == 0 { - world.block_on(world.exec_am_pe( - 1, - StringDarcAm { - new_data: String::from("Modified string from 0"), - data: string_data.clone(), - }, - )); - } - world.barrier(); - println!("[PE: {}] {}", my_pe, world.block_on(string_data.read())); + if my_pe == 0 { + world.block_on(world.exec_am_pe( + 1, + StringDarcAm { + new_data: String::from("Modified string from 0"), + data: string_data.clone(), + }, + )); + } + world.barrier(); + println!("[PE: {}] {}", my_pe, string_data.read().await); + }); } diff --git a/examples/kernels/dft_proxy.rs b/examples/kernels/dft_proxy.rs index 0b2189ac..f0357a0a 100644 --- a/examples/kernels/dft_proxy.rs +++ b/examples/kernels/dft_proxy.rs @@ -203,6 +203,7 @@ fn dft_lamellar_am_group( ); } let spec = spectrum.clone(); + let world_clone = world.clone(); pe_groups.push_back(async move { let res = local_sum_group.exec().await; let vec = (0..local_len) @@ -214,7 +215,7 @@ fn dft_lamellar_am_group( } }) .collect::>(); - world + world_clone .exec_am_pe( pe, RemoteSumAM { @@ -225,7 +226,7 @@ fn dft_lamellar_am_group( .await; }); } - world.block_on(pe_groups.collect::>()); + world.block_on(async move { pe_groups.collect::>().await }); world.barrier(); let time = timer.elapsed().as_secs_f64(); @@ -261,6 +262,7 @@ fn dft_lamellar_am_group_static( ); } let spec = spectrum.clone(); + let world_clone = world.clone(); pe_groups.push_back(async move { let res = local_sum_group.exec().await; let vec = (0..local_len) @@ -272,7 +274,7 @@ fn dft_lamellar_am_group_static( } }) .collect::>(); - world + world_clone .exec_am_pe( pe, RemoteSumAM { @@ -283,7 +285,9 @@ fn dft_lamellar_am_group_static( .await; }); } - world.block_on(pe_groups.collect::>()); + world.block_on(async move { + pe_groups.collect::>().await; + }); world.barrier(); let time = timer.elapsed().as_secs_f64(); diff --git a/examples/kernels/safe_parallel_blocked_array_gemm.rs b/examples/kernels/safe_parallel_blocked_array_gemm.rs index 2d976495..5a34398e 100644 --- a/examples/kernels/safe_parallel_blocked_array_gemm.rs +++ b/examples/kernels/safe_parallel_blocked_array_gemm.rs @@ -189,7 +189,7 @@ fn main() { ); } - let mut c_slice = c.block_on(c.write_local_data()); //this locks the array + let mut c_slice = c.blocking_write_local_data(); //this locks the array for row in 0..blocksize { let row_offset = (i_blk * blocksize + row) * n; diff --git a/src/active_messaging.rs b/src/active_messaging.rs index e4f1ec92..afc8ab6c 100644 --- a/src/active_messaging.rs +++ b/src/active_messaging.rs @@ -638,7 +638,7 @@ use 
crate::lamellar_arch::IdError; use crate::lamellar_request::{InternalResult, LamellarRequestResult}; use crate::lamellar_team::{LamellarTeam, LamellarTeamRT}; use crate::memregion::one_sided::NetMemRegionHandle; -use crate::scheduler::{ReqId, SchedulerQueue}; +use crate::scheduler::{Executor, LamellarExecutor, ReqId}; // use log::trace; use async_trait::async_trait; use futures::Future; @@ -856,9 +856,6 @@ pub(crate) enum Am { Return(ReqMetaData, LamellarArcAm), //req data, am to return and execute Data(ReqMetaData, LamellarResultArc), //req data, data to return Unit(ReqMetaData), //req data - _BatchedReturn(ReqMetaData, LamellarArcAm, ReqId), //req data, am to return and execute, batch id - _BatchedData(ReqMetaData, LamellarResultArc, ReqId), //req data, data to return, batch id - _BatchedUnit(ReqMetaData, ReqId), //req data, batch id } impl std::fmt::Debug for Am { @@ -870,9 +867,6 @@ impl std::fmt::Debug for Am { Am::Return(_, _) => write!(f, "Return"), Am::Data(_, _) => write!(f, "Data"), Am::Unit(_) => write!(f, "Unit"), - Am::_BatchedReturn(_, _, _) => write!(f, "BatchedReturn"), - Am::_BatchedData(_, _, _) => write!(f, "BatchedData"), - Am::_BatchedUnit(_, _) => write!(f, "BatchedUnit"), } } } @@ -1178,27 +1172,25 @@ pub trait ActiveMessaging { /// world_clone.exec_am_all(Am{val: buf[0] as usize}).await; /// }); ///``` - fn block_on(&self, f: F) -> F::Output - where - F: Future; + fn block_on(&self, f: F) -> F::Output; } #[async_trait] pub(crate) trait ActiveMessageEngine { async fn process_msg( - &self, + self, am: Am, - scheduler: &(impl SchedulerQueue + Sync + std::fmt::Debug), + scheduler: Arc, stall_mark: usize, immediate: bool, ); async fn exec_msg( - &self, + self, msg: Msg, ser_data: SerializedData, lamellae: Arc, - scheduler: &(impl SchedulerQueue + Sync + std::fmt::Debug), + scheduler: Arc, ); fn get_team_and_world( @@ -1232,39 +1224,39 @@ pub(crate) trait ActiveMessageEngine { } } -#[derive(Debug)] -pub(crate) enum ActiveMessageEngineType { - RegisteredActiveMessages(RegisteredActiveMessages), -} +// #[derive(Debug)] +// pub(crate) enum ActiveMessageEngineType { +// RegisteredActiveMessages(RegisteredActiveMessages), +// } -#[async_trait] -impl ActiveMessageEngine for ActiveMessageEngineType { - async fn process_msg( - &self, - am: Am, - scheduler: &(impl SchedulerQueue + Sync + std::fmt::Debug), - stall_mark: usize, - immediate: bool, - ) { - match self { - ActiveMessageEngineType::RegisteredActiveMessages(remote_am) => { - remote_am - .process_msg(am, scheduler, stall_mark, immediate) - .await; - } - } - } - async fn exec_msg( - &self, - msg: Msg, - ser_data: SerializedData, - lamellae: Arc, - scheduler: &(impl SchedulerQueue + Sync + std::fmt::Debug), - ) { - match self { - ActiveMessageEngineType::RegisteredActiveMessages(remote_am) => { - remote_am.exec_msg(msg, ser_data, lamellae, scheduler).await; - } - } - } -} +// #[async_trait] +// impl ActiveMessageEngine for ActiveMessageEngineType { +// async fn process_msg( +// self, +// am: Am, +// executor: Arc, +// stall_mark: usize, +// immediate: bool, +// ) { +// match self { +// ActiveMessageEngineType::RegisteredActiveMessages(remote_am) => { +// remote_am +// .process_msg(am, executor, stall_mark, immediate) +// .await; +// } +// } +// } +// async fn exec_msg( +// self, +// msg: Msg, +// ser_data: SerializedData, +// lamellae: Arc, +// executor: Arc, +// ) { +// match self { +// ActiveMessageEngineType::RegisteredActiveMessages(remote_am) => { +// remote_am.exec_msg(msg, ser_data, lamellae, executor).await; +// } 
+// } +// } +// } diff --git a/src/active_messaging/batching.rs b/src/active_messaging/batching.rs index dd969ce5..6bbe638d 100644 --- a/src/active_messaging/batching.rs +++ b/src/active_messaging/batching.rs @@ -30,50 +30,41 @@ impl std::fmt::Debug for LamellarData { #[async_trait] pub(crate) trait Batcher { - fn add_remote_am_to_batch( + async fn add_remote_am_to_batch( &self, req_data: ReqMetaData, am: LamellarArcAm, am_id: AmId, am_size: usize, - scheduler: &(impl SchedulerQueue + Sync + std::fmt::Debug), stall_mark: usize, ); - fn add_return_am_to_batch( + async fn add_return_am_to_batch( &self, req_data: ReqMetaData, am: LamellarArcAm, am_id: AmId, am_size: usize, - scheduler: &(impl SchedulerQueue + Sync + std::fmt::Debug), stall_mark: usize, ); - fn add_data_am_to_batch( + async fn add_data_am_to_batch( &self, req_data: ReqMetaData, data: LamellarResultArc, data_size: usize, - scheduler: &(impl SchedulerQueue + Sync + std::fmt::Debug), - stall_mark: usize, - ); - fn add_unit_am_to_batch( - &self, - req_data: ReqMetaData, - scheduler: &(impl SchedulerQueue + Sync + std::fmt::Debug), stall_mark: usize, ); + async fn add_unit_am_to_batch(&self, req_data: ReqMetaData, stall_mark: usize); async fn exec_batched_msg( &self, msg: Msg, ser_data: SerializedData, lamellae: Arc, - scheduler: &(impl SchedulerQueue + Sync + std::fmt::Debug), ame: &RegisteredActiveMessages, - ); + ) -> Vec; } -#[derive(Debug)] +#[derive(Debug, Clone)] pub(crate) enum BatcherType { Simple(SimpleBatcher), TeamAm(TeamAmBatcher), @@ -82,74 +73,78 @@ pub(crate) enum BatcherType { #[async_trait] impl Batcher for BatcherType { #[tracing::instrument(skip_all)] - fn add_remote_am_to_batch( + async fn add_remote_am_to_batch( &self, req_data: ReqMetaData, am: LamellarArcAm, am_id: AmId, am_size: usize, - scheduler: &(impl SchedulerQueue + Sync + std::fmt::Debug), stall_mark: usize, ) { match self { BatcherType::Simple(batcher) => { - batcher.add_remote_am_to_batch(req_data, am, am_id, am_size, scheduler, stall_mark) + batcher + .add_remote_am_to_batch(req_data, am, am_id, am_size, stall_mark) + .await } BatcherType::TeamAm(batcher) => { - batcher.add_remote_am_to_batch(req_data, am, am_id, am_size, scheduler, stall_mark) + batcher + .add_remote_am_to_batch(req_data, am, am_id, am_size, stall_mark) + .await } } } #[tracing::instrument(skip_all)] - fn add_return_am_to_batch( + async fn add_return_am_to_batch( &self, req_data: ReqMetaData, am: LamellarArcAm, am_id: AmId, am_size: usize, - scheduler: &(impl SchedulerQueue + Sync + std::fmt::Debug), stall_mark: usize, ) { match self { BatcherType::Simple(batcher) => { - batcher.add_return_am_to_batch(req_data, am, am_id, am_size, scheduler, stall_mark) + batcher + .add_return_am_to_batch(req_data, am, am_id, am_size, stall_mark) + .await } BatcherType::TeamAm(batcher) => { - batcher.add_return_am_to_batch(req_data, am, am_id, am_size, scheduler, stall_mark) + batcher + .add_return_am_to_batch(req_data, am, am_id, am_size, stall_mark) + .await } } } #[tracing::instrument(skip_all)] - fn add_data_am_to_batch( + async fn add_data_am_to_batch( &self, req_data: ReqMetaData, data: LamellarResultArc, data_size: usize, - scheduler: &(impl SchedulerQueue + Sync + std::fmt::Debug), stall_mark: usize, ) { match self { BatcherType::Simple(batcher) => { - batcher.add_data_am_to_batch(req_data, data, data_size, scheduler, stall_mark) + batcher + .add_data_am_to_batch(req_data, data, data_size, stall_mark) + .await } BatcherType::TeamAm(batcher) => { - batcher.add_data_am_to_batch(req_data, 
data, data_size, scheduler, stall_mark) + batcher + .add_data_am_to_batch(req_data, data, data_size, stall_mark) + .await } } } #[tracing::instrument(skip_all)] - fn add_unit_am_to_batch( - &self, - req_data: ReqMetaData, - scheduler: &(impl SchedulerQueue + Sync + std::fmt::Debug), - stall_mark: usize, - ) { + async fn add_unit_am_to_batch(&self, req_data: ReqMetaData, stall_mark: usize) { match self { BatcherType::Simple(batcher) => { - batcher.add_unit_am_to_batch(req_data, scheduler, stall_mark) + batcher.add_unit_am_to_batch(req_data, stall_mark).await } BatcherType::TeamAm(batcher) => { - batcher.add_unit_am_to_batch(req_data, scheduler, stall_mark) + batcher.add_unit_am_to_batch(req_data, stall_mark).await } } } @@ -159,19 +154,14 @@ impl Batcher for BatcherType { msg: Msg, ser_data: SerializedData, lamellae: Arc, - scheduler: &(impl SchedulerQueue + Sync + std::fmt::Debug), ame: &RegisteredActiveMessages, - ) { + ) -> Vec { match self { BatcherType::Simple(batcher) => { - batcher - .exec_batched_msg(msg, ser_data, lamellae, scheduler, ame) - .await; + batcher.exec_batched_msg(msg, ser_data, lamellae, ame).await } BatcherType::TeamAm(batcher) => { - batcher - .exec_batched_msg(msg, ser_data, lamellae, scheduler, ame) - .await; + batcher.exec_batched_msg(msg, ser_data, lamellae, ame).await } } } diff --git a/src/active_messaging/batching/simple_batcher.rs b/src/active_messaging/batching/simple_batcher.rs index a036580b..bfb099c7 100644 --- a/src/active_messaging/batching/simple_batcher.rs +++ b/src/active_messaging/batching/simple_batcher.rs @@ -52,7 +52,7 @@ impl SimpleBatcherInner { } } -#[derive(Debug)] +#[derive(Debug, Clone)] pub(crate) struct SimpleBatcher { batched_ams: Arc>, stall_mark: Arc, @@ -61,13 +61,12 @@ pub(crate) struct SimpleBatcher { #[async_trait] impl Batcher for SimpleBatcher { #[tracing::instrument(skip_all)] - fn add_remote_am_to_batch( + async fn add_remote_am_to_batch( &self, req_data: ReqMetaData, am: LamellarArcAm, am_id: AmId, am_size: usize, - scheduler: &(impl SchedulerQueue + Sync + std::fmt::Debug), mut stall_mark: usize, ) { // println!("add_remote_am_to_batch"); @@ -93,37 +92,34 @@ impl Batcher for SimpleBatcher { // "[{:?}] add_remote_am_to_batch submit task", // std::thread::current().id() // ); - scheduler.submit_task(async move { - while stall_mark != cur_stall_mark.load(Ordering::SeqCst) - && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE - && batch_id == batch.batch_id.load(Ordering::SeqCst) - { - stall_mark = cur_stall_mark.load(Ordering::Relaxed); - async_std::task::yield_now().await; - } - if batch_id == batch.batch_id.load(Ordering::SeqCst) { - //this batch is still valid - self.create_tx_task(batch).await; - } - }); + while stall_mark != cur_stall_mark.load(Ordering::SeqCst) + && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE + && batch_id == batch.batch_id.load(Ordering::SeqCst) + { + stall_mark = cur_stall_mark.load(Ordering::Relaxed); + async_std::task::yield_now().await; + } + if batch_id == batch.batch_id.load(Ordering::SeqCst) { + //this batch is still valid + SimpleBatcher::create_tx_task(batch).await; + } } else if size >= MAX_BATCH_SIZE { // println!("remote size: {:?} {dst:?}",size); // println!( // "[{:?}] add_remote_am_to_batch submit imm task", // std::thread::current().id() // ); - scheduler.submit_immediate_task(self.create_tx_task(batch)); + SimpleBatcher::create_tx_task(batch).await; } } #[tracing::instrument(skip_all)] - fn add_return_am_to_batch( + async fn add_return_am_to_batch( &self, req_data: ReqMetaData, 
am: LamellarArcAm, am_id: AmId, am_size: usize, - scheduler: &(impl SchedulerQueue + Sync + std::fmt::Debug), mut stall_mark: usize, ) { // println!("add_return_am_to_batch"); @@ -149,36 +145,33 @@ impl Batcher for SimpleBatcher { // "[{:?}] add_rerturn_am_to_batch submit task", // std::thread::current().id() // ); - scheduler.submit_task(async move { - while stall_mark != cur_stall_mark.load(Ordering::SeqCst) - && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE - && batch_id == batch.batch_id.load(Ordering::SeqCst) - { - stall_mark = cur_stall_mark.load(Ordering::Relaxed); - async_std::task::yield_now().await; - } - if batch_id == batch.batch_id.load(Ordering::SeqCst) { - //this batch is still valid - self.create_tx_task(batch).await; - } - }); + while stall_mark != cur_stall_mark.load(Ordering::SeqCst) + && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE + && batch_id == batch.batch_id.load(Ordering::SeqCst) + { + stall_mark = cur_stall_mark.load(Ordering::Relaxed); + async_std::task::yield_now().await; + } + if batch_id == batch.batch_id.load(Ordering::SeqCst) { + //this batch is still valid + SimpleBatcher::create_tx_task(batch).await; + } } else if size >= MAX_BATCH_SIZE { // println!("return size: {:?} {dst:?}",size); // println!( // "[{:?}] add_return_am_to_batch submit imm task", // std::thread::current().id() // ); - scheduler.submit_immediate_task(self.create_tx_task(batch)); + SimpleBatcher::create_tx_task(batch).await; } } #[tracing::instrument(skip_all)] - fn add_data_am_to_batch( + async fn add_data_am_to_batch( &self, req_data: ReqMetaData, data: LamellarResultArc, data_size: usize, - scheduler: &(impl SchedulerQueue + Sync + std::fmt::Debug), mut stall_mark: usize, ) { // println!("add_data_am_to_batch"); @@ -207,36 +200,29 @@ impl Batcher for SimpleBatcher { // "[{:?}] add_data_am_to_batch submit task", // std::thread::current().id() // ); - scheduler.submit_task(async move { - while stall_mark != cur_stall_mark.load(Ordering::SeqCst) - && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE - && batch_id == batch.batch_id.load(Ordering::SeqCst) - { - stall_mark = cur_stall_mark.load(Ordering::Relaxed); - async_std::task::yield_now().await; - } - if batch_id == batch.batch_id.load(Ordering::SeqCst) { - //this batch is still valid - self.create_tx_task(batch).await; - } - }); + while stall_mark != cur_stall_mark.load(Ordering::SeqCst) + && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE + && batch_id == batch.batch_id.load(Ordering::SeqCst) + { + stall_mark = cur_stall_mark.load(Ordering::Relaxed); + async_std::task::yield_now().await; + } + if batch_id == batch.batch_id.load(Ordering::SeqCst) { + //this batch is still valid + SimpleBatcher::create_tx_task(batch).await; + } } else if size >= MAX_BATCH_SIZE { // println!("data size: {:?} {dst:?}",size); // println!( // "[{:?}] add_data_am_to_batch submit imm task", // std::thread::current().id() // ); - scheduler.submit_immediate_task(self.create_tx_task(batch)); + SimpleBatcher::create_tx_task(batch).await; } } #[tracing::instrument(skip_all)] - fn add_unit_am_to_batch( - &self, - req_data: ReqMetaData, - scheduler: &(impl SchedulerQueue + Sync + std::fmt::Debug), - mut stall_mark: usize, - ) { + async fn add_unit_am_to_batch(&self, req_data: ReqMetaData, mut stall_mark: usize) { // println!("add_unit_am_to_batch"); //let dst =req_data.dst; let batch = match req_data.dst { @@ -256,26 +242,24 @@ impl Batcher for SimpleBatcher { // "[{:?}] add_unit_am_to_batch submit task", // std::thread::current().id() // ); - 
scheduler.submit_task(async move { - while stall_mark != cur_stall_mark.load(Ordering::SeqCst) - && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE - && batch_id == batch.batch_id.load(Ordering::SeqCst) - { - stall_mark = cur_stall_mark.load(Ordering::Relaxed); - async_std::task::yield_now().await; - } - if batch_id == batch.batch_id.load(Ordering::SeqCst) { - //this batch is still valid - self.create_tx_task(batch).await; - } - }); + while stall_mark != cur_stall_mark.load(Ordering::SeqCst) + && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE + && batch_id == batch.batch_id.load(Ordering::SeqCst) + { + stall_mark = cur_stall_mark.load(Ordering::Relaxed); + async_std::task::yield_now().await; + } + if batch_id == batch.batch_id.load(Ordering::SeqCst) { + //this batch is still valid + SimpleBatcher::create_tx_task(batch).await; + } } else if size >= MAX_BATCH_SIZE { // println!("unit size: {:?} {dst:?}",size); // println!( // "[{:?}] add_unit_am_to_batch submit imm task", // std::thread::current().id() // ); - scheduler.submit_immediate_task(self.create_tx_task(batch)); + SimpleBatcher::create_tx_task(batch).await; } } @@ -285,26 +269,29 @@ impl Batcher for SimpleBatcher { msg: Msg, ser_data: SerializedData, lamellae: Arc, - scheduler: &(impl SchedulerQueue + Sync + std::fmt::Debug), ame: &RegisteredActiveMessages, - ) { + ) -> Vec { let data = ser_data.data_as_bytes(); let mut i = 0; // println!("executing batched msg {:?}", data.len()); - + let mut return_ams = Vec::new(); while i < data.len() { let cmd: Cmd = crate::deserialize(&data[i..i + *CMD_LEN], false).unwrap(); i += *CMD_LEN; // let temp_i = i; // println!("cmd {:?}", cmd); match cmd { - Cmd::Am => self.exec_am(&msg, data, &mut i, &lamellae, scheduler, ame), - Cmd::ReturnAm => self.exec_return_am(&msg, data, &mut i, &lamellae, scheduler, ame), + Cmd::Am => return_ams.push(self.exec_am(&msg, data, &mut i, &lamellae, ame).await), + Cmd::ReturnAm => { + self.exec_return_am(&msg, data, &mut i, &lamellae, ame) + .await + } Cmd::Data => ame.exec_data_am(&msg, data, &mut i, &ser_data).await, Cmd::Unit => ame.exec_unit_am(&msg, data, &mut i).await, Cmd::BatchedMsg => panic!("should not recieve a batched msg within a batched msg"), } } + return_ams } } @@ -323,7 +310,7 @@ impl SimpleBatcher { } #[tracing::instrument(skip_all)] - async fn create_tx_task(&self, batch: SimpleBatcherInner) { + async fn create_tx_task(batch: SimpleBatcherInner) { // println!("[{:?}] create_tx_task", std::thread::current().id()); let (buf, size) = batch.swap(); @@ -519,15 +506,14 @@ impl SimpleBatcher { } #[tracing::instrument(skip_all)] - fn exec_am( + async fn exec_am( &self, msg: &Msg, data: &[u8], i: &mut usize, lamellae: &Arc, - scheduler: &(impl SchedulerQueue + Sync + std::fmt::Debug), ame: &RegisteredActiveMessages, - ) { + ) -> Am { // println!("exec_am"); let am_header: AmHeader = crate::deserialize(&data[*i..*i + *AM_HEADER_LEN], false).unwrap(); @@ -548,36 +534,34 @@ impl SimpleBatcher { team_addr: team.team.remote_ptr_addr, }; // println!("[{:?}] exec_am submit task", std::thread::current().id()); - scheduler.submit_task(async move { - let am = match am - .exec( - team.team.world_pe, - team.team.num_world_pes, - false, - world.clone(), - team.clone(), - ) - .await - { - LamellarReturn::Unit => Am::Unit(req_data), - LamellarReturn::RemoteData(data) => Am::Data(req_data, data), - LamellarReturn::RemoteAm(am) => Am::Return(req_data, am), - LamellarReturn::LocalData(_) | LamellarReturn::LocalAm(_) => { - panic!("Should not be returning local data 
or AM from remote am"); - } - }; - ame.process_msg(am, scheduler, 0, false).await; - }); + let am = match am + .exec( + team.team.world_pe, + team.team.num_world_pes, + false, + world.clone(), + team.clone(), + ) + .await + { + LamellarReturn::Unit => Am::Unit(req_data), + LamellarReturn::RemoteData(data) => Am::Data(req_data, data), + LamellarReturn::RemoteAm(am) => Am::Return(req_data, am), + LamellarReturn::LocalData(_) | LamellarReturn::LocalAm(_) => { + panic!("Should not be returning local data or AM from remote am"); + } + }; + // ame.process_msg(am, 0, false).await; + am } #[tracing::instrument(skip_all)] - fn exec_return_am( + async fn exec_return_am( &self, msg: &Msg, data: &[u8], i: &mut usize, lamellae: &Arc, - scheduler: &(impl SchedulerQueue + Sync + std::fmt::Debug), ame: &RegisteredActiveMessages, ) { // println!("exec_return_am"); @@ -602,6 +586,8 @@ impl SimpleBatcher { // "[{:?}] exec_return_am submit task", // std::thread::current().id() // ); - scheduler.submit_task(ame.exec_local_am(req_data, am.as_local(), world, team)); + ame.clone() + .exec_local_am(req_data, am.as_local(), world, team) + .await; } } diff --git a/src/active_messaging/batching/team_am_batcher.rs b/src/active_messaging/batching/team_am_batcher.rs index eadfb3b2..60473bb7 100644 --- a/src/active_messaging/batching/team_am_batcher.rs +++ b/src/active_messaging/batching/team_am_batcher.rs @@ -167,7 +167,7 @@ impl TeamAmBatcherInner { } } -#[derive(Debug)] +#[derive(Debug, Clone)] pub(crate) struct TeamAmBatcher { batched_ams: Arc>, stall_mark: Arc, @@ -176,13 +176,12 @@ pub(crate) struct TeamAmBatcher { #[async_trait] impl Batcher for TeamAmBatcher { #[tracing::instrument(skip_all)] - fn add_remote_am_to_batch( + async fn add_remote_am_to_batch( &self, req_data: ReqMetaData, am: LamellarArcAm, am_id: AmId, am_size: usize, - scheduler: &(impl SchedulerQueue + Sync + std::fmt::Debug), mut stall_mark: usize, ) { let batch = match req_data.dst { @@ -198,49 +197,43 @@ impl Batcher for TeamAmBatcher { let batch_id = batch.batch_id.load(Ordering::SeqCst); // println!("remote batch_id {batch_id} created"); let cur_stall_mark = self.stall_mark.clone(); - scheduler.submit_task(async move { - while stall_mark != cur_stall_mark.load(Ordering::SeqCst) - && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE - && batch_id == batch.batch_id.load(Ordering::SeqCst) - { - stall_mark = cur_stall_mark.load(Ordering::Relaxed); - async_std::task::yield_now().await; - } - if batch_id == batch.batch_id.load(Ordering::SeqCst) { - //this batch is still valid - self.create_tx_task( - batch, - // stall_mark, - // scheduler, - req_data.lamellae.clone(), - req_data.team.arch.clone(), - req_data.team.world_pe, - ) - .await; - } - }); + while stall_mark != cur_stall_mark.load(Ordering::SeqCst) + && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE + && batch_id == batch.batch_id.load(Ordering::SeqCst) + { + stall_mark = cur_stall_mark.load(Ordering::Relaxed); + async_std::task::yield_now().await; + } + if batch_id == batch.batch_id.load(Ordering::SeqCst) { + //this batch is still valid + TeamAmBatcher::create_tx_task( + batch, + req_data.lamellae.clone(), + req_data.team.arch.clone(), + req_data.team.world_pe, + ) + .await; + } } else if size >= MAX_BATCH_SIZE { //batch is full, transfer now // println!("remote size: {:?}",size); - scheduler.submit_immediate_task(self.create_tx_task( + TeamAmBatcher::create_tx_task( batch, - // stall_mark, - // scheduler, req_data.lamellae.clone(), req_data.team.arch.clone(), req_data.team.world_pe, - 
)); + ) + .await; } } #[tracing::instrument(skip_all)] - fn add_return_am_to_batch( + async fn add_return_am_to_batch( &self, req_data: ReqMetaData, am: LamellarArcAm, am_id: AmId, am_size: usize, - scheduler: &(impl SchedulerQueue + Sync + std::fmt::Debug), mut stall_mark: usize, ) { let batch = match req_data.dst { @@ -256,48 +249,43 @@ impl Batcher for TeamAmBatcher { let batch_id = batch.batch_id.load(Ordering::SeqCst); // println!("return batch_id {batch_id} created"); let cur_stall_mark = self.stall_mark.clone(); - scheduler.submit_task(async move { - while stall_mark != cur_stall_mark.load(Ordering::SeqCst) - && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE - && batch_id == batch.batch_id.load(Ordering::SeqCst) - { - stall_mark = cur_stall_mark.load(Ordering::Relaxed); - async_std::task::yield_now().await; - } - if batch_id == batch.batch_id.load(Ordering::SeqCst) { - //this batch is still valid - self.create_tx_task( - batch, - // stall_mark, - // scheduler, - req_data.lamellae.clone(), - req_data.team.arch.clone(), - req_data.team.world_pe, - ) - .await; - } - }); + while stall_mark != cur_stall_mark.load(Ordering::SeqCst) + && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE + && batch_id == batch.batch_id.load(Ordering::SeqCst) + { + stall_mark = cur_stall_mark.load(Ordering::Relaxed); + async_std::task::yield_now().await; + } + if batch_id == batch.batch_id.load(Ordering::SeqCst) { + //this batch is still valid + TeamAmBatcher::create_tx_task( + batch, + req_data.lamellae.clone(), + req_data.team.arch.clone(), + req_data.team.world_pe, + ) + .await; + } } else if size >= MAX_BATCH_SIZE { //batch is full, transfer now // println!("return size: {:?}",size); - scheduler.submit_immediate_task(self.create_tx_task( + + TeamAmBatcher::create_tx_task( batch, - // stall_mark, - // scheduler, req_data.lamellae.clone(), req_data.team.arch.clone(), req_data.team.world_pe, - )); + ) + .await; } } #[tracing::instrument(skip_all)] - fn add_data_am_to_batch( + async fn add_data_am_to_batch( &self, req_data: ReqMetaData, data: LamellarResultArc, data_size: usize, - scheduler: &(impl SchedulerQueue + Sync + std::fmt::Debug), mut stall_mark: usize, ) { let batch = match req_data.dst { @@ -320,48 +308,38 @@ impl Batcher for TeamAmBatcher { let batch_id = batch.batch_id.load(Ordering::SeqCst); // println!("data batch_id {batch_id} created"); let cur_stall_mark = self.stall_mark.clone(); - scheduler.submit_task(async move { - while stall_mark != cur_stall_mark.load(Ordering::SeqCst) - && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE - && batch_id == batch.batch_id.load(Ordering::SeqCst) - { - stall_mark = cur_stall_mark.load(Ordering::Relaxed); - async_std::task::yield_now().await; - } - if batch_id == batch.batch_id.load(Ordering::SeqCst) { - //this batch is still valid - self.create_tx_task( - batch, - // stall_mark, - // scheduler, - req_data.lamellae.clone(), - req_data.team.arch.clone(), - req_data.team.world_pe, - ) - .await; - } - }); + while stall_mark != cur_stall_mark.load(Ordering::SeqCst) + && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE + && batch_id == batch.batch_id.load(Ordering::SeqCst) + { + stall_mark = cur_stall_mark.load(Ordering::Relaxed); + async_std::task::yield_now().await; + } + if batch_id == batch.batch_id.load(Ordering::SeqCst) { + //this batch is still valid + TeamAmBatcher::create_tx_task( + batch, + req_data.lamellae.clone(), + req_data.team.arch.clone(), + req_data.team.world_pe, + ) + .await; + } } else if size >= MAX_BATCH_SIZE { //batch is full, 
transfer now // println!("data size: {:?}",size); - scheduler.submit_immediate_task(self.create_tx_task( + TeamAmBatcher::create_tx_task( batch, - // stall_mark, - // scheduler, req_data.lamellae.clone(), req_data.team.arch.clone(), req_data.team.world_pe, - )); + ) + .await; } } #[tracing::instrument(skip_all)] - fn add_unit_am_to_batch( - &self, - req_data: ReqMetaData, - scheduler: &(impl SchedulerQueue + Sync + std::fmt::Debug), - mut stall_mark: usize, - ) { + async fn add_unit_am_to_batch(&self, req_data: ReqMetaData, mut stall_mark: usize) { let batch = match req_data.dst { Some(dst) => self.batched_ams[dst].clone(), None => self.batched_ams.last().unwrap().clone(), @@ -375,38 +353,33 @@ impl Batcher for TeamAmBatcher { let batch_id = batch.batch_id.load(Ordering::SeqCst); // println!("unit batch_id {batch_id} created"); let cur_stall_mark = self.stall_mark.clone(); - scheduler.submit_task(async move { - while stall_mark != cur_stall_mark.load(Ordering::SeqCst) - && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE - && batch_id == batch.batch_id.load(Ordering::SeqCst) - { - stall_mark = cur_stall_mark.load(Ordering::Relaxed); - async_std::task::yield_now().await; - } - if batch_id == batch.batch_id.load(Ordering::SeqCst) { - //this batch is still valid - self.create_tx_task( - batch, - // stall_mark, - // scheduler, - req_data.lamellae.clone(), - req_data.team.arch.clone(), - req_data.team.world_pe, - ) - .await; - } - }); + while stall_mark != cur_stall_mark.load(Ordering::SeqCst) + && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE + && batch_id == batch.batch_id.load(Ordering::SeqCst) + { + stall_mark = cur_stall_mark.load(Ordering::Relaxed); + async_std::task::yield_now().await; + } + if batch_id == batch.batch_id.load(Ordering::SeqCst) { + //this batch is still valid + TeamAmBatcher::create_tx_task( + batch, + req_data.lamellae.clone(), + req_data.team.arch.clone(), + req_data.team.world_pe, + ) + .await; + } } else if size >= MAX_BATCH_SIZE { //batch is full, transfer now // println!("unit size: {:?}",size); - scheduler.submit_immediate_task(self.create_tx_task( + TeamAmBatcher::create_tx_task( batch, - // stall_mark, - // scheduler, req_data.lamellae.clone(), req_data.team.arch.clone(), req_data.team.world_pe, - )); + ) + .await; } } @@ -416,9 +389,8 @@ impl Batcher for TeamAmBatcher { msg: Msg, ser_data: SerializedData, lamellae: Arc, - scheduler: &(impl SchedulerQueue + Sync + std::fmt::Debug), ame: &RegisteredActiveMessages, - ) { + ) -> Vec { let data = ser_data.data_as_bytes(); let mut i = 0; // println!("i: {:?} dl {:?} cl {:?}", i, data.len(), *CMD_LEN); @@ -435,10 +407,12 @@ impl Batcher for TeamAmBatcher { Cmd::Data => ame.exec_data_am(&msg, data, &mut i, &ser_data).await, Cmd::Unit => ame.exec_unit_am(&msg, data, &mut i).await, Cmd::BatchedMsg => { - self.exec_batched_am(&msg, batch.cnt, data, &mut i, &lamellae, scheduler, &ame); + self.exec_batched_am(&msg, batch.cnt, data, &mut i, &lamellae, &ame) + .await; } } } + Vec::new() } } @@ -457,7 +431,6 @@ impl TeamAmBatcher { } #[tracing::instrument(skip_all)] async fn create_tx_task( - &self, batch: TeamAmBatcherInner, lamellae: Arc, arch: Arc, @@ -700,16 +673,16 @@ impl TeamAmBatcher { } #[tracing::instrument(skip_all)] - fn exec_batched_am( + async fn exec_batched_am( &self, msg: &Msg, batch_cnt: usize, data: &[u8], i: &mut usize, lamellae: &Arc, - scheduler: &(impl SchedulerQueue + Sync + std::fmt::Debug), ame: &RegisteredActiveMessages, - ) { + ) -> Vec { + let mut return_ams = Vec::new(); for _team in 
0..batch_cnt { let team_header: TeamHeader = crate::deserialize(&data[*i..*i + *TEAM_HEADER_LEN], false).unwrap(); @@ -726,48 +699,50 @@ impl TeamAmBatcher { for _am in 0..batched_am_header.am_cnt { // println!("am cmd: {:?}", batched_am_header.cmd); match batched_am_header.cmd { - Cmd::Am => self.exec_am( - msg, - data, - i, - lamellae, - scheduler, - ame, - batched_am_header.am_id, - world.clone(), - team.clone(), - ), - Cmd::ReturnAm => self.exec_return_am( - msg, - data, - i, - lamellae, - scheduler, - ame, - batched_am_header.am_id, - world.clone(), - team.clone(), + Cmd::Am => return_ams.push( + self.exec_am( + msg, + data, + i, + lamellae, + batched_am_header.am_id, + world.clone(), + team.clone(), + ) + .await, ), + Cmd::ReturnAm => { + self.exec_return_am( + msg, + data, + i, + lamellae, + ame, + batched_am_header.am_id, + world.clone(), + team.clone(), + ) + .await + } _ => panic!("unhandled cmd"), } } } } + return_ams } #[tracing::instrument(skip_all)] - fn exec_am( + async fn exec_am( &self, msg: &Msg, data: &[u8], i: &mut usize, lamellae: &Arc, - scheduler: &(impl SchedulerQueue + Sync + std::fmt::Debug), - ame: &RegisteredActiveMessages, am_id: AmId, world: Arc, team: Arc, - ) { + ) -> Am { let req_id = crate::deserialize(&data[*i..*i + *REQ_ID_LEN], false).unwrap(); *i += *REQ_ID_LEN; let am = AMS_EXECS.get(&am_id).unwrap()(&data[*i..], team.team.team_pe); @@ -782,36 +757,35 @@ impl TeamAmBatcher { team: team.team.clone(), team_addr: team.team.remote_ptr_addr, }; - scheduler.submit_task(async move { - let am = match am - .exec( - team.team.world_pe, - team.team.num_world_pes, - false, - world.clone(), - team.clone(), - ) - .await - { - LamellarReturn::Unit => Am::Unit(req_data), - LamellarReturn::RemoteData(data) => Am::Data(req_data, data), - LamellarReturn::RemoteAm(am) => Am::Return(req_data, am), - LamellarReturn::LocalData(_) | LamellarReturn::LocalAm(_) => { - panic!("Should not be returning local data or AM from remote am"); - } - }; - ame.process_msg(am, scheduler, 0, false).await; - }); + + let am = match am + .exec( + team.team.world_pe, + team.team.num_world_pes, + false, + world.clone(), + team.clone(), + ) + .await + { + LamellarReturn::Unit => Am::Unit(req_data), + LamellarReturn::RemoteData(data) => Am::Data(req_data, data), + LamellarReturn::RemoteAm(am) => Am::Return(req_data, am), + LamellarReturn::LocalData(_) | LamellarReturn::LocalAm(_) => { + panic!("Should not be returning local data or AM from remote am"); + } + }; + am + // ame.process_msg(am, 0, false).await; } #[tracing::instrument(skip_all)] - fn exec_return_am( + async fn exec_return_am( &self, msg: &Msg, data: &[u8], i: &mut usize, lamellae: &Arc, - scheduler: &(impl SchedulerQueue + Sync + std::fmt::Debug), ame: &RegisteredActiveMessages, am_id: AmId, world: Arc, @@ -831,6 +805,9 @@ impl TeamAmBatcher { team: team.team.clone(), team_addr: team.team.remote_ptr_addr, }; - scheduler.submit_task(ame.exec_local_am(req_data, am.as_local(), world, team)); + + ame.clone() + .exec_local_am(req_data, am.as_local(), world, team) + .await; } } diff --git a/src/active_messaging/registered_active_message.rs b/src/active_messaging/registered_active_message.rs index 96044a44..aa049605 100644 --- a/src/active_messaging/registered_active_message.rs +++ b/src/active_messaging/registered_active_message.rs @@ -6,7 +6,6 @@ use crate::lamellae::{ SerializedData, SubData, }; -use crate::scheduler::SchedulerQueue; use async_recursion::async_recursion; // use log::trace; use std::sync::Arc; @@ -62,7 +61,7 @@ pub struct 
RegisteredAm { } crate::inventory::collect!(RegisteredAm); -#[derive(Debug)] +#[derive(Debug, Clone)] pub(crate) struct RegisteredActiveMessages { batcher: BatcherType, } @@ -100,9 +99,9 @@ pub(crate) struct UnitHeader { impl ActiveMessageEngine for RegisteredActiveMessages { #[tracing::instrument(skip_all)] async fn process_msg( - &self, + self, am: Am, - scheduler: &(impl SchedulerQueue + Sync + std::fmt::Debug), + executor: Arc, stall_mark: usize, immediate: bool, ) { @@ -118,14 +117,15 @@ impl ActiveMessageEngine for RegisteredActiveMessages { { // println!(" {} {} {}, {}, {}",req_data.team.lamellae.backend() != Backend::Local,req_data.team.num_pes() > 1, req_data.team.team_pe_id().is_err(),(req_data.team.num_pes() > 1 || req_data.team.team_pe_id().is_err()),req_data.team.lamellae.backend() != Backend::Local && (req_data.team.num_pes() > 1 || req_data.team.team_pe_id().is_err()) ); if am_size < crate::active_messaging::BATCH_AM_SIZE && !immediate { - self.batcher.add_remote_am_to_batch( - req_data.clone(), - am.clone(), - am_id, - am_size, - scheduler, - stall_mark, - ); + self.batcher + .add_remote_am_to_batch( + req_data.clone(), + am.clone(), + am_id, + am_size, + stall_mark, + ) + .await; } else { self.send_am(req_data.clone(), am.clone(), am_id, am_size, Cmd::Am) .await; @@ -148,9 +148,9 @@ impl ActiveMessageEngine for RegisteredActiveMessages { let am_id = *(AMS_IDS.get(&am.get_id()).unwrap()); let am_size = am.serialized_size(); if am_size < crate::active_messaging::BATCH_AM_SIZE && !immediate { - self.batcher.add_remote_am_to_batch( - req_data, am, am_id, am_size, scheduler, stall_mark, - ); + self.batcher + .add_remote_am_to_batch(req_data, am, am_id, am_size, stall_mark) + .await; } else { self.send_am(req_data, am, am_id, am_size, Cmd::Am).await; } @@ -166,9 +166,9 @@ impl ActiveMessageEngine for RegisteredActiveMessages { let am_id = *(AMS_IDS.get(&am.get_id()).unwrap()); let am_size = am.serialized_size(); if am_size < crate::active_messaging::BATCH_AM_SIZE && !immediate { - self.batcher.add_return_am_to_batch( - req_data, am, am_id, am_size, scheduler, stall_mark, - ); + self.batcher + .add_return_am_to_batch(req_data, am, am_id, am_size, stall_mark) + .await; } else { self.send_am(req_data, am, am_id, am_size, Cmd::ReturnAm) .await; @@ -179,7 +179,8 @@ impl ActiveMessageEngine for RegisteredActiveMessages { let data_size = data.serialized_size(); if data_size < crate::active_messaging::BATCH_AM_SIZE && !immediate { self.batcher - .add_data_am_to_batch(req_data, data, data_size, scheduler, stall_mark); + .add_data_am_to_batch(req_data, data, data_size, stall_mark) + .await; } else { self.send_data_am(req_data, data, data_size).await; } @@ -187,60 +188,31 @@ impl ActiveMessageEngine for RegisteredActiveMessages { Am::Unit(req_data) => { if *UNIT_HEADER_LEN < crate::active_messaging::BATCH_AM_SIZE && !immediate { self.batcher - .add_unit_am_to_batch(req_data, scheduler, stall_mark); + .add_unit_am_to_batch(req_data, stall_mark) + .await; } else { self.send_unit_am(req_data).await; } } - Am::_BatchedReturn(_req_data, _func, _batch_id) => { - // let func_id = *(AMS_IDS.get(&func.get_id()).unwrap()); - // let func_size = func.serialized_size(); - // if func_size <= crate::active_messaging::BATCH_AM_SIZE { - // self.batcher - // .add_batched_return_am_to_batch( - // req_data, func, func_id, func_size, batch_id, scheduler,stall_mark - // ) - // .await; - // } else { - // self.send_batched_return_am( - // req_data, func, func_id, func_size, batch_id, scheduler, - // ) - // 
.await; - // } - } - Am::_BatchedData(_req_data, _data, _batch_id) => { - // let data_size = data.serialized_size(); - // if data_size <= crate::active_messaging::BATCH_AM_SIZE { - // self.add_batched_data_am_to_batch( - // req_data, data, data_size, batch_id, scheduler,stall_mark - // ) - // .await; - // } else { - // self.send_batched_data_am(req_data, data, data_size, batch_id, scheduler) - // .await; - // } - } - Am::_BatchedUnit(_req_data, _batch_id) => { - // self.add_batched_unit_am_to_batch(req_data, batch_id, scheduler,stall_mark) - // .await; - } } } #[tracing::instrument(skip_all)] async fn exec_msg( - &self, + self, msg: Msg, ser_data: SerializedData, lamellae: Arc, - scheduler: &(impl SchedulerQueue + Sync + std::fmt::Debug), + executor: Arc, ) { // println!("exec_msg"); let data = ser_data.data_as_bytes(); let mut i = 0; match msg.cmd { Cmd::Am => { - self.exec_am(&msg, data, &mut i, &lamellae, scheduler).await; + let return_am = self.exec_am(&msg, data, &mut i, &lamellae).await; + let process_task = self.process_msg(return_am, executor.clone(), 0, false); + executor.submit_task(process_task); } Cmd::ReturnAm => { self.exec_return_am(&msg, data, &mut i, &lamellae).await; @@ -252,9 +224,15 @@ impl ActiveMessageEngine for RegisteredActiveMessages { self.exec_unit_am(&msg, data, &mut i).await; } Cmd::BatchedMsg => { - self.batcher - .exec_batched_msg(msg, ser_data, lamellae, scheduler, self) + let ams = self + .batcher + .exec_batched_msg(msg, ser_data, lamellae, &self) .await; + let am_tasks = futures::stream::FuturesUnordered::new(); + for am in ams.into_iter() { + am_tasks.push(self.clone().process_msg(am, executor.clone(), 0, false)); + } + executor.submit_task(futures::future::join_all(am_tasks)); } } } @@ -451,8 +429,7 @@ impl RegisteredActiveMessages { data: &[u8], i: &mut usize, lamellae: &Arc, - scheduler: &(impl SchedulerQueue + Sync + std::fmt::Debug), - ) { + ) -> Am { // println!("exec_am"); let am_header: AmHeader = crate::deserialize(&data[*i..*i + *AM_HEADER_LEN], false).unwrap(); @@ -490,9 +467,8 @@ impl RegisteredActiveMessages { panic!("Should not be returning local data or AM from remote am"); } }; - self.process_msg(am, scheduler, 0, false).await; //0 just means we will force a stall_count loop - // scheduler.submit_am(am); - //TODO: compare against: scheduler.submit_am(ame, am).await; + am + // self.process_msg(am, 0, false).await; //0 just means we will force a stall_count loop } #[tracing::instrument(skip_all)] diff --git a/src/array.rs b/src/array.rs index 9d99a839..5fe5eed4 100644 --- a/src/array.rs +++ b/src/array.rs @@ -395,7 +395,7 @@ impl TeamFrom<&Vec> for LamellarArrayRdmaInput { LamellarArrayRdmaInput::LocalMemRegion(buf) } } -impl TeamFrom<&[T]> for LamellarArrayRdmaInput { +impl TeamFrom<&[T]> for LamellarArrayRdmaInput { /// Constructs a [OneSidedMemoryRegion][crate::memregion::OneSidedMemoryRegion] equal in length to `vals` and copies `vals` into it fn team_from(vals: &[T], team: &Pin>) -> Self { let buf: OneSidedMemoryRegion = team.alloc_one_sided_mem_region(vals.len()); @@ -663,7 +663,7 @@ impl crate::active_messaging::DarcSerde for LamellarWriteArra } impl LamellarArrayReduce for LamellarReadArray { - fn reduce(&self, reduction: &str) -> Pin>> { + fn reduce(&self, reduction: &str) -> Pin + Send>> { match self { LamellarReadArray::UnsafeArray(array) => unsafe { array.reduce(reduction) }, LamellarReadArray::AtomicArray(array) => array.reduce(reduction), @@ -677,7 +677,7 @@ impl LamellarArrayReduce for LamellarReadArray impl 
LamellarArrayArithmeticReduce for LamellarReadArray { - fn sum(&self) -> Pin>> { + fn sum(&self) -> Pin + Send>> { match self { LamellarReadArray::UnsafeArray(array) => unsafe { array.sum() }, LamellarReadArray::AtomicArray(array) => array.sum(), @@ -686,7 +686,7 @@ impl LamellarArrayArithmeticR LamellarReadArray::ReadOnlyArray(array) => array.sum(), } } - fn prod(&self) -> Pin>> { + fn prod(&self) -> Pin + Send>> { match self { LamellarReadArray::UnsafeArray(array) => unsafe { array.prod() }, LamellarReadArray::AtomicArray(array) => array.prod(), @@ -699,7 +699,7 @@ impl LamellarArrayArithmeticR impl LamellarArrayCompareReduce for LamellarReadArray { - fn max(&self) -> Pin>> { + fn max(&self) -> Pin + Send>> { match self { LamellarReadArray::UnsafeArray(array) => unsafe { array.max() }, LamellarReadArray::AtomicArray(array) => array.max(), @@ -708,7 +708,7 @@ impl LamellarArrayCompa LamellarReadArray::ReadOnlyArray(array) => array.max(), } } - fn min(&self) -> Pin>> { + fn min(&self) -> Pin + Send>> { match self { LamellarReadArray::UnsafeArray(array) => unsafe { array.min() }, LamellarReadArray::AtomicArray(array) => array.min(), @@ -720,7 +720,7 @@ impl LamellarArrayCompa } impl LamellarArrayReduce for LamellarWriteArray { - fn reduce(&self, reduction: &str) -> Pin>> { + fn reduce(&self, reduction: &str) -> Pin + Send>> { match self { LamellarWriteArray::UnsafeArray(array) => unsafe { array.reduce(reduction) }, LamellarWriteArray::AtomicArray(array) => array.reduce(reduction), @@ -732,7 +732,7 @@ impl LamellarArrayReduce for LamellarWriteArray LamellarArrayArithmeticReduce for LamellarWriteArray { - fn sum(&self) -> Pin>> { + fn sum(&self) -> Pin + Send>> { match self { LamellarWriteArray::UnsafeArray(array) => unsafe { array.sum() }, LamellarWriteArray::AtomicArray(array) => array.sum(), @@ -740,7 +740,7 @@ impl LamellarArrayArithmeticR LamellarWriteArray::GlobalLockArray(array) => array.sum(), } } - fn prod(&self) -> Pin>> { + fn prod(&self) -> Pin + Send>> { match self { LamellarWriteArray::UnsafeArray(array) => unsafe { array.prod() }, LamellarWriteArray::AtomicArray(array) => array.prod(), @@ -753,7 +753,7 @@ impl LamellarArrayArithmeticR impl LamellarArrayCompareReduce for LamellarWriteArray { - fn max(&self) -> Pin>> { + fn max(&self) -> Pin + Send>> { match self { LamellarWriteArray::UnsafeArray(array) => unsafe { array.max() }, LamellarWriteArray::AtomicArray(array) => array.max(), @@ -761,7 +761,7 @@ impl LamellarArrayCompa LamellarWriteArray::GlobalLockArray(array) => array.max(), } } - fn min(&self) -> Pin>> { + fn min(&self) -> Pin + Send>> { match self { LamellarWriteArray::UnsafeArray(array) => unsafe { array.min() }, LamellarWriteArray::AtomicArray(array) => array.min(), @@ -991,9 +991,7 @@ pub trait LamellarArray: private::LamellarArrayPrivate { /// let result = array.block_on(request); //block until am has executed /// // we also could have used world.block_on() or team.block_on() ///``` - fn block_on(&self, f: F) -> F::Output - where - F: Future; + fn block_on(&self, f: F) -> F::Output; #[doc(alias("One-sided", "onesided"))] /// Given a global index, calculate the PE and offset on that PE where the element actually resides. 
@@ -1610,7 +1608,7 @@ where /// let sum = array.block_on(array.reduce("sum")); // equivalent to calling array.sum() /// assert_eq!(array.len()*num_pes,sum); ///``` - fn reduce(&self, reduction: &str) -> Pin>>; + fn reduce(&self, reduction: &str) -> Pin + Send>>; } /// Interface for common arithmetic based reductions @@ -1643,7 +1641,7 @@ where /// let sum = array.block_on(array.sum()); /// assert_eq!(array.len()*num_pes,sum); ///``` - fn sum(&self) -> Pin>>; + fn sum(&self) -> Pin + Send>>; #[doc(alias("One-sided", "onesided"))] /// Perform a production reduction on the entire distributed array, returning the value to the calling PE. @@ -1668,7 +1666,7 @@ where /// let prod = array.block_on(array.prod()); /// assert_eq!((1..=array.len()).product::(),prod); ///``` - fn prod(&self) -> Pin>>; + fn prod(&self) -> Pin + Send>>; } /// Interface for common compare based reductions @@ -1696,7 +1694,7 @@ where /// let max = array.block_on(array.max()); /// assert_eq!((array.len()-1)*2,max); ///``` - fn max(&self) -> Pin>>; + fn max(&self) -> Pin + Send>>; #[doc(alias("One-sided", "onesided"))] /// Find the min element in the entire destributed array, returning to the calling PE @@ -1718,7 +1716,7 @@ where /// let min = array.block_on(array.min()); /// assert_eq!(0,min); ///``` - fn min(&self) -> Pin>>; + fn min(&self) -> Pin + Send>>; } /// This procedural macro is used to enable the execution of user defined reductions on LamellarArrays. diff --git a/src/array/atomic.rs b/src/array/atomic.rs index a2cf10bd..646288ed 100644 --- a/src/array/atomic.rs +++ b/src/array/atomic.rs @@ -1161,7 +1161,7 @@ impl From for AtomicArray { } impl LamellarArrayReduce for AtomicArray { - fn reduce(&self, reduction: &str) -> Pin>> { + fn reduce(&self, reduction: &str) -> Pin + Send>> { match self { AtomicArray::NativeAtomicArray(array) => array.reduce(reduction), AtomicArray::GenericAtomicArray(array) => array.reduce(reduction), @@ -1172,13 +1172,13 @@ impl LamellarArrayReduce for AtomicArray { impl LamellarArrayArithmeticReduce for AtomicArray { - fn sum(&self) -> Pin>> { + fn sum(&self) -> Pin + Send>> { match self { AtomicArray::NativeAtomicArray(array) => array.sum(), AtomicArray::GenericAtomicArray(array) => array.sum(), } } - fn prod(&self) -> Pin>> { + fn prod(&self) -> Pin + Send>> { match self { AtomicArray::NativeAtomicArray(array) => array.prod(), AtomicArray::GenericAtomicArray(array) => array.prod(), @@ -1188,13 +1188,13 @@ impl LamellarArrayArithmeticR impl LamellarArrayCompareReduce for AtomicArray { - fn max(&self) -> Pin>> { + fn max(&self) -> Pin + Send>> { match self { AtomicArray::NativeAtomicArray(array) => array.max(), AtomicArray::GenericAtomicArray(array) => array.max(), } } - fn min(&self) -> Pin>> { + fn min(&self) -> Pin + Send>> { match self { AtomicArray::NativeAtomicArray(array) => array.min(), AtomicArray::GenericAtomicArray(array) => array.min(), diff --git a/src/array/generic_atomic.rs b/src/array/generic_atomic.rs index 2924f8d8..382059a4 100644 --- a/src/array/generic_atomic.rs +++ b/src/array/generic_atomic.rs @@ -712,10 +712,7 @@ impl LamellarArray for GenericAtomicArray { self.array.wait_all() // println!("done in wait all {:?}",std::time::SystemTime::now()); } - fn block_on(&self, f: F) -> F::Output - where - F: Future, - { + fn block_on(&self, f: F) -> F::Output { self.array.block_on(f) } fn pe_and_offset_for_global_index(&self, index: usize) -> Option<(usize, usize)> { @@ -795,7 +792,7 @@ impl ArrayPrint for GenericAtomicArray { } impl LamellarArrayReduce for 
GenericAtomicArray { - fn reduce(&self, op: &str) -> Pin>> { + fn reduce(&self, op: &str) -> Pin + Send>> { self.array .reduce_data(op, self.clone().into()) .into_future() @@ -804,20 +801,20 @@ impl LamellarArrayReduce for GenericAtomicArray LamellarArrayArithmeticReduce for GenericAtomicArray { - fn sum(&self) -> Pin>> { + fn sum(&self) -> Pin + Send>> { self.reduce("sum") } - fn prod(&self) -> Pin>> { + fn prod(&self) -> Pin + Send>> { self.reduce("prod") } } impl LamellarArrayCompareReduce for GenericAtomicArray { - fn max(&self) -> Pin>> { + fn max(&self) -> Pin + Send>> { self.reduce("max") } - fn min(&self) -> Pin>> { + fn min(&self) -> Pin + Send>> { self.reduce("min") } } diff --git a/src/array/global_lock_atomic.rs b/src/array/global_lock_atomic.rs index 9f766681..6b9ff9ef 100644 --- a/src/array/global_lock_atomic.rs +++ b/src/array/global_lock_atomic.rs @@ -25,7 +25,7 @@ use std::ops::{Deref, DerefMut}; /// Direct RDMA operations can occur if the appropriate lock is held. #[lamellar_impl::AmDataRT(Clone, Debug)] pub struct GlobalLockArray { - lock: GlobalRwDarc<()>, + pub(crate) lock: GlobalRwDarc<()>, pub(crate) array: UnsafeArray, } @@ -70,27 +70,26 @@ impl GlobalLockByteArrayWeak { /// /// When the instance is dropped the lock is released. #[derive(Debug)] -pub struct GlobalLockMutLocalData<'a, T: Dist> { - data: &'a mut [T], - _index: usize, +pub struct GlobalLockMutLocalData { + pub(crate) array: GlobalLockArray, _lock_guard: GlobalRwDarcWriteGuard<()>, } -// impl Drop for GlobalLockMutLocalData<'_, T>{ +// impl Drop for GlobalLockMutLocalData{ // fn drop(&mut self){ // println!("release lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); // } // } -impl Deref for GlobalLockMutLocalData<'_, T> { +impl Deref for GlobalLockMutLocalData { type Target = [T]; fn deref(&self) -> &Self::Target { - self.data + unsafe { self.array.array.local_as_mut_slice() } } } -impl DerefMut for GlobalLockMutLocalData<'_, T> { +impl DerefMut for GlobalLockMutLocalData { fn deref_mut(&mut self) -> &mut Self::Target { - self.data + unsafe { self.array.array.local_as_mut_slice() } } } @@ -103,27 +102,26 @@ impl DerefMut for GlobalLockMutLocalData<'_, T> { /// /// When each PE drops its instance, the lock is release. #[derive(Debug)] -pub struct GlobalLockCollectiveMutLocalData<'a, T: Dist> { - data: &'a mut [T], - _index: usize, +pub struct GlobalLockCollectiveMutLocalData { + pub(crate) array: GlobalLockArray, _lock_guard: GlobalRwDarcCollectiveWriteGuard<()>, } -// impl Drop for GlobalLockCollectiveMutLocalData<'_, T>{ +// impl Drop for GlobalLockCollectiveMutLocalData{ // fn drop(&mut self){ // println!("release lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); // } // } -impl Deref for GlobalLockCollectiveMutLocalData<'_, T> { +impl Deref for GlobalLockCollectiveMutLocalData { type Target = [T]; fn deref(&self) -> &Self::Target { - self.data + unsafe { self.array.array.local_as_mut_slice() } } } -impl DerefMut for GlobalLockCollectiveMutLocalData<'_, T> { +impl DerefMut for GlobalLockCollectiveMutLocalData { fn deref_mut(&mut self) -> &mut Self::Target { - self.data + unsafe { self.array.array.local_as_mut_slice() } } } @@ -135,33 +133,29 @@ impl DerefMut for GlobalLockCollectiveMutLocalData<'_, T> { /// (allowing for the safe deref into `&[T]`), preventing any local or remote write access. /// /// When the instance is dropped the lock is released. 
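The hunks that follow replace the lifetime-carrying guards (`GlobalLockLocalData<'a, T>` and friends) with guards that own a clone of the array, and split each accessor into a `blocking_*` variant for synchronous code and an `async fn` variant for use inside tasks. A small usage sketch of the resulting API, mirroring the doc examples added below (array length, element type, and the printed values are just placeholders):

```
use lamellar::array::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();
    let my_pe = world.my_pe();
    let array: GlobalLockArray<usize> = GlobalLockArray::new(&world, 100, Distribution::Cyclic);

    // Synchronous context: blocks the calling thread until the global read
    // lock is acquired; the guard releases the lock when dropped.
    let local_data = array.blocking_read_local_data();
    println!("PE{my_pe} local len: {}", local_data.len());
    drop(local_data);

    // Asynchronous context: the same accessor exists as an `async fn`.
    world.clone().block_on(async move {
        let local_data = array.read_local_data().await;
        let local_sum: usize = local_data.iter().sum();
        println!("PE{my_pe} local sum: {local_sum}");
    });
}
```

Because the new guards hold no borrow of the array, they can be moved into `'static` futures, which appears to be the motivation for dropping the lifetime parameter.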
-pub struct GlobalLockLocalData<'a, T: Dist> { +pub struct GlobalLockLocalData { pub(crate) array: GlobalLockArray, - pub(crate) data: &'a [T], - index: usize, lock: GlobalRwDarc<()>, lock_guard: GlobalRwDarcReadGuard<()>, } -impl<'a, T: Dist + std::fmt::Debug> std::fmt::Debug for GlobalLockLocalData<'a, T> { +impl std::fmt::Debug for GlobalLockLocalData { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{:?}", self.data) + write!(f, "{:?}", self.deref()) } } -impl<'a, T: Dist> Clone for GlobalLockLocalData<'a, T> { +impl Clone for GlobalLockLocalData { fn clone(&self) -> Self { GlobalLockLocalData { array: self.array.clone(), - data: self.data, - index: self.index, lock: self.lock.clone(), lock_guard: self.lock_guard.clone(), } } } -impl<'a, T: Dist> GlobalLockLocalData<'a, T> { +impl GlobalLockLocalData { /// Convert into a smaller sub range of the local data, the original read lock is transfered to the new sub data to mainitain safety guarantees /// /// # Examples @@ -177,27 +171,30 @@ impl<'a, T: Dist> GlobalLockLocalData<'a, T> { /// let sub_data = local_data.clone().into_sub_data(10,20); // clone() essentially increases the references to the read lock by 1. /// assert_eq!(local_data[10],sub_data[0]); ///``` - pub fn into_sub_data(self, start: usize, end: usize) -> GlobalLockLocalData<'a, T> { + pub fn into_sub_data(self, start: usize, end: usize) -> GlobalLockLocalData { GlobalLockLocalData { - array: self.array.clone(), - data: &self.data[start..end], - index: 0, + array: self.array.sub_array(start..end), lock: self.lock, lock_guard: self.lock_guard, } } } -impl<'a, T: Dist + serde::Serialize> serde::Serialize for GlobalLockLocalData<'a, T> { +impl serde::Serialize for GlobalLockLocalData { fn serialize(&self, serializer: S) -> Result where S: serde::Serializer, { - self.data.serialize(serializer) + unsafe { self.array.array.local_as_mut_slice() }.serialize(serializer) } } -impl<'a, T: Dist> Iterator for GlobalLockLocalData<'a, T> { +pub struct GlobalLockLocalDataIter<'a, T: Dist> { + data: &'a [T], + index: usize, +} + +impl<'a, T: Dist> Iterator for GlobalLockLocalDataIter<'a, T> { type Item = &'a T; fn next(&mut self) -> Option { if self.index < self.data.len() { @@ -209,11 +206,22 @@ impl<'a, T: Dist> Iterator for GlobalLockLocalData<'a, T> { } } -impl Deref for GlobalLockLocalData<'_, T> { +impl<'a, T: Dist> IntoIterator for &'a GlobalLockLocalData { + type Item = &'a T; + type IntoIter = GlobalLockLocalDataIter<'a, T>; + fn into_iter(self) -> Self::IntoIter { + GlobalLockLocalDataIter { + data: unsafe { self.array.array.local_as_mut_slice() }, + index: 0, + } + } +} + +impl Deref for GlobalLockLocalData { type Target = [T]; fn deref(&self) -> &Self::Target { - self.data + unsafe { self.array.array.local_as_mut_slice() } } } @@ -268,36 +276,37 @@ impl GlobalLockArray { } } - // #[doc(alias("One-sided", "onesided"))] - // /// Return the calling PE's local data as a [GlobalLockLocalData], which allows safe immutable access to local elements. - // /// - // /// Calling this function will result in a local read lock being captured on the array - // /// - // /// This function is blocking and intended to be called from non asynchronous contexts. - // /// Calling within an asynchronous block may lead to deadlock. 
- // /// - // /// # One-sided Operation - // /// Only returns local data on the calling PE - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); - // /// - // /// let local_data = array.read_local_data(); - // /// println!("PE{my_pe} data: {local_data:?}"); - // ///``` - // pub fn read_local_data(&self) -> GlobalLockLocalData<'_, T> { - // GlobalLockLocalData { - // array: self.clone(), - // data: unsafe { self.array.local_as_mut_slice() }, - // index: 0, - // lock: self.lock.clone(), - // lock_guard: self.lock.read(), - // } - // } + #[doc(alias("One-sided", "onesided"))] + /// Return the calling PE's local data as a [GlobalLockLocalData], which allows safe immutable access to local elements. + /// + /// Calling this function will result in a local read lock being captured on the array + /// + /// This function is blocking and intended to be called from non asynchronous contexts. + /// Calling within an asynchronous block may lead to deadlock. + /// + /// # One-sided Operation + /// Only returns local data on the calling PE + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// + /// let local_data = array.blocking_read_local_data(); + /// println!("PE{my_pe} data: {local_data:?}"); + ///``` + pub fn blocking_read_local_data(&self) -> GlobalLockLocalData { + let self_clone: GlobalLockArray = self.clone(); + self.block_on(async move { + GlobalLockLocalData { + array: self_clone.clone(), + lock: self_clone.lock.clone(), + lock_guard: self_clone.lock.read().await, + } + }) + } #[doc(alias("One-sided", "onesided"))] /// Return the calling PE's local data as a [GlobalLockLocalData], which allows safe immutable access to local elements. @@ -314,52 +323,54 @@ impl GlobalLockArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); + /// world.clone().block_on(async move { /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); /// - /// let local_data = array.block_on(array.read_local_data()); + /// let local_data = array.read_local_data().await; /// println!("PE{my_pe} data: {local_data:?}"); + /// }); ///``` - pub async fn read_local_data(&self) -> GlobalLockLocalData<'_, T> { + pub async fn read_local_data(&self) -> GlobalLockLocalData { GlobalLockLocalData { array: self.clone(), - data: unsafe { self.array.local_as_mut_slice() }, - index: 0, lock: self.lock.clone(), lock_guard: self.lock.read().await, } } - // #[doc(alias("One-sided", "onesided"))] - // /// Return the calling PE's local data as a [GlobalLockMutLocalData], which allows safe mutable access to local elements. - // /// - // /// Calling this function will result in the global write lock being captured on the array. - // ///. - // /// This function is blocking and intended to be called from non asynchronous contexts. - // /// Calling within an asynchronous block may lead to deadlock. 
- // /// - // /// # One-sided Operation - // /// Only returns (mutable) local data on the calling PE - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); - // /// - // /// let local_data = array.write_local_data(); - // /// println!("PE{my_pe} data: {local_data:?}"); - // ///``` - // pub fn write_local_data(&self) -> GlobalLockMutLocalData<'_, T> { - // let lock = self.lock.write(); - // let data = GlobalLockMutLocalData { - // data: unsafe { self.array.local_as_mut_slice() }, - // _index: 0, - // _lock_guard: lock, - // }; - // // println!("got lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); - // data - // } + #[doc(alias("One-sided", "onesided"))] + /// Return the calling PE's local data as a [GlobalLockMutLocalData], which allows safe mutable access to local elements. + /// + /// Calling this function will result in the global write lock being captured on the array. + ///. + /// This function is blocking and intended to be called from non asynchronous contexts. + /// Calling within an asynchronous block may lead to deadlock. + /// + /// # One-sided Operation + /// Only returns (mutable) local data on the calling PE + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// + /// let local_data = array.blocking_write_local_data(); + /// println!("PE{my_pe} data: {local_data:?}"); + ///``` + pub fn blocking_write_local_data(&self) -> GlobalLockMutLocalData { + let self_clone: GlobalLockArray = self.clone(); + self.block_on(async move { + let lock = self_clone.lock.write().await; + let data = GlobalLockMutLocalData { + array: self_clone, + _lock_guard: lock, + }; + // println!("got lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); + data + }) + } #[doc(alias("One-sided", "onesided"))] /// Return the calling PE's local data as a [GlobalLockMutLocalData], which allows safe mutable access to local elements. @@ -376,23 +387,23 @@ impl GlobalLockArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// world.clone().block_on(async move { + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); /// - /// let local_data = array.block_on(array.write_local_data()); - /// println!("PE{my_pe} data: {local_data:?}"); + /// let local_data = array.write_local_data().await; + /// println!("PE{my_pe} data: {local_data:?}"); + /// }); ///``` - pub async fn write_local_data(&self) -> GlobalLockMutLocalData<'_, T> { + pub async fn write_local_data(&self) -> GlobalLockMutLocalData { let lock = self.lock.write().await; let data = GlobalLockMutLocalData { - data: unsafe { self.array.local_as_mut_slice() }, - _index: 0, + array: self.clone(), _lock_guard: lock, }; // println!("got lock! 
{:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); data } - #[doc(alias("Collective"))] /// Return the calling PE's local data as a [GlobalLockMutLocalData], which allows safe mutable access to local elements. /// /// Calling this function will result in the collective write lock being captured on the array @@ -409,97 +420,54 @@ impl GlobalLockArray { /// let my_pe = world.my_pe(); /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); /// - /// let local_data = array.block_on(array.collective_write_local_data()); + /// let local_data = array.blocking_collective_write_local_data(); /// println!("PE{my_pe} data: {local_data:?}"); ///``` - pub async fn collective_write_local_data(&self) -> GlobalLockCollectiveMutLocalData<'_, T> { + pub fn blocking_collective_write_local_data(&self) -> GlobalLockCollectiveMutLocalData { + let self_clone: GlobalLockArray = self.clone(); + self.block_on(async move { + let lock = self_clone.lock.collective_write().await; + let data = GlobalLockCollectiveMutLocalData { + array: self_clone, + _lock_guard: lock, + }; + // println!("got lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); + data + }) + } + + #[doc(alias("Collective"))] + /// Return the calling PE's local data as a [GlobalLockMutLocalData], which allows safe mutable access to local elements. + /// + /// Calling this function will result in the collective write lock being captured on the array + /// + /// # Collective Operation + /// All PEs associated with this array must enter the call, otherwise deadlock will occur. + /// Upon return every PE will hold a special collective write lock so that they can all access their local data simultaneous + /// This lock prevents any other access from occuring on the array until it is dropped on all the PEs. + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// world.clone().block_on(async move { + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// + /// let local_data = array.collective_write_local_data().await; + /// println!("PE{my_pe} data: {local_data:?}"); + /// }); + ///``` + pub async fn collective_write_local_data(&self) -> GlobalLockCollectiveMutLocalData { let lock = self.lock.collective_write().await; let data = GlobalLockCollectiveMutLocalData { - data: unsafe { self.array.local_as_mut_slice() }, - _index: 0, + array: self.clone(), _lock_guard: lock, }; // println!("got lock! 
{:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); data } - // #[doc(hidden)] //todo create a custom macro to emit a warning saying use read_local_slice/write_local_slice intead - // pub(crate) async fn local_as_slice(&self) -> GlobalLockLocalData<'_, T> { - // let the_lock = self.lock.read().await; - // GlobalLockLocalData { - // array: self.clone(), - // data: unsafe { self.array.local_as_mut_slice() }, - // index: 0, - // lock: self.lock.clone(), - // lock_guard: the_lock, - // } - // } - // #[doc(hidden)] - // pub unsafe fn local_as_mut_slice(&self) -> &mut [T] { - // self.array.local_as_mut_slice() - // } - - // #[doc(hidden)] - // pub(crate) async fn local_as_mut_slice(&self) -> GlobalLockMutLocalData<'_, T> { - // let the_lock = self.lock.write().await; - // let lock = GlobalLockMutLocalData { - // data: unsafe { self.array.local_as_mut_slice() }, - // _index: 0, - // _lock_guard: the_lock, - // }; - // // println!("have lla write lock"); - // // println!("got lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); - // lock - // } - - // #[doc(alias("One-sided", "onesided"))] - // /// Return the calling PE's local data as a [GlobalLockLocalData], which allows safe immutable access to local elements. - // /// - // /// Calling this function will result in a local read lock being captured on the array - // /// - // /// While this call is safe, it may be more clear to use the [read_local_data()][GlobalLockArray::read_local_data] function. - // /// - // /// # One-sided Operation - // /// Only returns local data on the calling PE - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); - // /// - // /// let local_data = array.block_on(array.local_data()); - // /// println!("PE{my_pe} data: {local_data:?}"); - // ///``` - // pub async fn local_data(&self) -> GlobalLockLocalData<'_, T> { - // self.local_as_slice().await - // } - - // #[doc(alias("One-sided", "onesided"))] - // /// Return the calling PE's local data as a [GlobalLockMutLocalData], which allows safe immutable access to local elements. - // /// - // /// Calling this function will result in a local read lock being captured on the array - // /// - // /// While this call is safe, it may be more clear to use the [write_local_data()][GlobalLockArray::write_local_data] function. 
- // /// - // /// # One-sided Operation - // /// Only returns local data on the calling PE - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); - // /// - // /// let local_data = array.block_on(array.mut_local_data()); - // /// println!("PE{my_pe} data: {local_data:?}"); - // ///``` - // pub async fn mut_local_data(&self) -> GlobalLockMutLocalData<'_, T> { - // self.local_as_mut_slice().await - // } - #[doc(hidden)] pub unsafe fn __local_as_slice(&self) -> &[T] { self.array.local_as_mut_slice() @@ -834,10 +802,7 @@ impl LamellarArray for GlobalLockArray { self.array.wait_all() // println!("done in wait all {:?}",std::time::SystemTime::now()); } - fn block_on(&self, f: F) -> F::Output - where - F: Future, - { + fn block_on(&self, f: F) -> F::Output { self.array.block_on(f) } fn pe_and_offset_for_global_index(&self, index: usize) -> Option<(usize, usize)> { @@ -938,8 +903,9 @@ impl LamellarRequest for GlobalLockArrayReduceHandle { } impl LamellarArrayReduce for GlobalLockArray { - fn reduce(&self, op: &str) -> Pin>> { - let lock = self.array.block_on(self.lock.read()); + fn reduce(&self, op: &str) -> Pin + Send>> { + let lock: GlobalRwDarc<()> = self.lock.clone(); + let lock = self.array.block_on(async move { lock.read().await }); Box::new(GlobalLockArrayReduceHandle { req: self.array.reduce_data(op, self.clone().into()), _lock_guard: lock, @@ -950,20 +916,20 @@ impl LamellarArrayReduce for GlobalLockArray { impl LamellarArrayArithmeticReduce for GlobalLockArray { - fn sum(&self) -> Pin>> { + fn sum(&self) -> Pin + Send>> { self.reduce("sum") } - fn prod(&self) -> Pin>> { + fn prod(&self) -> Pin + Send>> { self.reduce("prod") } } impl LamellarArrayCompareReduce for GlobalLockArray { - fn max(&self) -> Pin>> { + fn max(&self) -> Pin + Send>> { self.reduce("max") } - fn min(&self) -> Pin>> { + fn min(&self) -> Pin + Send>> { self.reduce("min") } } diff --git a/src/array/global_lock_atomic/iteration.rs b/src/array/global_lock_atomic/iteration.rs index 37a4c168..70c4db61 100644 --- a/src/array/global_lock_atomic/iteration.rs +++ b/src/array/global_lock_atomic/iteration.rs @@ -301,7 +301,8 @@ impl LamellarArrayIterators for GlobalLockArray { type OnesidedIter = OneSidedIter<'static, T, Self>; fn dist_iter(&self) -> Self::DistIter { - let lock = self.array.block_on(self.lock.read()); + let lock: GlobalRwDarc<()> = self.lock.clone(); + let lock = self.array.block_on(async move { lock.read().await }); self.barrier(); GlobalLockDistIter { data: self.clone(), @@ -313,7 +314,8 @@ impl LamellarArrayIterators for GlobalLockArray { } fn local_iter(&self) -> Self::LocalIter { - let lock = self.array.block_on(self.lock.read()); + let lock: GlobalRwDarc<()> = self.lock.clone(); + let lock = self.array.block_on(async move { lock.read().await }); GlobalLockLocalIter { data: self.clone(), lock: lock, @@ -341,7 +343,11 @@ impl LamellarArrayMutIterators for GlobalLockArray { type LocalIter = GlobalLockLocalIterMut; fn dist_iter_mut(&self) -> Self::DistIter { - let lock = Arc::new(self.array.block_on(self.lock.collective_write())); + let lock: GlobalRwDarc<()> = self.lock.clone(); + let lock = Arc::new( + self.array + .block_on(async move { lock.collective_write().await }), + ); self.barrier(); // println!("dist_iter thread {:?} got lock",std::thread::current().id()); GlobalLockDistIterMut { @@ -354,7 
+360,8 @@ impl LamellarArrayMutIterators for GlobalLockArray { } fn local_iter_mut(&self) -> Self::LocalIter { - let lock = Arc::new(self.array.block_on(self.lock.write())); + let lock: GlobalRwDarc<()> = self.lock.clone(); + let lock = Arc::new(self.array.block_on(async move { lock.write().await })); GlobalLockLocalIterMut { data: self.clone(), lock: lock, diff --git a/src/array/iterator/distributed_iterator/consumer/count.rs b/src/array/iterator/distributed_iterator/consumer/count.rs index 2d58ceca..76229c8f 100644 --- a/src/array/iterator/distributed_iterator/consumer/count.rs +++ b/src/array/iterator/distributed_iterator/consumer/count.rs @@ -4,7 +4,6 @@ use crate::array::iterator::distributed_iterator::DistributedIterator; use crate::array::iterator::IterRequest; use crate::lamellar_request::LamellarRequest; use crate::lamellar_team::LamellarTeamRT; -use crate::scheduler::SchedulerQueue; use crate::Darc; use async_trait::async_trait; @@ -72,7 +71,7 @@ impl LamellarAm for UpdateCntAm { } impl RemoteIterCountHandle { - async fn reduce_remote_counts(&self, local_cnt: usize, cnt: Darc) -> usize { + async fn reduce_remote_counts(self, local_cnt: usize, cnt: Darc) -> usize { self.team .exec_am_all(UpdateCntAm { remote_cnt: local_cnt, @@ -111,6 +110,7 @@ impl IterRequest for RemoteIterCountHandle { .sum::(); self.team .scheduler + .clone() .block_on(self.reduce_remote_counts(count, cnt)) } } diff --git a/src/array/iterator/distributed_iterator/consumer/reduce.rs b/src/array/iterator/distributed_iterator/consumer/reduce.rs index de7dede8..ea2ce0b8 100644 --- a/src/array/iterator/distributed_iterator/consumer/reduce.rs +++ b/src/array/iterator/distributed_iterator/consumer/reduce.rs @@ -72,7 +72,7 @@ where F: Fn(T, T) -> T + SyncSend + Clone + 'static, { fn reduce_remote_vals(&self, local_val: Option) -> Option { - self.team.barrier(); + self.team.tasking_barrier(); let local_vals = UnsafeArray::>::new(&self.team, self.team.num_pes, Distribution::Block); unsafe { diff --git a/src/array/iterator/one_sided_iterator/buffered.rs b/src/array/iterator/one_sided_iterator/buffered.rs index 8a42178a..09650d96 100644 --- a/src/array/iterator/one_sided_iterator/buffered.rs +++ b/src/array/iterator/one_sided_iterator/buffered.rs @@ -1,7 +1,5 @@ use crate::array::iterator::one_sided_iterator::*; use crate::array::LamellarArrayRequest; -// use crate::LamellarArray; -// use crate::scheduler::SchedulerQueue; use crate::memregion::OneSidedMemoryRegion; use std::collections::VecDeque; use std::ops::Deref; @@ -91,8 +89,6 @@ impl Deref for BufferedItem { } } - - impl OneSidedIterator for Buffered where I: OneSidedIterator + Send, diff --git a/src/array/local_lock_atomic.rs b/src/array/local_lock_atomic.rs index 9467692d..d897e922 100644 --- a/src/array/local_lock_atomic.rs +++ b/src/array/local_lock_atomic.rs @@ -73,27 +73,26 @@ impl LocalLockByteArrayWeak { /// /// When the instance is dropped the lock is released. #[derive(Debug)] -pub struct LocalLockMutLocalData<'a, T: Dist> { - data: &'a mut [T], - _index: usize, - _lock_guard: RwLockWriteGuardArc>, +pub struct LocalLockMutLocalData { + array: LocalLockArray, + _lock_guard: RwLockWriteGuardArc<()>, } -// impl Drop for LocalLockMutLocalData<'_, T> { +// impl Drop for LocalLockMutLocalData { // fn drop(&mut self) { // // println!("release lock! 
{:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); // } // } -impl Deref for LocalLockMutLocalData<'_, T> { +impl Deref for LocalLockMutLocalData { type Target = [T]; fn deref(&self) -> &Self::Target { - self.data + unsafe { self.array.array.local_as_mut_slice() } } } -impl DerefMut for LocalLockMutLocalData<'_, T> { +impl DerefMut for LocalLockMutLocalData { fn deref_mut(&mut self) -> &mut Self::Target { - self.data + unsafe { self.array.array.local_as_mut_slice() } } } @@ -106,28 +105,24 @@ impl DerefMut for LocalLockMutLocalData<'_, T> { /// /// When the instance is dropped the lock is released. #[derive(Debug)] -pub struct LocalLockLocalData<'a, T: Dist> { +pub struct LocalLockLocalData { pub(crate) array: LocalLockArray, - pub(crate) data: &'a [T], - index: usize, lock: LocalRwDarc<()>, - lock_guard: Arc>>, + lock_guard: Arc>, } -impl<'a, T: Dist> Clone for LocalLockLocalData<'a, T> { +impl<'a, T: Dist> Clone for LocalLockLocalData { fn clone(&self) -> Self { // println!("getting read lock in LocalLockLocalData clone"); LocalLockLocalData { array: self.array.clone(), - data: self.data, - index: self.index, lock: self.lock.clone(), lock_guard: self.lock_guard.clone(), } } } -// impl<'a, T: Dist> Drop for LocalLockLocalData<'a, T> { +// impl<'a, T: Dist> Drop for LocalLockLocalData { // fn drop(&mut self) { // println!( // "dropping read lock {:?}", @@ -136,13 +131,13 @@ impl<'a, T: Dist> Clone for LocalLockLocalData<'a, T> { // } // } -// impl<'a, T: Dist> Drop for LocalLockMutLocalData<'a, T> { +// impl<'a, T: Dist> Drop for LocalLockMutLocalData { // fn drop(&mut self) { // println!("dropping write lock"); // } // } -impl<'a, T: Dist> LocalLockLocalData<'a, T> { +impl<'a, T: Dist> LocalLockLocalData { /// Convert into a smaller sub range of the local data, the original read lock is transfered to the new sub data to mainitain safety guarantees /// /// # Examples @@ -158,27 +153,30 @@ impl<'a, T: Dist> LocalLockLocalData<'a, T> { /// let sub_data = local_data.clone().into_sub_data(10,20); // clone() essentially increases the references to the read lock by 1. 
/// assert_eq!(local_data[10],sub_data[0]); ///``` - pub fn into_sub_data(self, start: usize, end: usize) -> LocalLockLocalData<'a, T> { + pub fn into_sub_data(self, start: usize, end: usize) -> LocalLockLocalData { LocalLockLocalData { - array: self.array.clone(), - data: &self.data[start..end], - index: 0, + array: self.array.sub_array(start..end), lock: self.lock.clone(), lock_guard: self.lock_guard.clone(), } } } -impl<'a, T: Dist + serde::Serialize> serde::Serialize for LocalLockLocalData<'a, T> { +impl<'a, T: Dist + serde::Serialize> serde::Serialize for LocalLockLocalData { fn serialize(&self, serializer: S) -> Result where S: serde::Serializer, { - self.data.serialize(serializer) + unsafe { self.array.array.local_as_mut_slice() }.serialize(serializer) } } -impl<'a, T: Dist> Iterator for LocalLockLocalData<'a, T> { +pub struct LocalLockLocalDataIter<'a, T: Dist> { + data: &'a [T], + index: usize, +} + +impl<'a, T: Dist> Iterator for LocalLockLocalDataIter<'a, T> { type Item = &'a T; fn next(&mut self) -> Option { if self.index < self.data.len() { @@ -190,11 +188,22 @@ impl<'a, T: Dist> Iterator for LocalLockLocalData<'a, T> { } } -impl Deref for LocalLockLocalData<'_, T> { +impl<'a, T: Dist> IntoIterator for &'a LocalLockLocalData { + type Item = &'a T; + type IntoIter = LocalLockLocalDataIter<'a, T>; + fn into_iter(self) -> Self::IntoIter { + LocalLockLocalDataIter { + data: unsafe { self.array.array.local_as_mut_slice() }, + index: 0, + } + } +} + +impl Deref for LocalLockLocalData { type Target = [T]; fn deref(&self) -> &Self::Target { - self.data + unsafe { self.array.array.local_as_mut_slice() } } } @@ -250,34 +259,35 @@ impl LocalLockArray { } } - // #[doc(alias("One-sided", "onesided"))] - // /// Return the calling PE's local data as a [LocalLockLocalData], which allows safe immutable access to local elements. - // /// - // /// Calling this function will result in a local read lock being captured on the array - // /// - // /// # One-sided Operation - // /// Only returns local data on the calling PE - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); - // /// - // /// let local_data = array.read_local_data(); - // /// println!("PE{my_pe} data: {local_data:?}"); - // ///``` - // pub fn read_local_data(&self) -> LocalLockLocalData<'_, T> { - // // println!("getting read lock in read_local_local"); - // LocalLockLocalData { - // array: self.clone(), - // data: unsafe { self.array.local_as_mut_slice() }, - // index: 0, - // lock: self.lock.clone(), - // lock_guard: Arc::new(self.lock.read()), - // } - // } + #[doc(alias("One-sided", "onesided"))] + /// Return the calling PE's local data as a [LocalLockLocalData], which allows safe immutable access to local elements. 
+ /// + /// Calling this function will result in a local read lock being captured on the array + /// + /// # One-sided Operation + /// Only returns local data on the calling PE + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// + /// let local_data = array.blocking_read_local_data(); + /// println!("PE{my_pe} data: {local_data:?}"); + ///``` + pub fn blocking_read_local_data(&self) -> LocalLockLocalData { + // println!("getting read lock in read_local_local"); + let self_clone: LocalLockArray = self.clone(); + self.block_on(async move { + LocalLockLocalData { + array: self_clone.clone(), + lock: self_clone.lock.clone(), + lock_guard: Arc::new(self_clone.lock.read().await), + } + }) + } /// TODO: UPDATE /// Return the calling PE's local data as a [LocalLockLocalData], which allows safe immutable access to local elements. @@ -292,51 +302,53 @@ impl LocalLockArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// world.clone().block_on(async move { + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); /// - /// let local_data = array.block_on(array.read_local_data()); - /// println!("PE{my_pe} data: {local_data:?}"); + /// let local_data = array.read_local_data().await; + /// println!("PE{my_pe} data: {local_data:?}"); + /// }); ///``` - pub async fn read_local_data(&self) -> LocalLockLocalData<'_, T> { + pub async fn read_local_data(&self) -> LocalLockLocalData { // println!("getting read lock in read_local_local"); LocalLockLocalData { array: self.clone(), - data: unsafe { self.array.local_as_mut_slice() }, - index: 0, lock: self.lock.clone(), lock_guard: Arc::new(self.lock.read().await), } } - // #[doc(alias("One-sided", "onesided"))] - // /// Return the calling PE's local data as a [LocalLockMutLocalData], which allows safe mutable access to local elements. - // /// - // /// Calling this function will result in the local write lock being captured on the array - // /// - // /// # One-sided Operation - // /// Only returns (mutable) local data on the calling PE - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); - // /// - // /// let local_data = array.write_local_data(); - // /// println!("PE{my_pe} data: {local_data:?}"); - // ///``` - // pub fn write_local_data(&self) -> LocalLockMutLocalData<'_, T> { - // // println!("getting write lock in write_local_data"); - // let lock = self.lock.write(); - // let data = LocalLockMutLocalData { - // data: unsafe { self.array.local_as_mut_slice() }, - // _index: 0, - // _lock_guard: lock, - // }; - // // println!("got lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); - // data - // } + #[doc(alias("One-sided", "onesided"))] + /// Return the calling PE's local data as a [LocalLockMutLocalData], which allows safe mutable access to local elements. 
+ /// + /// Calling this function will result in the local write lock being captured on the array + /// + /// # One-sided Operation + /// Only returns (mutable) local data on the calling PE + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// + /// let local_data = array.blocking_write_local_data(); + /// println!("PE{my_pe} data: {local_data:?}"); + ///``` + pub fn blocking_write_local_data(&self) -> LocalLockMutLocalData { + // println!("getting write lock in write_local_data"); + let self_clone: LocalLockArray = self.clone(); + self.block_on(async move { + let lock = self_clone.lock.write().await; + let data = LocalLockMutLocalData { + array: self_clone, + _lock_guard: lock, + }; + // println!("got lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); + data + }) + } #[doc(alias("One-sided", "onesided"))] /// TODO: UPDATE @@ -352,150 +364,24 @@ impl LocalLockArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// world.clone().block_on(async move { + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); /// - /// let local_data = array.block_on(array.write_local_data()); - /// println!("PE{my_pe} data: {local_data:?}"); + /// let local_data = array.write_local_data().await; + /// println!("PE{my_pe} data: {local_data:?}"); + /// }); ///``` - pub async fn write_local_data(&self) -> LocalLockMutLocalData<'_, T> { + pub async fn write_local_data(&self) -> LocalLockMutLocalData { // println!("getting write lock in write_local_data"); let lock = self.lock.write().await; let data = LocalLockMutLocalData { - data: unsafe { self.array.local_as_mut_slice() }, - _index: 0, + array: self.clone(), _lock_guard: lock, }; // println!("got lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); data } - // #[doc(hidden)] //todo create a custom macro to emit a warning saying use read_local_slice/write_local_slice intead - // pub(crate) async fn local_as_slice(&self) -> LocalLockLocalData<'_, T> { - // // println!("getting read lock in local_as_slice"); - // let lock = LocalLockLocalData { - // array: self.clone(), - // data: unsafe { self.array.local_as_mut_slice() }, - // index: 0, - // lock: self.lock.clone(), - // lock_guard: Arc::new(self.lock.read().await), - // }; - // // println!("got read lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); - // lock - // } - // #[doc(hidden)] - // pub unsafe fn local_as_mut_slice(&self) -> &mut [T] { - // self.array.local_as_mut_slice() - // } - - // #[doc(hidden)] - // pub(crate) async fn local_as_mut_slice(&self) -> LocalLockMutLocalData<'_, T> { - // // println!("getting write lock in local_as_mut_slice"); - // let the_lock = self.lock.write().await; - // let lock = LocalLockMutLocalData { - // data: unsafe { self.array.local_as_mut_slice() }, - // _index: 0, - // _lock_guard: the_lock, - // }; - // // println!("have lla write lock"); - // // println!("got write lock! 
{:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); - // lock - // } - - // #[doc(alias("One-sided", "onesided"))] - // /// Return the calling PE's local data as a [LocalLockLocalData], which allows safe immutable access to local elements. - // /// - // /// Calling this function will result in a local read lock being captured on the array - // /// - // /// While this call is safe, it may be more clear to use the [read_local_data()][LocalLockArray::read_local_data] function. - // /// - // /// # One-sided Operation - // /// Only returns local data on the calling PE - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); - // /// - // /// let local_data = array.local_data(); - // /// println!("PE{my_pe} data: {local_data:?}"); - // ///``` - // pub fn local_data(&self) -> LocalLockLocalData<'_, T> { - // self.local_as_slice() - // } - - // /// Return the calling PE's local data as a [LocalLockLocalData], which allows safe immutable access to local elements. - // /// - // /// Calling this function will result in a local read lock being captured on the array - // /// - // /// While this call is safe, it may be more clear to use the [read_local_data()][LocalLockArray::read_local_data] function. - // /// - // /// # One-sided Operation - // /// Only returns local data on the calling PE - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); - // /// - // /// let local_data = array.block_on(array.local_data()); - // /// println!("PE{my_pe} data: {local_data:?}"); - // ///``` - // pub async fn local_data(&self) -> LocalLockLocalData<'_, T> { - // self.read_local_data().await - // } - - // #[doc(alias("One-sided", "onesided"))] - // /// Return the calling PE's local data as a [LocalLockMutLocalData], which allows safe immutable access to local elements. - // /// - // /// Calling this function will result in a local read lock being captured on the array - // /// - // /// While this call is safe, it may be more clear to use the [write_local_data()][LocalLockArray::write_local_data] function. - // /// - // /// # One-sided Operation - // /// Only returns local data on the calling PE - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); - // /// - // /// let local_data = array.mut_local_data(); - // /// println!("PE{my_pe} data: {local_data:?}"); - // ///``` - // pub fn mut_local_data(&self) -> LocalLockMutLocalData<'_, T> { - // self.local_as_mut_slice() - // } - - // /// TODO: UPDATE - // /// Return the calling PE's local data as a [LocalLockMutLocalData], which allows safe immutable access to local elements. - // /// - // /// Calling this function will result in a local read lock being captured on the array - // /// - // /// While this call is safe, it may be more clear to use the [write_local_data()][LocalLockArray::write_local_data] function. 
- // /// - // /// # One-sided Operation - // /// Only returns local data on the calling PE - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); - // /// - // /// let local_data = array.block_on(array.mut_local_data()); - // /// println!("PE{my_pe} data: {local_data:?}"); - // ///``` - // pub async fn mut_local_data(&self) -> LocalLockMutLocalData<'_, T> { - // self.write_local_data().await - // } - #[doc(hidden)] pub unsafe fn __local_as_slice(&self) -> &[T] { self.array.local_as_mut_slice() @@ -830,10 +716,7 @@ impl LamellarArray for LocalLockArray { self.array.wait_all() // println!("done in wait all {:?}",std::time::SystemTime::now()); } - fn block_on(&self, f: F) -> F::Output - where - F: Future, - { + fn block_on(&self, f: F) -> F::Output { self.array.block_on(f) } fn pe_and_offset_for_global_index(&self, index: usize) -> Option<(usize, usize)> { @@ -916,7 +799,7 @@ impl ArrayPrint for LocalLockArray { #[doc(hidden)] pub struct LocalLockArrayReduceHandle { req: Box>, - _lock_guard: RwLockReadGuardArc>, + _lock_guard: RwLockReadGuardArc<()>, } #[async_trait] @@ -931,8 +814,9 @@ impl LamellarRequest for LocalLockArrayReduceHandle { } impl LamellarArrayReduce for LocalLockArray { - fn reduce(&self, op: &str) -> Pin>> { - let lock = self.array.block_on(self.lock.read()); + fn reduce(&self, op: &str) -> Pin + Send>> { + let lock: LocalRwDarc<()> = self.lock.clone(); + let lock = self.array.block_on(async move { lock.read().await }); Box::new(LocalLockArrayReduceHandle { req: self.array.reduce_data(op, self.clone().into()), _lock_guard: lock, @@ -943,20 +827,20 @@ impl LamellarArrayReduce for LocalLockArray { impl LamellarArrayArithmeticReduce for LocalLockArray { - fn sum(&self) -> Pin>> { + fn sum(&self) -> Pin + Send>> { self.reduce("sum") } - fn prod(&self) -> Pin>> { + fn prod(&self) -> Pin + Send>> { self.reduce("prod") } } impl LamellarArrayCompareReduce for LocalLockArray { - fn max(&self) -> Pin>> { + fn max(&self) -> Pin + Send>> { self.reduce("max") } - fn min(&self) -> Pin>> { + fn min(&self) -> Pin + Send>> { self.reduce("min") } } diff --git a/src/array/local_lock_atomic/iteration.rs b/src/array/local_lock_atomic/iteration.rs index 6de62980..a1d4479c 100644 --- a/src/array/local_lock_atomic/iteration.rs +++ b/src/array/local_lock_atomic/iteration.rs @@ -20,7 +20,7 @@ use async_lock::{RwLockReadGuardArc, RwLockWriteGuardArc}; #[derive(Clone)] pub struct LocalLockDistIter<'a, T: Dist> { data: LocalLockArray, - lock: Arc>>, + lock: Arc>, cur_i: usize, end_i: usize, _marker: PhantomData<&'a T>, @@ -42,7 +42,7 @@ impl<'a, T: Dist> std::fmt::Debug for LocalLockDistIter<'a, T> { #[derive(Clone)] pub struct LocalLockLocalIter<'a, T: Dist> { data: LocalLockArray, - lock: Arc>>, + lock: Arc>, cur_i: usize, end_i: usize, _marker: PhantomData<&'a T>, @@ -158,7 +158,7 @@ impl IndexedLocalIterator for LocalLockLocalIter<'static, T> #[derive(Clone)] pub struct LocalLockDistIterMut<'a, T: Dist> { data: LocalLockArray, - lock: Arc>>, + lock: Arc>, cur_i: usize, end_i: usize, _marker: PhantomData<&'a T>, @@ -179,7 +179,7 @@ impl<'a, T: Dist> std::fmt::Debug for LocalLockDistIterMut<'a, T> { #[derive(Clone)] pub struct LocalLockLocalIterMut<'a, T: Dist> { data: LocalLockArray, - lock: Arc>>, + lock: Arc>, cur_i: usize, end_i: usize, _marker: PhantomData<&'a T>, @@ -305,7 +305,9 @@ 
impl LamellarArrayIterators for LocalLockArray { type OnesidedIter = OneSidedIter<'static, T, Self>; fn dist_iter(&self) -> Self::DistIter { - let lock = Arc::new(self.array.block_on(self.lock.read())); + // let the_array: LocalLockArray = self.clone(); + let lock: LocalRwDarc<()> = self.lock.clone(); + let lock = Arc::new(self.array.block_on(async move { lock.read().await })); self.barrier(); LocalLockDistIter { data: self.clone(), @@ -317,7 +319,8 @@ impl LamellarArrayIterators for LocalLockArray { } fn local_iter(&self) -> Self::LocalIter { - let lock = Arc::new(self.array.block_on(self.lock.read())); + let lock: LocalRwDarc<()> = self.lock.clone(); + let lock = Arc::new(self.array.block_on(async move { lock.read().await })); LocalLockLocalIter { data: self.clone(), lock: lock, @@ -345,7 +348,8 @@ impl LamellarArrayMutIterators for LocalLockArray { type LocalIter = LocalLockLocalIterMut<'static, T>; fn dist_iter_mut(&self) -> Self::DistIter { - let lock = Arc::new(self.array.block_on(self.lock.write())); + let lock: LocalRwDarc<()> = self.lock.clone(); + let lock = Arc::new(self.array.block_on(async move { lock.write().await })); self.barrier(); // println!("dist_iter thread {:?} got lock",std::thread::current().id()); LocalLockDistIterMut { @@ -359,7 +363,8 @@ impl LamellarArrayMutIterators for LocalLockArray { fn local_iter_mut(&self) -> Self::LocalIter { // println!("trying to get write lock for iter"); - let lock = Arc::new(self.array.block_on(self.lock.write())); + let lock: LocalRwDarc<()> = self.lock.clone(); + let lock = Arc::new(self.array.block_on(async move { lock.write().await })); // println!("got write lock for iter"); LocalLockLocalIterMut { data: self.clone(), diff --git a/src/array/native_atomic.rs b/src/array/native_atomic.rs index 7e0e046b..590f9b48 100644 --- a/src/array/native_atomic.rs +++ b/src/array/native_atomic.rs @@ -1134,10 +1134,7 @@ impl LamellarArray for NativeAtomicArray { self.array.wait_all() // println!("done in wait all {:?}",std::time::SystemTime::now()); } - fn block_on(&self, f: F) -> F::Output - where - F: Future, - { + fn block_on(&self, f: F) -> F::Output { self.array.block_on(f) } fn pe_and_offset_for_global_index(&self, index: usize) -> Option<(usize, usize)> { @@ -1207,7 +1204,7 @@ impl ArrayPrint for NativeAtomicArray { } impl LamellarArrayReduce for NativeAtomicArray { - fn reduce(&self, op: &str) -> Pin>> { + fn reduce(&self, op: &str) -> Pin + Send>> { self.array .reduce_data(op, self.clone().into()) .into_future() @@ -1216,20 +1213,20 @@ impl LamellarArrayReduce for NativeAtomicArray impl LamellarArrayArithmeticReduce for NativeAtomicArray { - fn sum(&self) -> Pin>> { + fn sum(&self) -> Pin + Send>> { self.reduce("sum") } - fn prod(&self) -> Pin>> { + fn prod(&self) -> Pin + Send>> { self.reduce("prod") } } impl LamellarArrayCompareReduce for NativeAtomicArray { - fn max(&self) -> Pin>> { + fn max(&self) -> Pin + Send>> { self.reduce("max") } - fn min(&self) -> Pin>> { + fn min(&self) -> Pin + Send>> { self.reduce("min") } } diff --git a/src/array/operations.rs b/src/array/operations.rs index 619b4a51..e064bd73 100644 --- a/src/array/operations.rs +++ b/src/array/operations.rs @@ -6,7 +6,7 @@ use crate::array::local_lock_atomic::*; use crate::array::native_atomic::*; use crate::array::{AmDist, Dist, LamellarArrayRequest, LamellarEnv, LamellarWriteArray}; use crate::lamellar_request::LamellarRequest; -use crate::scheduler::{Scheduler, SchedulerQueue}; +use crate::scheduler::Scheduler; use crate::LamellarTeamRT; pub(crate) mod access; @@ 
-230,8 +230,8 @@ pub enum OpInputEnum<'a, T: Dist> { Vec(Vec), NativeAtomicLocalData(NativeAtomicLocalData), GenericAtomicLocalData(GenericAtomicLocalData), - LocalLockLocalData(LocalLockLocalData<'a, T>), - GlobalLockLocalData(GlobalLockLocalData<'a, T>), + LocalLockLocalData(LocalLockLocalData), + GlobalLockLocalData(GlobalLockLocalData), // Iter(Box + 'a>), // while it would be convienient to directly use the following, doing so @@ -244,7 +244,7 @@ pub enum OpInputEnum<'a, T: Dist> { // AtomicArray(AtomicArray), } -impl<'a, T: Dist> OpInputEnum<'_, T> { +impl<'a, T: Dist> OpInputEnum<'a, T> { #[tracing::instrument(skip_all)] pub(crate) fn iter(&self) -> Box + '_> { match self { @@ -303,15 +303,15 @@ impl<'a, T: Dist> OpInputEnum<'_, T> { } // #[tracing::instrument(skip_all)] - pub(crate) fn as_vec_chunks(&self, chunk_size: usize) -> Box> + '_> { + pub(crate) fn into_vec_chunks(self, chunk_size: usize) -> Vec> { match self { - OpInputEnum::Val(v) => Box::new(vec![vec![*v]].into_iter()), - OpInputEnum::Slice(s) => Box::new(s.chunks(chunk_size).map(|chunk| chunk.to_vec())), - OpInputEnum::Vec(v) => Box::new(v.chunks(chunk_size).map(|chunk| chunk.to_vec())), + OpInputEnum::Val(v) =>vec![vec![v]], + OpInputEnum::Slice(s) => s.chunks(chunk_size).map(|chunk| chunk.to_vec()).collect(), + OpInputEnum::Vec(v) => v.chunks(chunk_size).map(|chunk| chunk.to_vec()).collect(), OpInputEnum::NativeAtomicLocalData(a) => { let mut data = Vec::with_capacity(chunk_size); - Box::new(a.iter().enumerate().filter_map(move |(i, elem)| { + a.iter().enumerate().filter_map(move |(i, elem)| { data.push(elem.load()); if data.len() == chunk_size || i == a.len() - 1 { let mut new_data = Vec::with_capacity(chunk_size); @@ -320,12 +320,12 @@ impl<'a, T: Dist> OpInputEnum<'_, T> { } else { None } - })) + }).collect() } OpInputEnum::GenericAtomicLocalData(a) => { let mut data = Vec::with_capacity(chunk_size); - Box::new(a.iter().enumerate().filter_map(move |(i, elem)| { + a.iter().enumerate().filter_map(move |(i, elem)| { data.push(elem.load()); if data.len() == chunk_size || i == a.len() - 1 { let mut new_data = Vec::with_capacity(chunk_size); @@ -334,13 +334,13 @@ impl<'a, T: Dist> OpInputEnum<'_, T> { } else { None } - })) + }).collect() } OpInputEnum::LocalLockLocalData(a) => { - Box::new(a.data.chunks(chunk_size).map(|chunk| chunk.to_vec())) + a.chunks(chunk_size).map(|chunk| chunk.to_vec()).collect() } OpInputEnum::GlobalLockLocalData(a) => { - Box::new(a.data.chunks(chunk_size).map(|chunk| chunk.to_vec())) + a.chunks(chunk_size).map(|chunk| chunk.to_vec()).collect() } // OpInputEnum::MemoryRegion(mr) => *unsafe { mr.as_slice() } // .expect("memregion not local") @@ -682,7 +682,7 @@ impl<'a, T: Dist> OpInput<'a, T> for Vec { // } // } -impl<'a, T: Dist> OpInput<'a, T> for &'a LocalLockLocalData<'_, T> { +impl<'a, T: Dist> OpInput<'a, T> for &'a LocalLockLocalData { #[tracing::instrument(skip_all)] fn as_op_input(self) -> (Vec>, usize) { let len = self.len(); @@ -722,7 +722,7 @@ impl<'a, T: Dist> OpInput<'a, T> for &'a LocalLockLocalData<'_, T> { } } -impl<'a, T: Dist> OpInput<'a, T> for &'a GlobalLockLocalData<'_, T> { +impl<'a, T: Dist> OpInput<'a, T> for &'a GlobalLockLocalData { #[tracing::instrument(skip_all)] fn as_op_input(self) -> (Vec>, usize) { let len = self.len(); diff --git a/src/array/read_only.rs b/src/array/read_only.rs index 430e8882..942c2fad 100644 --- a/src/array/read_only.rs +++ b/src/array/read_only.rs @@ -467,7 +467,7 @@ impl From for ReadOnlyArray { } impl LamellarArrayReduce for ReadOnlyArray { 
- fn reduce(&self, op: &str) -> Pin>> { + fn reduce(&self, op: &str) -> Pin + Send>> { self.array .reduce_data(op, self.clone().into()) .into_future() @@ -476,20 +476,20 @@ impl LamellarArrayReduce for ReadOnlyArray { impl LamellarArrayArithmeticReduce for ReadOnlyArray { - fn sum(&self) -> Pin>> { + fn sum(&self) -> Pin + Send>> { self.reduce("sum") } - fn prod(&self) -> Pin>> { + fn prod(&self) -> Pin + Send>> { self.reduce("prod") } } impl LamellarArrayCompareReduce for ReadOnlyArray { - fn max(&self) -> Pin>> { + fn max(&self) -> Pin + Send>> { self.reduce("max") } - fn min(&self) -> Pin>> { + fn min(&self) -> Pin + Send>> { self.reduce("min") } } @@ -550,10 +550,7 @@ impl LamellarArray for ReadOnlyArray { self.array.wait_all() // println!("done in wait all {:?}",std::time::SystemTime::now()); } - fn block_on(&self, f: F) -> F::Output - where - F: Future, - { + fn block_on(&self, f: F) -> F::Output { self.array.block_on(f) } fn pe_and_offset_for_global_index(&self, index: usize) -> Option<(usize, usize)> { diff --git a/src/array/unsafe.rs b/src/array/unsafe.rs index 11ccecf1..de48d4f5 100644 --- a/src/array/unsafe.rs +++ b/src/array/unsafe.rs @@ -11,7 +11,6 @@ use crate::darc::{Darc, DarcMode, WeakDarc}; use crate::lamellae::AllocationType; use crate::lamellar_team::{IntoLamellarTeam, LamellarTeamRT}; use crate::memregion::{Dist, MemoryRegion}; -use crate::scheduler::SchedulerQueue; use crate::LamellarTaskGroup; use core::marker::PhantomData; use std::ops::Bound; @@ -366,9 +365,10 @@ impl UnsafeArray { self.wait_all(); // println!("block on outstanding"); // self.inner.data.print(); + // let the_array: UnsafeArray = self.clone(); + let array_darc = self.inner.data.clone(); self.team_rt() - .block_on(self.inner.data.block_on_outstanding(mode, 0)); - // self.inner.data.print(); + .block_on(array_darc.block_on_outstanding(mode, 1)); //one for this instance of the array } #[doc(alias = "Collective")] @@ -808,10 +808,7 @@ impl LamellarArray for UnsafeArray { // println!("done in wait all {:?}",std::time::SystemTime::now()); } - fn block_on(&self, f: F) -> F::Output - where - F: Future, - { + fn block_on(&self, f: F) -> F::Output { self.inner.data.team.scheduler.block_on(f) } @@ -996,7 +993,7 @@ impl UnsafeArray { /// let sum = array.block_on(array.reduce("sum")); // equivalent to calling array.sum() /// //assert_eq!(array.len()*num_pes,sum); // may or may not fail ///``` - pub unsafe fn reduce(&self, op: &str) -> Pin>> { + pub unsafe fn reduce(&self, op: &str) -> Pin + Send>> { self.reduce_data(op, self.clone().into()).into_future() } @@ -1032,7 +1029,7 @@ impl UnsafeArray { /// let sum = array.block_on(unsafe{array.sum()}); //Safe in this instance as we have ensured no updates are currently happening /// // assert_eq!(array.len()*num_pes,sum);//this may or may not fail ///``` - pub unsafe fn sum(&self) -> Pin>> { + pub unsafe fn sum(&self) -> Pin + Send>> { self.reduce("sum") } @@ -1069,7 +1066,7 @@ impl UnsafeArray { /// let prod = array.block_on(array.prod()); /// assert_eq!((1..=array.len()).product::(),prod); ///``` - pub unsafe fn prod(&self) -> Pin>> { + pub unsafe fn prod(&self) -> Pin + Send>> { self.reduce("prod") } @@ -1100,7 +1097,7 @@ impl UnsafeArray { /// let max = array.block_on(max_req); /// assert_eq!((array.len()-1)*2,max); ///``` - pub unsafe fn max(&self) -> Pin>> { + pub unsafe fn max(&self) -> Pin + Send>> { self.reduce("max") } @@ -1131,7 +1128,7 @@ impl UnsafeArray { /// let min = array.block_on(min_req); /// assert_eq!(0,min); ///``` - pub unsafe fn min(&self) -> 
Pin>> { + pub unsafe fn min(&self) -> Pin + Send>> { self.reduce("min") } } diff --git a/src/array/unsafe/operations.rs b/src/array/unsafe/operations.rs index 6940f7b2..3a614c7c 100644 --- a/src/array/unsafe/operations.rs +++ b/src/array/unsafe/operations.rs @@ -2,7 +2,6 @@ use crate::active_messaging::LamellarArcAm; use crate::array::operations::*; use crate::array::r#unsafe::UnsafeArray; use crate::array::{AmDist, Dist, LamellarArray, LamellarByteArray, LamellarEnv}; -use crate::scheduler::SchedulerQueue; use futures::Future; use parking_lot::Mutex; use std::any::TypeId; @@ -394,12 +393,13 @@ impl UnsafeArray { self.inner.data.array_counters.add_send_req(1); self.inner.data.team.inc_counters(1); let index_vec = index.to_vec(); + let the_array: UnsafeArray = self.clone(); // println!("num_reqs {:?}",num_reqs); self.inner .data .team .scheduler - .submit_immediate_task2(async move { + .submit_immediate_task(async move { let mut buffs = vec![Vec::with_capacity(num_per_batch * index_size.len()); num_pes]; let mut res_buffs = vec![Vec::with_capacity(num_per_batch); num_pes]; @@ -408,11 +408,12 @@ impl UnsafeArray { // let mut res_index = 0; for (ii, idx) in index_vec.iter().enumerate() { let j = ii + start_i; - let (pe, local_index) = match self.pe_and_offset_for_global_index(*idx) { + let (pe, local_index) = match the_array.pe_and_offset_for_global_index(*idx) + { Some((pe, local_index)) => (pe, local_index), None => panic!( "Index: {idx} out of bounds for array of len: {:?}", - self.inner.size + the_array.inner.size ), }; buffs[pe].extend_from_slice(index_size.as_bytes(&local_index)); @@ -432,14 +433,14 @@ impl UnsafeArray { index_size, ) .into_am::(ret); - let req = self + let req = the_array .inner .data .team .exec_arc_am_pe::( pe, am, - Some(self.inner.data.array_counters.clone()), + Some(the_array.inner.data.array_counters.clone()), ) .into_future(); reqs.push(Box::pin(async move { (req.await, new_res_buffer) })); @@ -457,14 +458,14 @@ impl UnsafeArray { index_size, ) .into_am::(ret); - let req = self + let req = the_array .inner .data .team .exec_arc_am_pe::( pe, am, - Some(self.inner.data.array_counters.clone()), + Some(the_array.inner.data.array_counters.clone()), ) .into_future(); reqs.push(Box::pin(async move { (req.await, res_buff) })); @@ -473,22 +474,23 @@ impl UnsafeArray { // println!("reqs len {:?}",reqs.len()); futures2.lock().extend(reqs); cnt2.fetch_add(1, Ordering::SeqCst); - self.inner + the_array + .inner .data .array_counters .outstanding_reqs .fetch_sub(1, Ordering::SeqCst); - self.inner.data.team.dec_counters(1); + the_array.inner.data.team.dec_counters(1); }); start_i += len; } + // We need this loop so that we ensure all the internal AMs have launched so calls like wait_all work properly + while cnt.load(Ordering::SeqCst) < num_reqs { + self.inner.data.team.scheduler.exec_task(); + } // println!("futures len {:?}",futures.lock().len()); Box::pin(async move { - while cnt.load(Ordering::SeqCst) < num_reqs { - // self.inner.data.team.scheduler.exec_task(); - async_std::task::yield_now().await; - } // println!("futures len {:?}",futures.lock().len()); futures::future::join_all(futures.lock().drain(..)).await }) @@ -523,6 +525,7 @@ impl UnsafeArray { let num_reqs = vals.len(); // println!("num_reqs {:?}",num_reqs); let mut start_i = 0; + let scheduler = self.inner.data.team.scheduler.clone(); for val in vals.drain(..) 
{ let cnt2 = cnt.clone(); let futures2 = futures.clone(); @@ -530,58 +533,54 @@ impl UnsafeArray { let len = val.len(); self.inner.data.array_counters.add_send_req(1); self.inner.data.team.inc_counters(1); - let val_chunks = val.as_vec_chunks(num_per_batch); - self.inner - .data - .team - .scheduler - .submit_immediate_task2(async move { - // let mut buffs = vec![Vec::with_capacity(num_per_batch); num_pes]; - // let val_slice = val.as_slice(); - let mut inner_start_i = start_i; - let mut reqs: Vec)> + Send>>> = - Vec::new(); - // val.as_vec_chunks(num_per_batch) - val_chunks.into_iter().for_each(|val| { - let val_len = val.len(); - let am = MultiValSingleIndex::new_with_vec( - byte_array2.clone(), - op, - local_index, - val, - ) - .into_am::(ret); - let req = self - .inner - .data - .team - .exec_arc_am_pe::( - pe, - am, - Some(self.inner.data.array_counters.clone()), - ) - .into_future(); - // println!("start_i: {:?} inner_start_i {:?} val_len: {:?}",start_i,inner_start_i,val_len); - let res_buffer = - (inner_start_i..inner_start_i + val_len).collect::>(); - reqs.push(Box::pin(async move { (req.await, res_buffer) })); - inner_start_i += val_len; - }); - // println!("reqs len {:?}",reqs.len()); - futures2.lock().extend(reqs); - cnt2.fetch_add(1, Ordering::SeqCst); - self.inner + let the_array: UnsafeArray = self.clone(); + let val_chunks = val.into_vec_chunks(num_per_batch); + scheduler.submit_immediate_task(async move { + let mut inner_start_i = start_i; + let mut reqs: Vec)> + Send>>> = + Vec::new(); + val_chunks.into_iter().for_each(|val| { + let val_len = val.len(); + let am = MultiValSingleIndex::new_with_vec( + byte_array2.clone(), + op, + local_index, + val, + ) + .into_am::(ret); + let req = the_array + .inner .data - .array_counters - .outstanding_reqs - .fetch_sub(1, Ordering::SeqCst); - self.inner.data.team.dec_counters(1); + .team + .exec_arc_am_pe::( + pe, + am, + Some(the_array.inner.data.array_counters.clone()), + ) + .into_future(); + // println!("start_i: {:?} inner_start_i {:?} val_len: {:?}",start_i,inner_start_i,val_len); + let res_buffer = + (inner_start_i..inner_start_i + val_len).collect::>(); + reqs.push(Box::pin(async move { (req.await, res_buffer) })); + inner_start_i += val_len; }); + // println!("reqs len {:?}",reqs.len()); + futures2.lock().extend(reqs); + cnt2.fetch_add(1, Ordering::SeqCst); + the_array + .inner + .data + .array_counters + .outstanding_reqs + .fetch_sub(1, Ordering::SeqCst); + the_array.inner.data.team.dec_counters(1); + }); start_i += len; } + + // We need this loop so that we ensure all the internal AMs have launched so calls like wait_all work properly while cnt.load(Ordering::SeqCst) < num_reqs { self.inner.data.team.scheduler.exec_task(); - // async_std::task::yield_now().await; } // println!("futures len {:?}",futures.lock().len()); Box::pin(async move { @@ -628,12 +627,13 @@ impl UnsafeArray { self.inner.data.team.inc_counters(1); let index_vec = index.to_vec(); let vals_vec = val.to_vec(); + let the_array: UnsafeArray = self.clone(); // println!("trying to submit immediate task"); self.inner .data .team .scheduler - .submit_immediate_task2(async move { + .submit_immediate_task(async move { // println!("in immediate task"); let mut buffs = vec![Vec::with_capacity(bytes_per_batch); num_pes]; let mut res_buffs = vec![Vec::with_capacity(num_per_batch); num_pes]; @@ -644,11 +644,12 @@ impl UnsafeArray { index_vec.into_iter().zip(vals_vec.into_iter()).enumerate() { let j = ii + start_i; - let (pe, local_index) = match 
self.pe_and_offset_for_global_index(idx) { + let (pe, local_index) = match the_array.pe_and_offset_for_global_index(idx) + { Some((pe, local_index)) => (pe, local_index), None => panic!( "Index: {idx} out of bounds for array of len: {:?}", - self.inner.size + the_array.inner.size ), }; match index_size { @@ -703,14 +704,14 @@ impl UnsafeArray { index_size, ) .into_am::(ret); - let req = self + let req = the_array .inner .data .team .exec_arc_am_pe::( pe, am, - Some(self.inner.data.array_counters.clone()), + Some(the_array.inner.data.array_counters.clone()), ) .into_future(); reqs.push(Box::pin(async move { (req.await, new_res_buffer) })); @@ -728,14 +729,14 @@ impl UnsafeArray { index_size, ) .into_am::(ret); - let req = self + let req = the_array .inner .data .team .exec_arc_am_pe::( pe, am, - Some(self.inner.data.array_counters.clone()), + Some(the_array.inner.data.array_counters.clone()), ) .into_future(); reqs.push(Box::pin(async move { (req.await, res_buff) })); @@ -743,18 +744,19 @@ impl UnsafeArray { } futures2.lock().extend(reqs); cnt2.fetch_add(1, Ordering::SeqCst); - self.inner + the_array + .inner .data .array_counters .outstanding_reqs .fetch_sub(1, Ordering::SeqCst); - self.inner.data.team.dec_counters(1); + the_array.inner.data.team.dec_counters(1); }); start_i += len; } + // We need this loop so that we ensure all the internal AMs have launched so calls like wait_all work properly while cnt.load(Ordering::SeqCst) < num_reqs { self.inner.data.team.scheduler.exec_task(); - // async_std::task::yield_now().await; } // println!("futures len {:?}", futures.lock().len()); Box::pin(async move { diff --git a/src/barrier.rs b/src/barrier.rs index 185ad304..1ee005fc 100644 --- a/src/barrier.rs +++ b/src/barrier.rs @@ -1,10 +1,7 @@ use crate::lamellae::{AllocationType, Lamellae, LamellaeRDMA}; use crate::lamellar_arch::LamellarArchRT; -use crate::scheduler::SchedulerQueue; -// use crate::lamellar_memregion::{SharedMemoryRegion,RegisteredMemoryRegion}; -use crate::memregion::MemoryRegion; //, RTMemoryRegionRDMA, RegisteredMemoryRegion}; +use crate::memregion::MemoryRegion; use crate::scheduler::Scheduler; -// use rand::prelude::SliceRandom; use std::sync::atomic::{AtomicU8, AtomicUsize, Ordering}; use std::sync::Arc; use std::time::Instant; @@ -17,7 +14,7 @@ pub(crate) struct Barrier { n: usize, // dissemination factor num_rounds: usize, pub(crate) arch: Arc, - pub(crate) _scheduler: Arc, + pub(crate) scheduler: Arc, lamellae: Arc, barrier_cnt: AtomicUsize, barrier_buf: Vec>, @@ -85,17 +82,17 @@ impl Barrier { }; let bar = Barrier { - my_pe: my_pe, - num_pes: num_pes, - n: n, - num_rounds: num_rounds, - arch: arch, - _scheduler: scheduler, - lamellae: lamellae, + my_pe, + num_pes, + n, + num_rounds, + arch, + scheduler, + lamellae, barrier_cnt: AtomicUsize::new(1), barrier_buf: buffs, - send_buf: send_buf, - panic: panic, + send_buf, + panic, }; // bar.print_bar(); bar @@ -274,7 +271,7 @@ impl Barrier { if std::thread::current().id() == *crate::MAIN_THREAD { self.barrier_internal(|| { // std::thread::yield_now(); - self._scheduler.exec_task(); + self.scheduler.exec_task(); }); } else { if let Ok(val) = std::env::var("LAMELLAR_BARRIER_WARNING") { @@ -293,7 +290,7 @@ impl Barrier { // we actually want to be able to process other tasks while the barrier is active pub(crate) fn tasking_barrier(&self) { self.barrier_internal(|| { - self._scheduler.exec_task(); + self.scheduler.exec_task(); }); } diff --git a/src/darc.rs b/src/darc.rs index 90a06b97..b2d57d70 100644 --- a/src/darc.rs +++ 
b/src/darc.rs @@ -64,7 +64,6 @@ use crate::barrier::Barrier; use crate::lamellae::{AllocationType, Backend, LamellaeComm, LamellaeRDMA}; use crate::lamellar_team::{IntoLamellarTeam, LamellarTeamRT}; use crate::lamellar_world::LAMELLAES; -// use crate::scheduler::SchedulerQueue; use crate::{IdError, LamellarEnv, LamellarTeam}; #[doc(hidden)] @@ -137,8 +136,8 @@ pub struct DarcInner { drop: Option, valid: AtomicBool, } -unsafe impl Send for DarcInner {} -unsafe impl Sync for DarcInner {} +unsafe impl Send for DarcInner {} //we cant create DarcInners without going through the Darc interface which enforces Sync+Send +unsafe impl Sync for DarcInner {} //we cant create DarcInners without going through the Darc interface which enforces Sync+Send /// Distributed atomic reference counter /// @@ -192,8 +191,8 @@ pub struct Darc { inner: *mut DarcInner, src_pe: usize, } -unsafe impl Send for Darc {} -unsafe impl Sync for Darc {} +unsafe impl Send for Darc {} +unsafe impl Sync for Darc {} impl LamellarEnv for Darc { fn my_pe(&self) -> usize { @@ -938,15 +937,11 @@ impl Darc { Ok(d) } - pub(crate) async fn block_on_outstanding(&self, state: DarcMode, extra_cnt: usize) { - DarcInner::block_on_outstanding( - WrappedInner { - inner: NonNull::new(self.inner as *mut DarcInner).expect("invalid darc pointer"), - }, - state, - extra_cnt, - ) - .await; + pub(crate) async fn block_on_outstanding(self, state: DarcMode, extra_cnt: usize) { + let wrapped = WrappedInner { + inner: NonNull::new(self.inner as *mut DarcInner).expect("invalid darc pointer"), + }; + DarcInner::block_on_outstanding(wrapped, state, extra_cnt).await; } #[doc(alias = "Collective")] @@ -982,9 +977,10 @@ impl Darc { inner.local_cnt.fetch_add(1, Ordering::SeqCst); //we add this here because to account for moving inner into d inner.total_local_cnt.fetch_add(1, Ordering::SeqCst); // println! 
{"[{:?}] darc[{:?}] into_localrw {:?} {:?} {:?}",std::thread::current().id(),self.inner().id,self.inner,self.inner().local_cnt.load(Ordering::SeqCst),self.inner().total_local_cnt.load(Ordering::SeqCst)}; - let item = unsafe { Box::from_raw(inner.item as *mut T) }; + let item = unsafe { *Box::from_raw(inner.item as *mut T) }; + let d = Darc { - inner: self.inner as *mut DarcInner>>>, + inner: self.inner as *mut DarcInner>>, src_pe: self.src_pe, }; d.inner_mut() diff --git a/src/darc/global_rw_darc.rs b/src/darc/global_rw_darc.rs index 2bda5a9b..cbb5cbaa 100644 --- a/src/darc/global_rw_darc.rs +++ b/src/darc/global_rw_darc.rs @@ -423,8 +423,8 @@ pub struct GlobalRwDarc { pub(crate) darc: Darc>, } -unsafe impl Send for GlobalRwDarc {} -unsafe impl Sync for GlobalRwDarc {} +unsafe impl Send for GlobalRwDarc {} //protected internally by rwlock +unsafe impl Sync for GlobalRwDarc {} //protected internally by rwlock impl LamellarEnv for GlobalRwDarc { fn my_pe(&self) -> usize { @@ -547,13 +547,15 @@ impl GlobalRwDarc { /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// - /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); - /// world.exec_am_all(DarcAm {counter: counter.clone()}); - /// let guard = world.block_on(counter.read()); - /// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); - /// drop(guard); //release the - /// world.wait_all(); // wait for my active message to return - /// world.barrier(); //at this point all updates will have been performed + /// world.clone().block_on(async move { + /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); + /// world.exec_am_all(DarcAm {counter: counter.clone()}); + /// let guard = counter.read().await; + /// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); + /// drop(guard); //release the + /// world.wait_all(); // wait for my active message to return + /// world.barrier(); //at this point all updates will have been performed + /// }); ///``` pub async fn read(&self) -> GlobalRwDarcReadGuard { // println!("async read"); @@ -619,13 +621,15 @@ impl GlobalRwDarc { /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// - /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); - /// world.exec_am_all(DarcAm {counter: counter.clone()}); - /// let mut guard = world.block_on(counter.write()); - /// *guard += my_pe; - /// drop(guard); //release the - /// world.wait_all(); // wait for my active message to return - /// world.barrier(); //at this point all updates will have been performed + /// world.clone().block_on(async move { + /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); + /// world.exec_am_all(DarcAm {counter: counter.clone()}); + /// let mut guard = counter.write().await; + /// *guard += my_pe; + /// drop(guard); //release the + /// world.wait_all(); // wait for my active message to return + /// world.barrier(); //at this point all updates will have been performed + /// }); ///``` pub async fn write(&self) -> GlobalRwDarcWriteGuard { // println!("async write"); @@ -688,13 +692,15 @@ impl GlobalRwDarc { /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// - /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); - /// world.exec_am_all(DarcAm {counter: counter.clone()}); - /// let mut guard = world.block_on(counter.collective_write()); - /// *guard += my_pe; - /// drop(guard); //release the lock - /// world.wait_all(); // wait for my active message to return - /// 
world.barrier(); //at this point all updates will have been performed + /// world.clone().block_on(async move { + /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); + /// world.exec_am_all(DarcAm {counter: counter.clone()}); + /// let mut guard = counter.collective_write().await; + /// *guard += my_pe; + /// drop(guard); //release the lock + /// world.wait_all(); // wait for my active message to return + /// world.barrier(); //at this point all updates will have been performed + /// }); ///``` pub async fn collective_write(&self) -> GlobalRwDarcCollectiveWriteGuard { // println!("async write"); @@ -723,182 +729,183 @@ impl GlobalRwDarc { } } - // #[doc(alias("One-sided", "onesided"))] - // /// Launches an active message to gather a global read lock associated with this GlobalRwDarc. - // /// - // /// The current THREAD will be blocked until the lock has been acquired. - // /// - // /// This function will not return while any writer currently has access to the lock, but there may be other readers - // /// - // /// Returns ared this specific instance of the read lock will only be held by the calling PE (until it is dropped) - // /// Other PEs may have separately aquired read locks as well. - // /// - // /// - // /// # Noten RAII guard which will drop the read access of the wrlock when dropped - // /// - // /// # One-sided Operation - // /// The calling PE is responsible for creating and transfering the active message which aquires the lock. - // /// Once aqui - // /// Do not use this function in an asynchronous context (i.e. a Lamellar Active message), instead use [GlobalRwDarc::async_read] - // /// - // /// # Examples - // ///``` - // /// use lamellar::darc::prelude::*; - // /// - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// - // /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); - // /// // do interesting work - // /// let guard = counter.read(); //blocks current thread until aquired - // /// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); - // ///``` - // pub fn read(&self) -> GlobalRwDarcReadGuard { - // // println!("read"); - // let inner = self.inner(); - // let team = inner.team(); - // let remote_rwlock_addr = team.lamellae.remote_addr( - // 0, - // inner as *const DarcInner> as *const () as usize, - // ); - // team.exec_am_pe_tg( - // 0, - // LockAm { - // rwlock_addr: remote_rwlock_addr, - // orig_pe: team.team_pe.expect("darcs cant exist on non team members"), - // lock_type: LockType::Read, - // }, - // Some(inner.am_counters()), - // ) - // .get(); - // GlobalRwDarcReadGuard { - // rwlock: self.darc.clone(), - // marker: PhantomData, - // local_cnt: Arc::new(AtomicUsize::new(1)), - // } - // } + #[doc(alias("One-sided", "onesided"))] + /// Launches an active message to gather a global read lock associated with this GlobalRwDarc. + /// + /// The current THREAD will be blocked until the lock has been acquired. + /// + /// This function will not return while any writer currently has access to the lock, but there may be other readers + /// + /// Returns ared this specific instance of the read lock will only be held by the calling PE (until it is dropped) + /// Other PEs may have separately aquired read locks as well. + /// + /// + /// # Noten RAII guard which will drop the read access of the wrlock when dropped + /// + /// # One-sided Operation + /// The calling PE is responsible for creating and transfering the active message which aquires the lock. 
+ /// Once aqui + /// Do not use this function in an asynchronous context (i.e. a Lamellar Active message), instead use [GlobalRwDarc::async_read] + /// + /// # Examples + ///``` + /// use lamellar::darc::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// + /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); + /// // do interesting work + /// let guard = counter.blocking_read(); //blocks current thread until aquired + /// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); + ///``` + pub fn blocking_read(&self) -> GlobalRwDarcReadGuard { + // println!("read"); - // #[doc(alias("One-sided", "onesided"))] - // /// Launches an active message to gather a global write lock associated with this GlobalRwDarc. - // /// - // /// The current THREAD will be blocked until the lock has been acquired. - // /// - // /// This function will not return while another writer or any readers currently have access to the lock - // /// - // /// Returns an RAII guard which will drop the write access of the wrlock when dropped - // /// - // /// # One-sided Operation - // /// The calling PE is responsible for creating and transfering the active message which aquires the lock. - // /// Once aquired the lock will only be held by the calling PE (until it is dropped) - // /// - // /// # Note - // /// Do not use this function in an asynchronous context (i.e. a Lamellar Active message), instead use [GlobalRwDarc::async_write] - // /// - // /// # Examples - // ///``` - // /// use lamellar::darc::prelude::*; - // /// - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// - // /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); - // /// // do interesting work - // /// let mut guard = counter.write(); //blocks current thread until aquired - // /// *guard += my_pe; - // ///``` - // pub fn write(&self) -> GlobalRwDarcWriteGuard { - // // println!("write"); - // let inner = self.inner(); - // let team = inner.team(); - // let remote_rwlock_addr = team.lamellae.remote_addr( - // 0, - // inner as *const DarcInner> as *const () as usize, - // ); - // team.exec_am_pe_tg( - // 0, - // LockAm { - // rwlock_addr: remote_rwlock_addr, - // orig_pe: team.team_pe.expect("darcs cant exist on non team members"), - // lock_type: LockType::Write, - // }, - // Some(inner.am_counters()), - // ) - // .get(); - // GlobalRwDarcWriteGuard { - // rwlock: self.darc.clone(), - // marker: PhantomData, - // } - // // inner.item().write(remote_rwlock_addr) - // } + let inner = self.inner(); + let team = inner.team(); + let remote_rwlock_addr = team.lamellae.remote_addr( + 0, + inner as *const DarcInner> as *const () as usize, + ); + team.exec_am_pe_tg( + 0, + LockAm { + rwlock_addr: remote_rwlock_addr, + orig_pe: team.team_pe.expect("darcs cant exist on non team members"), + lock_type: LockType::Read, + }, + Some(inner.am_counters()), + ) + .get(); + GlobalRwDarcReadGuard { + rwlock: self.darc.clone(), + marker: PhantomData, + local_cnt: Arc::new(AtomicUsize::new(1)), + } + } - // #[doc(alias("Collective"))] - // /// Launches an active message to gather the global collective write lock associated with this GlobalRwDarc. - // /// - // /// The current task will be blocked until the lock has been acquired. 
- // /// - // /// This function will not return while another writer or any readers currently have access to the lock - // /// - // /// Returns an RAII guard which will drop the write access of the wrlock when dropped - // /// - // /// # Collective Operation - // /// All PEs associated with this GlobalRwDarc must enter the lock call otherwise deadlock may occur. - // /// - // /// # Examples - // /// - // ///``` - // /// use lamellar::darc::prelude::*; - // /// use lamellar::active_messaging::*; - // /// - // /// #[lamellar::AmData(Clone)] - // /// struct DarcAm { - // /// counter: GlobalRwDarc, //each pe has a local atomicusize - // /// } - // /// - // /// #[lamellar::am] - // /// impl LamellarAm for DarcAm { - // /// async fn exec(self) { - // /// let mut counter = self.counter.async_write().await; // await until we get the write lock - // /// *counter += 1; // although we have the global lock, we are still only modifying the data local to this PE - // /// } - // /// } - // /// //------------- - // /// - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// - // /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); - // /// world.exec_am_all(DarcAm {counter: counter.clone()}); - // /// let mut guard = world.block_on(counter.collective_write()); - // /// *guard += my_pe; - // /// drop(guard); //release the lock - // /// world.wait_all(); // wait for my active message to return - // /// world.barrier(); //at this point all updates will have been performed - // ///``` - // pub fn collective_write(&self) -> GlobalRwDarcCollectiveWriteGuard { - // // println!("async write"); - // let inner = self.inner(); - // let team = inner.team(); - // let remote_rwlock_addr = team.lamellae.remote_addr( - // 0, - // inner as *const DarcInner> as *const () as usize, - // ); - // let collective_cnt = inner.item().collective_cnt.fetch_add(1, Ordering::SeqCst); - // team.exec_am_pe_tg( - // 0, - // LockAm { - // rwlock_addr: remote_rwlock_addr, - // orig_pe: team.team_pe.expect("darcs cant exist on non team members"), - // lock_type: LockType::CollectiveWrite(collective_cnt), - // }, - // Some(inner.am_counters()), - // ) - // .get(); - // GlobalRwDarcCollectiveWriteGuard { - // rwlock: self.darc.clone(), - // collective_cnt: collective_cnt, - // marker: PhantomData, - // } - // } + #[doc(alias("One-sided", "onesided"))] + /// Launches an active message to gather a global write lock associated with this GlobalRwDarc. + /// + /// The current THREAD will be blocked until the lock has been acquired. + /// + /// This function will not return while another writer or any readers currently have access to the lock + /// + /// Returns an RAII guard which will drop the write access of the wrlock when dropped + /// + /// # One-sided Operation + /// The calling PE is responsible for creating and transfering the active message which aquires the lock. + /// Once aquired the lock will only be held by the calling PE (until it is dropped) + /// + /// # Note + /// Do not use this function in an asynchronous context (i.e. 
a Lamellar Active message), instead use [GlobalRwDarc::async_write] + /// + /// # Examples + ///``` + /// use lamellar::darc::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// + /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); + /// // do interesting work + /// let mut guard = counter.blocking_write(); //blocks current thread until aquired + /// *guard += my_pe; + ///``` + pub fn blocking_write(&self) -> GlobalRwDarcWriteGuard { + // println!("write"); + let inner = self.inner(); + let team = inner.team(); + let remote_rwlock_addr = team.lamellae.remote_addr( + 0, + inner as *const DarcInner> as *const () as usize, + ); + team.exec_am_pe_tg( + 0, + LockAm { + rwlock_addr: remote_rwlock_addr, + orig_pe: team.team_pe.expect("darcs cant exist on non team members"), + lock_type: LockType::Write, + }, + Some(inner.am_counters()), + ) + .get(); + GlobalRwDarcWriteGuard { + rwlock: self.darc.clone(), + marker: PhantomData, + } + // inner.item().write(remote_rwlock_addr) + } + + #[doc(alias("Collective"))] + /// Launches an active message to gather the global collective write lock associated with this GlobalRwDarc. + /// + /// The current task will be blocked until the lock has been acquired. + /// + /// This function will not return while another writer or any readers currently have access to the lock + /// + /// Returns an RAII guard which will drop the write access of the wrlock when dropped + /// + /// # Collective Operation + /// All PEs associated with this GlobalRwDarc must enter the lock call otherwise deadlock may occur. + /// + /// # Examples + /// + ///``` + /// use lamellar::darc::prelude::*; + /// use lamellar::active_messaging::*; + /// + /// #[lamellar::AmData(Clone)] + /// struct DarcAm { + /// counter: GlobalRwDarc, //each pe has a local atomicusize + /// } + /// + /// #[lamellar::am] + /// impl LamellarAm for DarcAm { + /// async fn exec(self) { + /// let mut counter = self.counter.async_write().await; // await until we get the write lock + /// *counter += 1; // although we have the global lock, we are still only modifying the data local to this PE + /// } + /// } + /// //------------- + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// + /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); + /// world.exec_am_all(DarcAm {counter: counter.clone()}); + /// let mut guard = counter.blocking_collective_write(); + /// *guard += my_pe; + /// drop(guard); //release the lock + /// world.wait_all(); // wait for my active message to return + /// world.barrier(); //at this point all updates will have been performed + ///``` + pub fn blocking_collective_write(&self) -> GlobalRwDarcCollectiveWriteGuard { + // println!("async write"); + let inner = self.inner(); + let team = inner.team(); + let remote_rwlock_addr = team.lamellae.remote_addr( + 0, + inner as *const DarcInner> as *const () as usize, + ); + let collective_cnt = inner.item().collective_cnt.fetch_add(1, Ordering::SeqCst); + team.exec_am_pe_tg( + 0, + LockAm { + rwlock_addr: remote_rwlock_addr, + orig_pe: team.team_pe.expect("darcs cant exist on non team members"), + lock_type: LockType::CollectiveWrite(collective_cnt), + }, + Some(inner.am_counters()), + ) + .get(); + GlobalRwDarcCollectiveWriteGuard { + rwlock: self.darc.clone(), + collective_cnt: collective_cnt, + marker: PhantomData, + } + } } impl GlobalRwDarc { @@ -1025,14 +1032,12 @@ impl GlobalRwDarc { inner.local_cnt.fetch_add(1, Ordering::SeqCst); //we 
add this here because to account for moving inner into d let item = unsafe { Box::from_raw(inner.item as *mut DistRwLock).into_inner() }; let d = Darc { - inner: self.darc.inner as *mut DarcInner>>>, + inner: self.darc.inner as *mut DarcInner>>, src_pe: self.darc.src_pe, // phantom: PhantomData, }; d.inner_mut() - .update_item(Box::into_raw(Box::new(Arc::new(RwLock::new(Box::new( - item, - )))))); + .update_item(Box::into_raw(Box::new(Arc::new(RwLock::new(item))))); LocalRwDarc { darc: d } } } diff --git a/src/darc/local_rw_darc.rs b/src/darc/local_rw_darc.rs index 26557efb..f6b4c9e3 100644 --- a/src/darc/local_rw_darc.rs +++ b/src/darc/local_rw_darc.rs @@ -14,7 +14,6 @@ use crate::darc::global_rw_darc::{DistRwLock, GlobalRwDarc}; use crate::darc::{Darc, DarcInner, DarcMode, WrappedInner, __NetworkDarc}; use crate::lamellae::LamellaeRDMA; use crate::lamellar_team::IntoLamellarTeam; -use crate::scheduler::SchedulerQueue; use crate::{IdError, LamellarEnv, LamellarTeam}; /// A local read-write `Darc` @@ -34,11 +33,11 @@ pub struct LocalRwDarc { serialize_with = "localrw_serialize2", deserialize_with = "localrw_from_ndarc2" )] - pub(crate) darc: Darc>>>, //we need to wrap WrLock in an Arc so we get access to ArcReadGuard and ArcWriteGuard + pub(crate) darc: Darc>>, //we need to wrap WrLock in an Arc so we get access to ArcReadGuard and ArcWriteGuard } -unsafe impl Send for LocalRwDarc {} -unsafe impl Sync for LocalRwDarc {} +unsafe impl Send for LocalRwDarc {} //we are protecting internally with an WrLock +unsafe impl Sync for LocalRwDarc {} //we are protecting internally with an WrLock impl LamellarEnv for LocalRwDarc { fn my_pe(&self) -> usize { @@ -84,7 +83,7 @@ impl crate::active_messaging::DarcSerde for LocalRwDarc { } impl LocalRwDarc { - fn inner(&self) -> &DarcInner>>> { + fn inner(&self) -> &DarcInner>> { self.darc.inner() } @@ -123,67 +122,10 @@ impl LocalRwDarc { self.inner() ); } +} - // #[doc(alias("One-sided", "onesided"))] - // /// Aquires a reader lock of this LocalRwDarc local to this PE. - // /// - // /// The current THREAD will be blocked until the lock has been acquired. 
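After this conversion the item is re-wrapped as a plain `Arc<RwLock<T>>`, so the `Box` indirection is gone on both the `GlobalRwDarc` and `LocalRwDarc` sides and the returned guards deref directly to the value. A minimal sketch of the round trip (the value `5` is only illustrative; `into_localrw` is a collective call, so every PE in the team must make it):

```
use lamellar::darc::prelude::*;

let world = LamellarWorldBuilder::new().build();
let global = GlobalRwDarc::new(&world, 5usize).expect("PE in world team");
// Collective call: all PEs in the team must reach this point.
let local = global.into_localrw();       // item is now held as Arc<RwLock<usize>>, no Box
let guard = local.blocking_read();       // guard derefs directly to the value
assert_eq!(*guard, 5);
```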
- // /// - // /// This function will not return while any writer currentl has access to the lock - // /// - // /// Returns an RAII guard which will drop the read access of the wrlock when dropped - // /// - // /// # One-sided Operation - // /// The calling PE is only aware of its own local lock and does not require coordination with other PEs - // /// - // /// # Note - // /// the aquired lock is only with respect to this PE, the locks on the other PEs will be in their own states - // /// - // /// # Examples - // /// - // ///``` - // /// use lamellar::darc::prelude::*; - // /// use lamellar::active_messaging::prelude::*; - // /// #[lamellar::AmData(Clone)] - // /// struct DarcAm { - // /// counter: LocalRwDarc, //each pe has a local atomicusize - // /// } - // /// - // /// #[lamellar::am] - // /// impl LamellarAm for DarcAm { - // /// async fn exec(self) { - // /// let counter = self.counter.read(); //block until we get the write lock - // /// println!("the current counter value on pe {} = {}",lamellar::current_pe,counter); - // /// } - // /// } - // /// //------------- - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let counter = LocalRwDarc::new(&world, 0).unwrap(); - // /// world.exec_am_all(DarcAm {counter: counter.clone()}); - // /// let guard = counter.read(); - // /// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); - // ///``` - // pub fn read(&self) -> RwLockReadGuardArc> { - // // println!("trying to get read lock"); - // match self.darc.try_read_arc() { - // Some(guard) => { - // // println!("got read lock"); - // guard - // } - // None => { - // // println!("did not get read lock"); - // let _lock_fut = self.darc.read_arc(); - // self.darc.team().scheduler.block_on(async move { - // // println!("async trying to get read lock"); - // _lock_fut.await - // }) - // } - // } - // } - +impl LocalRwDarc { #[doc(alias("One-sided", "onesided"))] - /// TODO: UPDATE /// Aquires a reader lock of this LocalRwDarc local to this PE. /// /// The current THREAD will be blocked until the lock has been acquired. @@ -211,7 +153,7 @@ impl LocalRwDarc { /// #[lamellar::am] /// impl LamellarAm for DarcAm { /// async fn exec(self) { - /// let counter = self.counter.read().await; //block until we get the write lock + /// let counter = self.counter.read(); //block until we get the write lock /// println!("the current counter value on pe {} = {}",lamellar::current_pe,counter); /// } /// } @@ -220,76 +162,67 @@ impl LocalRwDarc { /// let my_pe = world.my_pe(); /// let counter = LocalRwDarc::new(&world, 0).unwrap(); /// world.exec_am_all(DarcAm {counter: counter.clone()}); - /// let guard = world.block_on(counter.read()); + /// let guard = counter.blocking_read(); /// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); ///``` - pub async fn read(&self) -> RwLockReadGuardArc> { + pub fn blocking_read(&self) -> RwLockReadGuardArc { + let self_clone: LocalRwDarc = self.clone(); + self.darc + .team() + .block_on(async move { self_clone.darc.read_arc().await }) + } + + #[doc(alias("One-sided", "onesided"))] + /// TODO: UPDATE + /// Aquires a reader lock of this LocalRwDarc local to this PE. + /// + /// The current THREAD will be blocked until the lock has been acquired. 
+ /// + /// This function will not return while any writer currentl has access to the lock + /// + /// Returns an RAII guard which will drop the read access of the wrlock when dropped + /// + /// # One-sided Operation + /// The calling PE is only aware of its own local lock and does not require coordination with other PEs + /// + /// # Note + /// the aquired lock is only with respect to this PE, the locks on the other PEs will be in their own states + /// + /// # Examples + /// + ///``` + /// use lamellar::darc::prelude::*; + /// use lamellar::active_messaging::prelude::*; + /// #[lamellar::AmData(Clone)] + /// struct DarcAm { + /// counter: LocalRwDarc, //each pe has a local atomicusize + /// } + /// + /// #[lamellar::am] + /// impl LamellarAm for DarcAm { + /// async fn exec(self) { + /// let counter = self.counter.read().await; //block until we get the write lock + /// println!("the current counter value on pe {} = {}",lamellar::current_pe,counter); + /// } + /// } + /// //------------- + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// world.clone().block_on(async move { + /// let counter = LocalRwDarc::new(&world, 0).unwrap(); + /// world.exec_am_all(DarcAm {counter: counter.clone()}); + /// let guard = counter.read().await; + /// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); + /// }); + ///``` + pub async fn read(&self) -> RwLockReadGuardArc { // println!("async trying to get read lock"); let lock = self.darc.read_arc().await; // println!("got async read lock"); lock } - // #[doc(alias("One-sided", "onesided"))] - // /// Aquires the writer lock of this LocalRwDarc local to this PE. - // /// - // /// The current THREAD will be blocked until the lock has been acquired. - // /// - // /// This function will not return while another writer or any readers currently have access to the lock - // /// - // /// Returns an RAII guard which will drop the write access of the wrlock when dropped - // /// - // /// # One-sided Operation - // /// The calling PE is only aware of its own local lock and does not require coordination with other PEs - // /// - // /// # Note - // /// the aquired lock is only with respect to this PE, the locks on the other PEs will be in their own states - // /// - // /// # Examples - // /// - // ///``` - // /// use lamellar::darc::prelude::*; - // /// use lamellar::active_messaging::prelude::*; - // /// #[lamellar::AmData(Clone)] - // /// struct DarcAm { - // /// counter: LocalRwDarc, //each pe has a local atomicusize - // /// } - // /// - // /// #[lamellar::am] - // /// impl LamellarAm for DarcAm { - // /// async fn exec(self) { - // /// let mut counter = self.counter.write(); //block until we get the write lock - // /// **counter += 1; - // /// } - // /// } - // /// //------------- - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let counter = LocalRwDarc::new(&world, 0).unwrap(); - // /// world.exec_am_all(DarcAm {counter: counter.clone()}); - // /// let mut guard = counter.write(); - // /// **guard += my_pe; - // ///``` - // pub fn write(&self) -> RwLockWriteGuardArc> { - // // println!("trying to get write lock"); - // match self.darc.try_write_arc() { - // Some(guard) => { - // // println!("got write lock"); - // guard - // } - // None => { - // // println!("did not get write lock"); - // let lock_fut = self.darc.write_arc(); - // self.darc.team().scheduler.block_on(async move { - // // println!("async trying to get write lock"); - // 
lock_fut.await - // }) - // } - // } - // } - #[doc(alias("One-sided", "onesided"))] - /// TODO: UPDATE /// Aquires the writer lock of this LocalRwDarc local to this PE. /// /// The current THREAD will be blocked until the lock has been acquired. @@ -317,7 +250,7 @@ impl LocalRwDarc { /// #[lamellar::am] /// impl LamellarAm for DarcAm { /// async fn exec(self) { - /// let mut counter = self.counter.write().await; //block until we get the write lock + /// let mut counter = self.counter.write(); //block until we get the write lock /// **counter += 1; /// } /// } @@ -326,10 +259,61 @@ impl LocalRwDarc { /// let my_pe = world.my_pe(); /// let counter = LocalRwDarc::new(&world, 0).unwrap(); /// world.exec_am_all(DarcAm {counter: counter.clone()}); - /// let mut guard = world.block_on(counter.write()); + /// let mut guard = counter.blocking_write(); /// **guard += my_pe; ///``` - pub async fn write(&self) -> RwLockWriteGuardArc> { + pub fn blocking_write(&self) -> RwLockWriteGuardArc { + // println!("trying to get write lock"); + let self_clone: LocalRwDarc = self.clone(); + self.darc + .team() + .block_on(async move { self_clone.darc.write_arc().await }) + } + + #[doc(alias("One-sided", "onesided"))] + /// + /// Aquires the writer lock of this LocalRwDarc local to this PE. + /// + /// The current THREAD will be blocked until the lock has been acquired. + /// + /// This function will not return while another writer or any readers currently have access to the lock + /// + /// Returns an RAII guard which will drop the write access of the wrlock when dropped + /// + /// # One-sided Operation + /// The calling PE is only aware of its own local lock and does not require coordination with other PEs + /// + /// # Note + /// the aquired lock is only with respect to this PE, the locks on the other PEs will be in their own states + /// + /// # Examples + /// + ///``` + /// use lamellar::darc::prelude::*; + /// use lamellar::active_messaging::prelude::*; + /// #[lamellar::AmData(Clone)] + /// struct DarcAm { + /// counter: LocalRwDarc, //each pe has a local atomicusize + /// } + /// + /// #[lamellar::am] + /// impl LamellarAm for DarcAm { + /// async fn exec(self) { + /// let mut counter = self.counter.write().await; //block until we get the write lock + /// **counter += 1; + /// } + /// } + /// //------------- + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// world.clone()block_on(async move{ + /// let counter = LocalRwDarc::new(&world, 0).unwrap(); + /// world.exec_am_all(DarcAm {counter: counter.clone()}); + /// let mut guard = counter.write(); + /// **guard += my_pe; + /// }) + ///``` + pub async fn write(&self) -> RwLockWriteGuardArc { // println!("async trying to get write lock"); let lock = self.darc.write_arc().await; // println!("got async write lock"); @@ -359,11 +343,7 @@ impl LocalRwDarc { /// ``` pub fn new>(team: U, item: T) -> Result, IdError> { Ok(LocalRwDarc { - darc: Darc::try_new( - team, - Arc::new(RwLock::new(Box::new(item))), - DarcMode::LocalRw, - )?, + darc: Darc::try_new(team, Arc::new(RwLock::new(item)), DarcMode::LocalRw)?, }) } @@ -378,12 +358,12 @@ impl LocalRwDarc { // } #[doc(alias = "Collective")] - /// Converts this LocalRwDarc into a regular [Darc] + /// Converts this LocalRwDarc into a [GlobalRwDarc] /// /// This is a blocking collective call amongst all PEs in the LocalRwDarc's team, only returning once every PE in the team has completed the call. 
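With these additions each `LocalRwDarc` lock comes in two flavors: a `blocking_*` variant meant for the main thread and an async variant to be awaited inside active messages or a `block_on` scope. A small usage sketch under those assumptions (example values only):

```
use lamellar::darc::prelude::*;

let world = LamellarWorldBuilder::new().build();
let counter = LocalRwDarc::new(&world, 0usize).expect("PE in world team");

// Main thread: park until the local write lock is held.
{
    let mut guard = counter.blocking_write();
    *guard += 1; // the guard derefs directly to the item now that the Box is gone
} // lock released when the guard drops

// Async context: await the lock instead of blocking a worker thread.
world.block_on(async move {
    let guard = counter.read().await;
    println!("counter = {}", *guard);
});
```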
/// /// Furthermore, this call will block while any additional references outside of the one making this call exist on each PE. It is not possible for the - /// pointed to object to wrapped by both a Darc and a LocalRwDarc simultaneously (on any PE). + /// pointed to object to wrapped by both a GlobalRwDarc and a LocalRwDarc simultaneously (on any PE). /// /// # Collective Operation /// Requires all PEs associated with the `darc` to enter the call otherwise deadlock will occur (i.e. team barriers are being called internally) @@ -395,9 +375,9 @@ impl LocalRwDarc { /// let world = LamellarWorldBuilder::new().build(); /// /// let five = LocalRwDarc::new(&world,5).expect("PE in world team"); - /// let five_as_darc = five.into_darc(); + /// let five_as_globaldarc = five.into_globalrw(); /// ``` - pub fn into_darc(self) -> Darc { + pub fn into_globalrw(self) -> GlobalRwDarc { let inner = self.inner(); // println!("into_darc"); // self.print(); @@ -406,37 +386,40 @@ impl LocalRwDarc { inner: NonNull::new(self.darc.inner as *mut DarcInner) .expect("invalid darc pointer"), }, - DarcMode::Darc, + DarcMode::GlobalRw, 0, )); // println!("after block on outstanding"); inner.local_cnt.fetch_add(1, Ordering::SeqCst); //we add this here because to account for moving inner into d - // let item = unsafe { Box::from_raw(inner.item as *mut Arc>).into_inner() }; - let mut arc_item = - unsafe { (*Box::from_raw(inner.item as *mut Arc>>)).clone() }; - - let item: Box = loop { + let mut arc_item = unsafe { (*Box::from_raw(inner.item as *mut Arc>)).clone() }; + let item: T = loop { arc_item = match Arc::try_unwrap(arc_item) { Ok(item) => break item.into_inner(), Err(arc_item) => arc_item, }; }; let d = Darc { - inner: self.darc.inner as *mut DarcInner, + inner: self.darc.inner as *mut DarcInner>, src_pe: self.darc.src_pe, // phantom: PhantomData, }; - d.inner_mut().update_item(Box::into_raw(item)); - d + d.inner_mut() + .update_item(Box::into_raw(Box::new(DistRwLock::new( + item, + self.inner().team(), + )))); + GlobalRwDarc { darc: d } } +} +impl LocalRwDarc { #[doc(alias = "Collective")] - /// Converts this LocalRwDarc into a [GlobalRwDarc] + /// Converts this LocalRwDarc into a regular [Darc] /// /// This is a blocking collective call amongst all PEs in the LocalRwDarc's team, only returning once every PE in the team has completed the call. /// /// Furthermore, this call will block while any additional references outside of the one making this call exist on each PE. It is not possible for the - /// pointed to object to wrapped by both a GlobalRwDarc and a LocalRwDarc simultaneously (on any PE). + /// pointed to object to wrapped by both a Darc and a LocalRwDarc simultaneously (on any PE). /// /// # Collective Operation /// Requires all PEs associated with the `darc` to enter the call otherwise deadlock will occur (i.e. 
team barriers are being called internally) @@ -448,9 +431,9 @@ impl LocalRwDarc { /// let world = LamellarWorldBuilder::new().build(); /// /// let five = LocalRwDarc::new(&world,5).expect("PE in world team"); - /// let five_as_globaldarc = five.into_globalrw(); + /// let five_as_darc = five.into_darc(); /// ``` - pub fn into_globalrw(self) -> GlobalRwDarc { + pub fn into_darc(self) -> Darc { let inner = self.inner(); // println!("into_darc"); // self.print(); @@ -459,30 +442,27 @@ impl LocalRwDarc { inner: NonNull::new(self.darc.inner as *mut DarcInner) .expect("invalid darc pointer"), }, - DarcMode::GlobalRw, + DarcMode::Darc, 0, )); // println!("after block on outstanding"); inner.local_cnt.fetch_add(1, Ordering::SeqCst); //we add this here because to account for moving inner into d - let mut arc_item = - unsafe { (*Box::from_raw(inner.item as *mut Arc>>)).clone() }; - let item: Box = loop { + // let item = unsafe { Box::from_raw(inner.item as *mut Arc>).into_inner() }; + let mut arc_item = unsafe { (*Box::from_raw(inner.item as *mut Arc>)).clone() }; + + let item: T = loop { arc_item = match Arc::try_unwrap(arc_item) { Ok(item) => break item.into_inner(), Err(arc_item) => arc_item, }; }; let d = Darc { - inner: self.darc.inner as *mut DarcInner>, + inner: self.darc.inner as *mut DarcInner, src_pe: self.darc.src_pe, // phantom: PhantomData, }; - d.inner_mut() - .update_item(Box::into_raw(Box::new(DistRwLock::new( - *item, - self.inner().team(), - )))); - GlobalRwDarc { darc: d } + d.inner_mut().update_item(Box::into_raw(Box::new(item))); //the darc will free this approriately + d } } @@ -495,9 +475,17 @@ impl Clone for LocalRwDarc { } } -impl fmt::Display for LocalRwDarc { +impl fmt::Display for LocalRwDarc { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - fmt::Display::fmt(&**self.darc.team().scheduler.block_on(self.read()), f) + let lock: LocalRwDarc = self.clone(); + fmt::Display::fmt( + &self + .darc + .team() + .scheduler + .block_on(async move { lock.read().await }), + f, + ) } } @@ -525,10 +513,7 @@ impl fmt::Display for LocalRwDarc { // } #[doc(hidden)] -pub fn localrw_serialize2( - localrw: &Darc>>>, - s: S, -) -> Result +pub fn localrw_serialize2(localrw: &Darc>>, s: S) -> Result where S: Serializer, { @@ -539,9 +524,7 @@ where } #[doc(hidden)] -pub fn localrw_from_ndarc2<'de, D, T>( - deserializer: D, -) -> Result>>>, D::Error> +pub fn localrw_from_ndarc2<'de, D, T>(deserializer: D) -> Result>>, D::Error> where D: Deserializer<'de>, { @@ -555,8 +538,8 @@ where Ok(Darc::from(ndarc)) } -// impl From>>>> for __NetworkDarc { -// fn from(darc: Darc>>>) -> Self { +// impl From>>> for __NetworkDarc { +// fn from(darc: Darc>>) -> Self { // // println!("rwdarc to net darc"); // // darc.print(); // let team = &darc.inner().team(); @@ -570,8 +553,8 @@ where // } // } -// impl From<&Darc>>>> for __NetworkDarc { -// fn from(darc: &Darc>>>) -> Self { +// impl From<&Darc>>> for __NetworkDarc { +// fn from(darc: &Darc>>) -> Self { // // println!("rwdarc to net darc"); // // darc.print(); // let team = &darc.inner().team(); @@ -585,14 +568,14 @@ where // } // } -// impl From<__NetworkDarc> for Darc>>> { +// impl From<__NetworkDarc> for Darc>> { // fn from(ndarc: __NetworkDarc) -> Self { // // println!("rwdarc from net darc"); // if let Some(lamellae) = LAMELLAES.read().get(&ndarc.backend) { // let darc = Darc { // inner: lamellae.local_addr(ndarc.orig_world_pe, ndarc.inner_addr) -// as *mut DarcInner>>>, +// as *mut DarcInner>>, // src_pe: ndarc.orig_team_pe, // // phantom: 
PhantomData, // }; diff --git a/src/lamellae/command_queues.rs b/src/lamellae/command_queues.rs index 2cce1f3b..2bd57509 100644 --- a/src/lamellae/command_queues.rs +++ b/src/lamellae/command_queues.rs @@ -2,7 +2,7 @@ use crate::lamellae::comm::*; use crate::lamellae::{ Des, Lamellae, LamellaeComm, LamellaeRDMA, SerializedData, SerializedDataOps, }; -use crate::scheduler::{Scheduler, SchedulerQueue}; +use crate::scheduler::Scheduler; use parking_lot::Mutex; @@ -1427,7 +1427,8 @@ impl CommandQueue { // "[{:?}] recv_data submitting work", // std::thread::current().id(), // ); - scheduler2.submit_work(work_data, lamellae.clone()); + scheduler2 + .submit_remote_am(work_data, lamellae.clone()); if cmd_cnt_clone.fetch_sub(1, Ordering::SeqCst) == 1 { cq.send_free(src, cmd_buf_cmd); diff --git a/src/lamellae/rofi_lamellae.rs b/src/lamellae/rofi_lamellae.rs index ca76dc34..37bbcb2f 100644 --- a/src/lamellae/rofi_lamellae.rs +++ b/src/lamellae/rofi_lamellae.rs @@ -6,7 +6,7 @@ use crate::lamellae::{ LamellaeRDMA, Ser, SerializeHeader, SerializedData, SerializedDataOps, SERIALIZE_HEADER_LEN, }; use crate::lamellar_arch::LamellarArchRT; -use crate::scheduler::{Scheduler, SchedulerQueue}; +use crate::scheduler::Scheduler; use std::sync::atomic::{AtomicU8, Ordering}; use std::sync::Arc; diff --git a/src/lamellae/shmem_lamellae.rs b/src/lamellae/shmem_lamellae.rs index 49e50716..b4008bcf 100644 --- a/src/lamellae/shmem_lamellae.rs +++ b/src/lamellae/shmem_lamellae.rs @@ -7,7 +7,7 @@ use crate::lamellae::{ LamellaeRDMA, Ser, SerializeHeader, SerializedData, SerializedDataOps, SERIALIZE_HEADER_LEN, }; use crate::lamellar_arch::LamellarArchRT; -use crate::scheduler::{Scheduler, SchedulerQueue}; +use crate::scheduler::Scheduler; use std::sync::atomic::{AtomicU8, Ordering}; use std::sync::Arc; diff --git a/src/lamellar_request.rs b/src/lamellar_request.rs index 58cee47a..81a6b317 100755 --- a/src/lamellar_request.rs +++ b/src/lamellar_request.rs @@ -3,7 +3,7 @@ use crate::darc::Darc; use crate::lamellae::{Des, SerializedData}; use crate::lamellar_arch::LamellarArchRT; use crate::memregion::one_sided::MemRegionHandleInner; -use crate::scheduler::{Scheduler, SchedulerQueue}; +use crate::scheduler::Scheduler; use async_trait::async_trait; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use std::sync::Arc; diff --git a/src/lamellar_task_group.rs b/src/lamellar_task_group.rs index 37c64972..26e45ec0 100644 --- a/src/lamellar_task_group.rs +++ b/src/lamellar_task_group.rs @@ -4,7 +4,7 @@ use crate::lamellar_arch::LamellarArchRT; use crate::lamellar_request::*; use crate::lamellar_team::{IntoLamellarTeam, LamellarTeam, LamellarTeamRT}; use crate::memregion::one_sided::MemRegionHandleInner; -use crate::scheduler::{ReqId, Scheduler, SchedulerQueue}; +use crate::scheduler::{ReqId, Scheduler}; use crate::Darc; use crate::active_messaging::registered_active_message::{AmId, AMS_EXECS, AMS_IDS, AM_ID_START}; @@ -463,10 +463,7 @@ impl ActiveMessaging for LamellarTaskGroup { self.exec_am_local_inner(am).into_future() } - fn block_on(&self, f: F) -> F::Output - where - F: Future, - { + fn block_on(&self, f: F) -> F::Output { tracing::trace_span!("block_on").in_scope(|| self.team.scheduler.block_on(f)) } } diff --git a/src/lamellar_team.rs b/src/lamellar_team.rs index b819ac02..d238e5de 100644 --- a/src/lamellar_team.rs +++ b/src/lamellar_team.rs @@ -9,7 +9,7 @@ use crate::memregion::{ one_sided::OneSidedMemoryRegion, shared::SharedMemoryRegion, Dist, LamellarMemoryRegion, MemoryRegion, RemoteMemoryRegion, }; -use 
crate::scheduler::{ReqId, Scheduler, SchedulerQueue}; +use crate::scheduler::{ReqId, Scheduler}; #[cfg(feature = "nightly")] use crate::utils::ser_closure; @@ -485,10 +485,7 @@ impl ActiveMessaging for Arc { self.team.barrier(); } - fn block_on(&self, f: F) -> F::Output - where - F: Future, - { + fn block_on(&self, f: F) -> F::Output { assert!(self.panic.load(Ordering::SeqCst) == 0); trace_span!("block_on").in_scope(|| self.team.scheduler.block_on(f)) @@ -923,7 +920,7 @@ impl LamellarTeamRT { // what does it mean if we drop a parent team while a sub_team is valid? if let None = &self.parent { // println!("shutdown lamellae, going to shutdown scheduler"); - self.scheduler.shutdown_threads(); + self.scheduler.begin_shutdown(); self.put_dropped(); self.drop_barrier(); self.lamellae.shutdown(); @@ -1324,7 +1321,8 @@ impl LamellarTeamRT { pub(crate) fn block_on(&self, f: F) -> F::Output where - F: Future, + F: Future + Send + 'static, + F::Output: Send, { assert!(self.panic.load(Ordering::SeqCst) == 0); diff --git a/src/lamellar_world.rs b/src/lamellar_world.rs index f8116bf0..f3d7726d 100644 --- a/src/lamellar_world.rs +++ b/src/lamellar_world.rs @@ -6,7 +6,7 @@ use crate::lamellar_team::{LamellarTeam, LamellarTeamRT}; use crate::memregion::{ one_sided::OneSidedMemoryRegion, shared::SharedMemoryRegion, Dist, RemoteMemoryRegion, }; -use crate::scheduler::{create_scheduler, SchedulerQueue, SchedulerType}; +use crate::scheduler::{create_scheduler, ExecutorType}; // use log::trace; use tracing::*; @@ -75,10 +75,7 @@ impl ActiveMessaging for LamellarWorld { self.team.barrier(); } - fn block_on(&self, f: F) -> F::Output - where - F: Future, - { + fn block_on(&self, f: F) -> F::Output { trace_span!("block_on").in_scope(|| self.team_rt.scheduler.block_on(f)) } } @@ -325,7 +322,7 @@ impl Drop for LamellarWorld { /// # Examples /// ///``` -/// use lamellar::{LamellarWorldBuilder,Backend,SchedulerType}; +/// use lamellar::{LamellarWorldBuilder,Backend,ExecutorType}; /// // can also use and of the module preludes /// // use lamellar::active_messaging::prelude::*; /// // use lamellar::array::prelude::*; @@ -334,14 +331,14 @@ impl Drop for LamellarWorld { /// /// let world = LamellarWorldBuilder::new() /// .with_lamellae(Backend::Local) -/// .with_scheduler(SchedulerType::WorkStealing) +/// .with_executor(ExecutorType::LamellarWorkStealing) /// .build(); ///``` #[derive(Debug)] pub struct LamellarWorldBuilder { primary_lamellae: Backend, // secondary_lamellae: HashSet, - scheduler: SchedulerType, + executor: ExecutorType, num_threads: usize, } @@ -357,7 +354,7 @@ impl LamellarWorldBuilder { /// # Examples /// ///``` - /// use lamellar::{LamellarWorldBuilder,Backend,SchedulerType}; + /// use lamellar::{LamellarWorldBuilder,Backend,ExecutorType}; /// // can also use and of the module preludes /// // use lamellar::active_messaging::prelude::*; /// // use lamellar::array::prelude::*; @@ -366,30 +363,35 @@ impl LamellarWorldBuilder { /// /// let world = LamellarWorldBuilder::new() /// .with_lamellae(Backend::Local) - /// .with_scheduler(SchedulerType::WorkStealing) + /// .with_executor(ExecutorType::LamellarWorkStealing) /// .build(); ///``` #[tracing::instrument(skip_all)] pub fn new() -> LamellarWorldBuilder { // simple_logger::init().unwrap(); // trace!("New world builder"); - let scheduler = match std::env::var("LAMELLAR_SCHEDULER") { + let mut executor = match std::env::var("LAMELLAR_EXECUTOR") { Ok(val) => { - let scheduler = val.parse::().unwrap(); - if scheduler == 0 { - SchedulerType::WorkStealing + let 
executor = val.parse::().unwrap(); + if executor == 0 { + ExecutorType::LamellarWorkStealing } // else if scheduler == 1 { - // SchedulerType::NumaWorkStealing + // ExecutorType::NumaWorkStealing // } else if scheduler == 2 { - // SchedulerType::NumaWorkStealing2 + // ExecutorType::NumaWorkStealing2 // } else { - SchedulerType::WorkStealing + ExecutorType::LamellarWorkStealing } } - Err(_) => SchedulerType::WorkStealing, + Err(_) => ExecutorType::LamellarWorkStealing, }; + #[cfg(feature = "tokio-executor")] + { + executor = ExecutorType::Tokio; + } + let num_threads = match std::env::var("LAMELLAR_THREADS") { Ok(n) => { if let Ok(num_threads) = n.parse::() { @@ -409,7 +411,7 @@ impl LamellarWorldBuilder { LamellarWorldBuilder { primary_lamellae: Default::default(), // secondary_lamellae: HashSet::new(), - scheduler: scheduler, + executor: executor, num_threads: num_threads, } } @@ -442,24 +444,24 @@ impl LamellarWorldBuilder { // } #[doc(alias = "Collective")] - /// Specify the scheduler to use for this execution + /// Specify the executor to use for this execution /// /// # Collective Operation - /// While simply calling `with_scheduler` is not collective by itself (i.e. there is no internal barrier that would deadlock, + /// While simply calling `with_executor` is not collective by itself (i.e. there is no internal barrier that would deadlock, /// as the remote fabric is not initiated until after a call to `build`), it is necessary that the same /// parameters are used by all PEs that will exist in the world. /// /// # Examples /// ///``` - /// use lamellar::{LamellarWorldBuilder,SchedulerType}; + /// use lamellar::{LamellarWorldBuilder,ExecutorType}; /// /// let builder = LamellarWorldBuilder::new() - /// .with_scheduler(SchedulerType::WorkStealing); + /// .with_executor(ExecutorType::LamellarWorkStealing); ///``` #[tracing::instrument(skip_all)] - pub fn with_scheduler(mut self, sched: SchedulerType) -> LamellarWorldBuilder { - self.scheduler = sched; + pub fn with_executor(mut self, sched: ExecutorType) -> LamellarWorldBuilder { + self.executor = sched; self } @@ -473,7 +475,7 @@ impl LamellarWorldBuilder { /// # Examples /// ///``` - /// use lamellar::{LamellarWorldBuilder,SchedulerType}; + /// use lamellar::{LamellarWorldBuilder,ExecutorType}; /// /// let builder = LamellarWorldBuilder::new() /// .set_num_workers(10); @@ -493,11 +495,11 @@ impl LamellarWorldBuilder { /// # Examples /// ///``` - /// use lamellar::{LamellarWorldBuilder,Backend,SchedulerType}; + /// use lamellar::{LamellarWorldBuilder,Backend,ExecutorType}; /// /// let world = LamellarWorldBuilder::new() /// .with_lamellae(Backend::Local) - /// .with_scheduler(SchedulerType::WorkStealing) + /// .with_executor(ExecutorType::LamellarWorkStealing) /// .build(); ///``` #[tracing::instrument(skip_all)] @@ -518,14 +520,15 @@ impl LamellarWorldBuilder { // println!("{:?}: init_fabric", timer.elapsed()); // timer = std::time::Instant::now(); + + // we delay building the scheduler until we know the number of PEs (which is used for message aggregation) + // this could be lazyily provided but this is easy enough to do here let panic = Arc::new(AtomicU8::new(0)); let sched_new = Arc::new(create_scheduler( - self.scheduler, + self.executor, num_pes, self.num_threads, panic.clone(), - my_pe, - // teams.clone(), )); // println!("{:?}: create_scheduler", timer.elapsed()); diff --git a/src/lib.rs b/src/lib.rs index 36f74447..de0b420c 100755 --- a/src/lib.rs +++ b/src/lib.rs @@ -294,7 +294,7 @@ pub use 
crate::lamellar_team::LamellarTeam; #[doc(hidden)] pub use crate::lamellar_team::{ArcLamellarTeam, LamellarTeamRT}; pub use crate::lamellar_world::*; -pub use crate::scheduler::SchedulerType; +pub use crate::scheduler::ExecutorType; extern crate lamellar_impl; #[doc(hidden)] diff --git a/src/scheduler.rs b/src/scheduler.rs index f22755e8..97d85179 100755 --- a/src/scheduler.rs +++ b/src/scheduler.rs @@ -1,13 +1,33 @@ +use crate::active_messaging::batching::simple_batcher::SimpleBatcher; +use crate::active_messaging::batching::team_am_batcher::TeamAmBatcher; +use crate::active_messaging::batching::BatcherType; +use crate::active_messaging::registered_active_message::RegisteredActiveMessages; use crate::active_messaging::*; -use crate::lamellae::{Lamellae, SerializedData}; +use crate::lamellae::{Des, Lamellae, SerializedData}; use enum_dispatch::enum_dispatch; use futures::Future; -use std::sync::atomic::AtomicU8; +use std::sync::atomic::{AtomicU8, AtomicUsize, Ordering}; use std::sync::Arc; pub(crate) mod work_stealing; -use work_stealing::{WorkStealing, WorkStealingInner}; +use work_stealing::WorkStealing; + +#[cfg(feature = "tokio-executor")] +pub(crate) mod tokio; +#[cfg(feature = "tokio-executor")] +use tokio::TokioRt; + +// ACTIVE ENUM +// since atomic enums would be another dependecy + +#[repr(u8)] +#[derive(Copy, Clone, Debug, serde::Serialize, serde::Deserialize)] +pub(crate) enum SchedulerStatus { + Active, + Finished, + Panic, +} // pub(crate) mod numa_work_stealing; // use numa_work_stealing::{NumaWorkStealing, NumaWorkStealingInner}; @@ -31,118 +51,254 @@ pub(crate) struct ReqId { pub(crate) sub_id: usize, } -/// The available worker thread scheduling algorithms #[derive(Debug)] -pub enum SchedulerType { - /// The default (and currently only) scheduler, performs workstealing across all worker threads - WorkStealing, - // NumaWorkStealing, - // NumaWorkStealing2, +pub enum ExecutorType { + LamellarWorkStealing, + #[cfg(feature = "tokio-executor")] + Tokio, + // Dyn(impl LamellarExecutor), } -#[enum_dispatch(AmeSchedulerQueue)] -#[derive(Debug)] -pub(crate) enum AmeScheduler { - WorkStealingInner, - // NumaWorkStealingInner, - // NumaWorkStealing2Inner, -} #[enum_dispatch] -pub(crate) trait AmeSchedulerQueue { - fn submit_am( - &self, - scheduler: &(impl SchedulerQueue + Sync + std::fmt::Debug), - ame: Arc, - am: Am, - ); - fn submit_am_immediate( - &self, - scheduler: &(impl SchedulerQueue + Sync + std::fmt::Debug), - ame: Arc, - am: Am, - ); - fn submit_work( - &self, - scheduler: &(impl SchedulerQueue + Sync + std::fmt::Debug), - ame: Arc, - msg: SerializedData, - lamellae: Arc, - ); //serialized active message +pub(crate) trait LamellarExecutor { fn submit_task(&self, future: F) where - F: Future; + F: Future + Send + 'static, + F::Output: Send; + fn submit_immediate_task(&self, future: F) where - F: Future; - fn submit_immediate_task2(&self, future: F) - where - F: Future; - fn exec_task(&self); + F: Future + Send + 'static, + F::Output: Send, + { + Self::submit_task(self, future) + } - fn block_on(&self, future: F) -> F::Output - where - F: Future; + fn exec_task(&self) { + std::thread::yield_now(); + } + fn block_on(&self, future: F) -> F::Output; + + fn set_max_workers(&mut self, num_workers: usize); + fn num_workers(&self) -> usize; fn shutdown(&self); - fn shutdown_threads(&self); fn force_shutdown(&self); - fn active(&self) -> bool; } -#[enum_dispatch(SchedulerQueue)] +#[enum_dispatch(LamellarExecutor)] #[derive(Debug)] -pub(crate) enum Scheduler { - WorkStealing, - // 
NumaWorkStealing, - // NumaWorkStealing2, +pub(crate) enum Executor { + WorkStealing(WorkStealing), + #[cfg(feature = "tokio-executor")] + Tokio(TokioRt), } -#[enum_dispatch] -pub(crate) trait SchedulerQueue { - fn submit_am(&self, am: Am); //serialized active message - fn submit_am_immediate(&self, am: Am); //serialized active message - fn submit_work(&self, msg: SerializedData, lamellae: Arc); //serialized active message - fn submit_task(&self, future: F) - where - F: Future; - fn submit_immediate_task(&self, future: F) - where - F: Future; - fn submit_immediate_task2(&self, future: F) - where - F: Future; - fn submit_task_node(&self, future: F, node: usize) + +#[derive(Debug)] +pub(crate) struct Scheduler { + executor: Arc, + active_message_engine: RegisteredActiveMessages, //we can eventually abstract this around the ActiveMessageEngine trait but no need currently + num_ams: Arc, + max_ams: Arc, + num_tasks: Arc, + max_tasks: Arc, + am_stall_mark: Arc, + status: Arc, + panic: Arc, +} + +impl Scheduler { + pub(crate) fn new( + executor: Executor, + active_message_engine: RegisteredActiveMessages, + am_stall_mark: Arc, + status: Arc, + panic: Arc, + ) -> Self { + Self { + executor: Arc::new(executor), + active_message_engine, + num_ams: Arc::new(AtomicUsize::new(0)), + max_ams: Arc::new(AtomicUsize::new(0)), + num_tasks: Arc::new(AtomicUsize::new(0)), + max_tasks: Arc::new(AtomicUsize::new(0)), + am_stall_mark, + status, + panic, + } + } + pub(crate) fn submit_am(&self, am: Am) { + let num_ams = self.num_ams.clone(); + let max_ams = self.max_ams.clone(); + let am_stall_mark = self.am_stall_mark.fetch_add(1, Ordering::Relaxed); + let ame = self.active_message_engine.clone(); + let executor = self.executor.clone(); + let am_future = async move { + num_ams.fetch_add(1, Ordering::Relaxed); + max_ams.fetch_add(1, Ordering::Relaxed); + // println!("[{:?}] submit work exec req {:?} {:?} TaskId: {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task); + ame.process_msg(am, executor, am_stall_mark, false).await; + num_ams.fetch_sub(1, Ordering::Relaxed); + // println!("[{:?}] submit work done req {:?} {:?} TaskId: {:?} {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task,reqs); + }; + self.executor.submit_task(am_future); + } + + #[allow(dead_code)] + pub(crate) fn submit_am_immediate(&self, am: Am) { + let num_ams = self.num_ams.clone(); + let max_ams = self.max_ams.clone(); + let am_stall_mark = self.am_stall_mark.fetch_add(1, Ordering::Relaxed); + let ame = self.active_message_engine.clone(); + let executor = self.executor.clone(); + let am_future = async move { + num_ams.fetch_add(1, Ordering::Relaxed); + max_ams.fetch_add(1, Ordering::Relaxed); + // println!("[{:?}] submit work exec req {:?} {:?} TaskId: {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task); + ame.process_msg(am, executor, am_stall_mark, false).await; + num_ams.fetch_sub(1, Ordering::Relaxed); + // println!("[{:?}] submit work done req {:?} {:?} TaskId: {:?} {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task,reqs); + }; + self.executor.submit_immediate_task(am_future); + } + + pub(crate) fn submit_remote_am(&self, data: SerializedData, lamellae: Arc) { + let num_ams = self.num_ams.clone(); + let max_ams = self.max_ams.clone(); + let ame = self.active_message_engine.clone(); + let executor = 
self.executor.clone(); + let am_future = async move { + num_ams.fetch_add(1, Ordering::Relaxed); + max_ams.fetch_add(1, Ordering::Relaxed); + // println!("[{:?}] submit work exec req {:?} {:?} TaskId: {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task); + if let Some(header) = data.deserialize_header() { + let msg = header.msg; + ame.exec_msg(msg, data, lamellae, executor).await; + } else { + data.print(); + panic!("should i be here?"); + } + num_ams.fetch_sub(1, Ordering::Relaxed); + // println!("[{:?}] submit work done req {:?} {:?} TaskId: {:?} {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task,reqs); + }; + self.executor.submit_task(am_future); + } + + pub(crate) fn submit_task(&self, task: F) where - F: Future; - fn exec_task(&self); - fn block_on(&self, future: F) -> F::Output + F: Future + Send + 'static, + { + let num_tasks = self.num_tasks.clone(); + let max_tasks = self.max_tasks.clone(); + let future = async move { + num_tasks.fetch_add(1, Ordering::Relaxed); + max_tasks.fetch_add(1, Ordering::Relaxed); + task.await; + num_tasks.fetch_sub(1, Ordering::Relaxed); + }; + self.executor.submit_task(future); + } + + pub(crate) fn submit_immediate_task(&self, task: F) where - F: Future; - fn shutdown(&self); - fn shutdown_threads(&self); - fn force_shutdown(&self); - fn active(&self) -> bool; - fn num_workers(&self) -> usize; + F: Future + Send + 'static, + { + let num_tasks = self.num_tasks.clone(); + let max_tasks = self.max_tasks.clone(); + let future = async move { + num_tasks.fetch_add(1, Ordering::Relaxed); + max_tasks.fetch_add(1, Ordering::Relaxed); + task.await; + num_tasks.fetch_sub(1, Ordering::Relaxed); + }; + self.executor.submit_immediate_task(future); + } + + pub(crate) fn exec_task(&self) { + if std::thread::current().id() == *crate::MAIN_THREAD { + self.executor.exec_task(); + } else { + std::thread::yield_now(); + } + } + + pub(crate) fn block_on(&self, task: F) -> F::Output { + self.executor.block_on(task) + } + + #[allow(dead_code)] + pub(crate) fn get_executor(&self) -> Arc { + self.executor.clone() + } + + pub(crate) fn active(&self) -> bool { + self.status.load(Ordering::SeqCst) == SchedulerStatus::Active as u8 + || self.num_tasks.load(Ordering::SeqCst) > 3 // the Lamellae Comm Task, Lamellae Alloc Task, Lamellar Error Task + } + pub(crate) fn num_workers(&self) -> usize { + self.executor.num_workers() + } + pub(crate) fn begin_shutdown(&self) { + self.status + .store(SchedulerStatus::Finished as u8, Ordering::SeqCst); + } + pub(crate) fn shutdown(&self) { + let mut timer = std::time::Instant::now(); + while self.panic.load(Ordering::SeqCst) == 0 && self.num_tasks.load(Ordering::Relaxed) > 3 + //TODO maybe this should be > 2 + { + //the Lamellae Comm Task, Lamellae Alloc Task, Lamellar Error Task + if timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { + println!( + "shurtdown timeout, tasks remaining: {:?} panic: {:?}", + self.num_tasks.load(Ordering::Relaxed), + self.panic.load(Ordering::SeqCst), + ); + timer = std::time::Instant::now(); + } + std::thread::yield_now() + } + self.executor.shutdown(); + } + pub(crate) fn force_shutdown(&self) { + self.status + .store(SchedulerStatus::Panic as u8, Ordering::SeqCst); + self.executor.force_shutdown(); + } } pub(crate) fn create_scheduler( - sched: SchedulerType, + executor: ExecutorType, num_pes: usize, num_workers: usize, panic: Arc, - my_pe: usize, - // teams: Arc>>>, ) -> Scheduler { - 
match sched { - SchedulerType::WorkStealing => Scheduler::WorkStealing(work_stealing::WorkStealing::new( - num_pes, - num_workers, - panic, - my_pe, - )), // SchedulerType::NumaWorkStealing => { - // Scheduler::NumaWorkStealing(numa_work_stealing::NumaWorkStealing::new(num_pes)) - // } - // SchedulerType::NumaWorkStealing2 => { - // Scheduler::NumaWorkStealing2(numa_work_stealing2::NumaWorkStealing2::new(num_pes)) - // } - } + let am_stall_mark = Arc::new(AtomicUsize::new(0)); + let status = Arc::new(AtomicU8::new(SchedulerStatus::Active as u8)); + let executor = match executor { + ExecutorType::LamellarWorkStealing => { + WorkStealing::new(num_workers, status.clone(), panic.clone()).into() + } + #[cfg(feature = "tokio-executor")] + ExecutorType::Tokio => TokioRt::new(num_workers).into(), + }; + + let batcher = match std::env::var("LAMELLAR_BATCHER") { + Ok(n) => { + let n = n.parse::().unwrap(); + if n == 1 { + BatcherType::Simple(SimpleBatcher::new(num_pes, am_stall_mark.clone())) + } else { + BatcherType::TeamAm(TeamAmBatcher::new(num_pes, am_stall_mark.clone())) + } + } + Err(_) => BatcherType::TeamAm(TeamAmBatcher::new(num_pes, am_stall_mark.clone())), + }; + Scheduler::new( + executor, + RegisteredActiveMessages::new(batcher), + am_stall_mark, + status, + panic, + ) } diff --git a/src/scheduler/numa_work_stealing.rs b/src/scheduler/numa_work_stealing.rs index fffc7880..6cf2aa28 100644 --- a/src/scheduler/numa_work_stealing.rs +++ b/src/scheduler/numa_work_stealing.rs @@ -235,10 +235,7 @@ impl AmeSchedulerQueue for NumaWorkStealingInner { task.detach(); } - fn block_on(&self, future: F) -> F::Output - where - F: Future, - { + fn block_on(&self, future: F) -> F::Output { let work_inj = self.work_inj[self .local_work_inj .get_or(|| AtomicUsize::new(0)) @@ -503,7 +500,7 @@ impl NumaWorkStealingInner { #[derive(Debug)] pub(crate) struct NumaWorkStealing { - inner: Arc, + inner: &(impl SchedulerQueue + Sync + std::fmt::Debug), ame: Arc, } impl NumaWorkStealing { diff --git a/src/scheduler/numa_work_stealing2.rs b/src/scheduler/numa_work_stealing2.rs index bb7b723b..49fe8380 100644 --- a/src/scheduler/numa_work_stealing2.rs +++ b/src/scheduler/numa_work_stealing2.rs @@ -431,7 +431,7 @@ thread_local! 
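Back in src/scheduler.rs above, create_scheduler is now the single place where both the executor backend and the batcher are chosen: ExecutorType picks the backend (Tokio only exists behind the tokio-executor cargo feature), and the LAMELLAR_BATCHER environment variable picks the batcher, where a value of 1 selects SimpleBatcher, any other numeric value or an unset variable keeps the TeamAmBatcher default, and a non-numeric value panics at the unwrap. A hedged sketch of the call shape; ExecutorType is re-exported publicly (see the lib.rs change above) while create_scheduler and Scheduler stay crate-internal, and the counts here are placeholders since the real values come from the world builder:

    use std::sync::atomic::AtomicU8;
    use std::sync::Arc;

    fn build_scheduler(num_pes: usize, num_workers: usize) -> Scheduler {
        // a fresh panic flag; the runtime normally threads its own through
        let panic = Arc::new(AtomicU8::new(0));
        #[cfg(feature = "tokio-executor")]
        let executor_type = ExecutorType::Tokio;
        #[cfg(not(feature = "tokio-executor"))]
        let executor_type = ExecutorType::LamellarWorkStealing;
        create_scheduler(executor_type, num_pes, num_workers, panic)
    }
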
{ #[derive(Debug)] pub(crate) struct NumaWorkStealing2 { - inners: Vec>, + inners: Vec<&(impl SchedulerQueue + Sync + std::fmt::Debug)>, ames: Vec>, node_mask: usize, } diff --git a/src/scheduler/tokio.rs b/src/scheduler/tokio.rs new file mode 100644 index 00000000..f9e14ac1 --- /dev/null +++ b/src/scheduler/tokio.rs @@ -0,0 +1,88 @@ +use crate::scheduler::{LamellarExecutor, SchedulerStatus}; + +use tokio::runtime::Runtime; + +use tracing::*; + +use async_task::{Builder, Runnable}; +use core_affinity::CoreId; +use crossbeam::deque::Worker; +use futures::Future; +use futures_lite::FutureExt; +use rand::prelude::*; +use std::panic; +use std::process; +use std::sync::atomic::{AtomicU8, AtomicUsize, Ordering}; +use std::sync::Arc; //, Weak}; +use std::thread; + +static TASK_ID: AtomicUsize = AtomicUsize::new(0); + +#[derive(Debug)] +pub(crate) struct TokioRt { + max_num_threads: usize, + rt: Runtime, +} + +impl LamellarExecutor for TokioRt { + fn submit_task(&self, task: F) + where + F: Future + Send + 'static, + F::Output: Send, + { + trace_span!("submit_task").in_scope(|| { + self.rt.spawn(async move { task.await }); + }); + } + + fn submit_immediate_task(&self, task: F) + where + F: Future + Send + 'static, + F::Output: Send, + { + trace_span!("submit_task").in_scope(|| { + self.rt.spawn(async move { task.await }); + }); + } + + fn block_on(&self, task: F) -> F::Output { + trace_span!("block_on").in_scope(|| self.rt.block_on(task)) + } + + #[tracing::instrument(skip_all)] + fn shutdown(&self) { + // i think we just let tokio do this on drop + } + + #[tracing::instrument(skip_all)] + fn force_shutdown(&self) { + // i think we just let tokio do this on drop + } + + #[tracing::instrument(skip_all)] + fn exec_task(&self) { + // I dont think tokio has a way to do this + } + + fn set_max_workers(&mut self, num_workers: usize) { + self.max_num_threads = num_workers; + } + + fn num_workers(&self) -> usize { + self.max_num_threads + } +} + +impl TokioRt { + pub(crate) fn new(num_workers: usize) -> TokioRt { + // println!("New TokioRT with {} workers", num_workers); + TokioRt { + max_num_threads: num_workers + 1, //LAMELLAR_THREADS = num_workers + 1, so for tokio runtime, we actually want num_workers + 1 worker threads as block_on will not do anywork on the main thread (i think)... 
+ rt: tokio::runtime::Builder::new_multi_thread() + .worker_threads(num_workers + 1) + .enable_all() + .build() + .unwrap(), + } + } +} diff --git a/src/scheduler/work_stealing.rs b/src/scheduler/work_stealing.rs index bcf7c158..e7a06fe4 100644 --- a/src/scheduler/work_stealing.rs +++ b/src/scheduler/work_stealing.rs @@ -1,10 +1,4 @@ -use crate::active_messaging::{ActiveMessageEngine, ActiveMessageEngineType, Am}; -use crate::lamellae::{Des, Lamellae, SerializedData}; -use crate::scheduler::batching::simple_batcher::SimpleBatcher; -use crate::scheduler::batching::team_am_batcher::TeamAmBatcher; -use crate::scheduler::batching::BatcherType; -use crate::scheduler::registered_active_message::RegisteredActiveMessages; -use crate::scheduler::{AmeScheduler, AmeSchedulerQueue, SchedulerQueue}; +use crate::scheduler::{LamellarExecutor, SchedulerStatus}; use tracing::*; @@ -13,29 +7,14 @@ use core_affinity::CoreId; use crossbeam::deque::Worker; use futures::Future; use futures_lite::FutureExt; -// use parking_lot::Mutex; use rand::prelude::*; -// use std::collections::BTreeMap; use std::panic; use std::process; use std::sync::atomic::{AtomicU8, AtomicUsize, Ordering}; use std::sync::Arc; //, Weak}; use std::thread; -// use std::time::Instant; -// use std::time::Instant; - -const ACTIVE: u8 = 0; -const FINISHED: u8 = 1; -const PANIC: u8 = 2; static TASK_ID: AtomicUsize = AtomicUsize::new(0); - -// static LAST_PRINTED_TASKS: AtomicUsize = AtomicUsize::new(0); - -// static OUTSTANDING_REQS: Mutex> = parking_lot::const_mutex(HashMap::new()); -// lazy_static!{ static ref OUTSTANDING_REQS: Mutex> = Mutex::new(BTreeMap::new()); } - - #[derive(Debug)] pub(crate) struct WorkStealingThread { imm_inj: Arc>>, @@ -43,52 +22,37 @@ pub(crate) struct WorkStealingThread { work_stealers: Vec>>, work_q: Worker>, work_flag: Arc, - active: Arc, + status: Arc, panic: Arc, } - - impl WorkStealingThread { #[tracing::instrument(skip_all)] fn run( worker: WorkStealingThread, active_cnt: Arc, - num_tasks: Arc, - _max_tasks: Arc, + // num_tasks: Arc, id: CoreId, - _my_pe: usize, ) -> thread::JoinHandle<()> { let builder = thread::Builder::new().name("worker_thread".into()); builder .spawn(move || { // println!("TestSchdulerWorker thread running {:?} core: {:?}", std::thread::current().id(), id); - // let mut num_task_executed = 0; let _span = trace_span!("WorkStealingThread::run"); core_affinity::set_for_current(id); active_cnt.fetch_add(1, Ordering::SeqCst); let mut rng = rand::thread_rng(); let t = rand::distributions::Uniform::from(0..worker.work_stealers.len()); let mut timer = std::time::Instant::now(); - // let mut cur_tasks = num_tasks.load(Ordering::SeqCst); while worker.panic.load(Ordering::SeqCst) == 0 - && (worker.active.load(Ordering::SeqCst) == ACTIVE - || !(worker.work_q.is_empty() - && worker.work_inj.is_empty() - && worker.imm_inj.is_empty()) - || num_tasks.load(Ordering::SeqCst) > 1) + && ( + worker.status.load(Ordering::SeqCst) == SchedulerStatus::Active as u8 + || !(worker.work_q.is_empty() + && worker.work_inj.is_empty() + && worker.imm_inj.is_empty()) + // || num_tasks.load(Ordering::SeqCst) > 1 + ) { - // let ot = Instant::now(); - // if cur_tasks != num_tasks.load(Ordering::SeqCst){ - // println!( - // "work_q size {:?} work inj size {:?} num_tasks {:?}", - // worker.work_q.len(), - // worker.work_inj.len(), - // num_tasks.load(Ordering::SeqCst) - // ); - // cur_tasks = num_tasks.load(Ordering::SeqCst); - - // } let omsg = if !worker.imm_inj.is_empty() { worker.imm_inj.steal().success() } else { @@ 
-113,50 +77,32 @@ impl WorkStealingThread { }; if let Some(runnable) = omsg { - if worker.active.load(Ordering::SeqCst) == FINISHED - && timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT + if worker.status.load(Ordering::SeqCst) == SchedulerStatus::Finished as u8 + && timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { println!("runnable {:?}", runnable); println!( - "work_q size {:?} work inj size {:?} num_tasks {:?}", + "work_q size {:?} work inj size {:?}", // num_tasks {:?}", worker.work_q.len(), worker.work_inj.len(), - num_tasks.load(Ordering::SeqCst) + // num_tasks.load(Ordering::SeqCst) ); timer = std::time::Instant::now(); } - // *OUTSTANDING_REQS.lock().entry(*runnable.metadata()).or_insert(0) += 1; - // if LAST_PRINTED_TASKS.load(Ordering::Relaxed) != num_tasks.load(Ordering::Relaxed) { - // LAST_PRINTED_TASKS.store(num_tasks.load(Ordering::Relaxed), Ordering::Relaxed); - // let work_stealers_lens = worker.work_stealers.iter().map(|x| x.len()).collect::>(); - // println!("[{:?}] (worker thread) Executing task {:?}, num_tasks: {:?} {:?} {:?} {work_stealers_lens:?} {:?}", std::thread::current().id(), runnable.metadata(),num_tasks.load(Ordering::Relaxed), worker.imm_inj.len(),worker.work_inj.len(), OUTSTANDING_REQS.lock()); - // } runnable.run(); } - if worker.active.load(Ordering::SeqCst) == FINISHED - && timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT + if worker.status.load(Ordering::SeqCst) == SchedulerStatus::Finished as u8 + && timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT && (worker.work_q.len() > 0 || worker.work_inj.len() > 0) { println!( - "work_q size {:?} work inj size {:?} num_tasks {:?}", + "work_q size {:?} work inj size {:?} ", // num_tasks {:?}", worker.work_q.len(), worker.work_inj.len(), - num_tasks.load(Ordering::SeqCst) + // num_tasks.load(Ordering::SeqCst) ); timer = std::time::Instant::now(); } - // if timer.elapsed().as_secs_f64() > 10.0 { - // println!( - // "[{:?}] work_q size {:?} work inj size {:?} num_tasks {:?} {:?} {:?}", - // std::thread::current().id(), - // worker.work_q.len(), - // worker.work_inj.len(), - // num_tasks.load(Ordering::SeqCst), - // worker.active.load(Ordering::SeqCst) == FINISHED, - // OUTSTANDING_REQS.lock() - // ); - // timer = std::time::Instant::now() - // } std::thread::yield_now(); } active_cnt.fetch_sub(1, Ordering::SeqCst); @@ -167,334 +113,98 @@ impl WorkStealingThread { } #[derive(Debug)] -pub(crate) struct WorkStealingInner { +pub(crate) struct WorkStealing { + max_num_threads: usize, threads: Vec>, imm_inj: Arc>>, work_inj: Arc>>, work_stealers: Vec>>, work_flag: Arc, - active: Arc, + status: Arc, active_cnt: Arc, - num_tasks: Arc, - max_tasks: Arc, - stall_mark: Arc, panic: Arc, } -impl AmeSchedulerQueue for WorkStealingInner { - #[tracing::instrument(skip_all)] - fn submit_am( - //unserialized request - &self, - scheduler: &(impl SchedulerQueue + Sync + std::fmt::Debug), - ame: Arc, - am: Am, - ) { - // println!("submitting_req"); - // println!("submit req {:?}",self.num_tasks.load(Ordering::Relaxed)+1); - let num_tasks = self.num_tasks.clone(); - let max_tasks = self.max_tasks.clone(); - let stall_mark = self.stall_mark.fetch_add(1, Ordering::Relaxed); - let future = move |_cur_task| async move { - num_tasks.fetch_add(1, Ordering::Relaxed); - max_tasks.fetch_add(1, Ordering::Relaxed); - // println!("[{:?}] submit am exec req {:?} {:?} TaskId: {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task); - ame.process_msg(am, 
scheduler, stall_mark, false).await; - num_tasks.fetch_sub(1, Ordering::Relaxed); - // let mut reqs = OUTSTANDING_REQS.lock(); - // reqs.remove(cur_task); - // println!("[{:?}] submit am done {:?} {:?} TaskId: {:?} {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task, reqs); - }; - let work_inj = self.work_inj.clone(); - // let schedule = move |runnable| work_inj.push(runnable); - let schedule = move |runnable| work_inj.push(runnable); - // let (runnable, task) = unsafe { async_task::spawn_unchecked(future, schedule) }; //safe as contents are sync+send, and no borrowed variables - let (runnable, task) = unsafe { Builder::new().metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)).spawn_unchecked( future, schedule) }; - // println!("[{:?}] submit am schedule task {:?} {:?} {:?}", std::thread::current().id(),runnable.metadata(),self.num_tasks.load(Ordering::Relaxed),self.max_tasks.load(Ordering::Relaxed)); - runnable.schedule(); - task.detach(); - } - - #[tracing::instrument(skip_all)] - fn submit_am_immediate( - //unserialized request - &self, - scheduler: &(impl SchedulerQueue + Sync + std::fmt::Debug), - ame: Arc, - am: Am, - ) { - // println!("submitting_req"); - // println!("submit req {:?}",self.num_tasks.load(Ordering::Relaxed)+1); - let num_tasks = self.num_tasks.clone(); - let max_tasks = self.max_tasks.clone(); - let stall_mark = self.stall_mark.fetch_add(1, Ordering::Relaxed); - let future = move |_cur_task| async move { - num_tasks.fetch_add(1, Ordering::Relaxed); - max_tasks.fetch_add(1, Ordering::Relaxed); - // println!("[{:?}] submit am imm exec req {:?} {:?} TaskId: {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task); - ame.process_msg(am, scheduler, stall_mark, true).await; - num_tasks.fetch_sub(1, Ordering::Relaxed); - - // let mut reqs = OUTSTANDING_REQS.lock(); - // reqs.remove(cur_task); - // println!("[{:?}] submit am imm done {:?} {:?} TaskId: {:?} {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task, reqs); - }; - let work_inj = self.work_inj.clone(); - // let schedule = move |runnable| work_inj.push(runnable); - let schedule = move |runnable| work_inj.push(runnable); - // let (runnable, task) = unsafe { async_task::spawn_unchecked(future, schedule) }; //safe as contents are sync+send, and no borrowed variables - let (runnable, task) = unsafe { Builder::new().metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)).spawn_unchecked(future, schedule) }; - // println!("[{:?}] submit am imm running task {:?} {:?} {:?}", std::thread::current().id(),runnable.metadata(),self.num_tasks.load(Ordering::Relaxed),self.max_tasks.load(Ordering::Relaxed)); - // *OUTSTANDING_REQS.lock().entry(*runnable.metadata()).or_insert(0) += 1; - runnable.run(); - task.detach(); - } - - //this is a serialized request - #[tracing::instrument(skip_all)] - fn submit_work( - &self, - scheduler: &(impl SchedulerQueue + Sync + std::fmt::Debug), - ame: Arc, - data: SerializedData, - lamellae: Arc, - ) { - // let work_inj = self.work_inj.clone(); - // println!("submit work {:?}", self.num_tasks.load(Ordering::Relaxed)); - let num_tasks = self.num_tasks.clone(); - let max_tasks = self.max_tasks.clone(); - let future = move|_cur_task|async move { - num_tasks.fetch_add(1, Ordering::Relaxed); - max_tasks.fetch_add(1, Ordering::Relaxed); - // println!("[{:?}] submit work exec req {:?} {:?} TaskId: {:?}", 
std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task); - if let Some(header) = data.deserialize_header() { - let msg = header.msg; - ame.exec_msg(msg, data, lamellae, scheduler).await; - } else { - data.print(); - panic!("should i be here?"); - } - num_tasks.fetch_sub(1, Ordering::Relaxed); - // let mut reqs = OUTSTANDING_REQS.lock(); - // reqs.remove(cur_task); - // println!("[{:?}] submit work done req {:?} {:?} TaskId: {:?} {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task,reqs); - }; - let work_inj = self.work_inj.clone(); - // let schedule = move |runnable| work_inj.push(runnable); - let schedule = move |runnable| work_inj.push(runnable); - // let (runnable, task) = unsafe { async_task::spawn_unchecked(future, schedule) }; //safe as contents are sync+send, and no borrowed variables - let (runnable, task) = unsafe { Builder::new().metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)).spawn_unchecked(future, schedule) }; - // println!("[{:?}] submit work schedule task {:?} {:?} {:?}", std::thread::current().id(),runnable.metadata(),self.num_tasks.load(Ordering::Relaxed),self.max_tasks.load(Ordering::Relaxed)); - - runnable.schedule(); - task.detach(); - } - - fn submit_task(&self, future: F) +impl LamellarExecutor for WorkStealing { + fn submit_task(&self, task: F) where - F: Future, + F: Future + Send + 'static, + F::Output: Send, { trace_span!("submit_task").in_scope(|| { - let num_tasks = self.num_tasks.clone(); - let max_tasks = self.max_tasks.clone(); - let future2 = move|_cur_task| async move { - num_tasks.fetch_add(1, Ordering::Relaxed); - max_tasks.fetch_add(1, Ordering::Relaxed); - // println!("[{:?}] submit task exec req {:?} {:?} TaskId: {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task); - future.await; - num_tasks.fetch_sub(1, Ordering::Relaxed); - // let mut reqs = OUTSTANDING_REQS.lock(); - // reqs.remove(cur_task); - // println!("[{:?}] submit task done {:?} {:?} TaskId: {:?} {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task,reqs); - }; let work_inj = self.work_inj.clone(); - // let schedule = move |runnable| work_inj.push(runnable); let schedule = move |runnable| work_inj.push(runnable); - // let (runnable, task) = unsafe { async_task::spawn_unchecked(future2, schedule) }; //safe //safe as contents are sync+send... 
may need to do something to enforce lifetime bounds - let (runnable, task) = unsafe { Builder::new().metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)).spawn_unchecked(future2, schedule) }; - // println!("[{:?}] submit task schedule task {:?} {:?} {:?}", std::thread::current().id(),runnable.metadata(),self.num_tasks.load(Ordering::Relaxed),self.max_tasks.load(Ordering::Relaxed)); + let (runnable, task) = Builder::new() + .metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)) + .spawn(move |_task_id| async move { task.await }, schedule); runnable.schedule(); task.detach(); }); } - fn submit_immediate_task(&self, future: F) + fn submit_immediate_task(&self, task: F) where - F: Future, + F: Future + Send + 'static, + F::Output: Send, { trace_span!("submit_task").in_scope(|| { - let num_tasks = self.num_tasks.clone(); - let max_tasks = self.max_tasks.clone(); - let future2 = move |_cur_task| async move { - // println!("exec task {:?}",num_tasks.load(Ordering::Relaxed)+1); - num_tasks.fetch_add(1, Ordering::Relaxed); - max_tasks.fetch_add(1, Ordering::Relaxed); - // println!("[{:?}] submit imm task exec req {:?} {:?} TaskId: {:?} ", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task); - - future.await; - num_tasks.fetch_sub(1, Ordering::Relaxed); - // let mut reqs = OUTSTANDING_REQS.lock(); - // reqs.remove(cur_task); - // println!("[{:?}] submit imm task exec done {:?} {:?} TaskId: {:?} {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task,reqs); - }; - let work_inj = self.work_inj.clone(); - // let schedule = move |runnable| work_inj.push(runnable); - let schedule = move |runnable| work_inj.push(runnable); - // let (runnable, task) = unsafe { async_task::spawn_unchecked(future2, schedule) }; //safe //safe as contents are sync+send... may need to do something to enforce lifetime bounds - let (runnable, task) = unsafe { Builder::new().metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)).spawn_unchecked(future2, schedule) }; - // println!("[{:?}] submit imm task schedule task {:?} {:?} {:?}", std::thread::current().id(),runnable.metadata(),self.num_tasks.load(Ordering::Relaxed),self.max_tasks.load(Ordering::Relaxed)); - // *OUTSTANDING_REQS.lock().entry(*runnable.metadata()).or_insert(0) += 1; - runnable.run(); //try to run immediately - task.detach(); - }); - } - - fn submit_immediate_task2(&self, future: F) - where - F: Future, - { - trace_span!("submit_task").in_scope(|| { - let num_tasks = self.num_tasks.clone(); - let max_tasks = self.max_tasks.clone(); - let future2 = move|_cur_task| async move { - num_tasks.fetch_add(1, Ordering::Relaxed); - max_tasks.fetch_add(1, Ordering::Relaxed); - // println!("[{:?}] submit imm2 task exec req {:?} {:?} TaskId: {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task); - - future.await; - num_tasks.fetch_sub(1, Ordering::Relaxed); - // let mut reqs = OUTSTANDING_REQS.lock(); - // reqs.remove(cur_task); - // println!("[{:?}] submit imm2 task exec done {:?} {:?} TaskId: {:?} {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task, reqs); - }; let imm_inj = self.imm_inj.clone(); - // let schedule = move |runnable| imm_inj.push(runnable); let schedule = move |runnable| imm_inj.push(runnable); - // let (runnable, task) = unsafe { async_task::spawn_unchecked(future2, schedule) }; //safe //safe as contents are sync+send... 
may need to do something to enforce lifetime bounds - let (runnable, task) = unsafe { Builder::new().metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)).spawn_unchecked(future2, schedule) }; - // println!("[{:?}] submit imm2 task schedule task {:?} {:?} {:?}", std::thread::current().id(),runnable.metadata(),self.num_tasks.load(Ordering::Relaxed),self.max_tasks.load(Ordering::Relaxed)); + let (runnable, task) = Builder::new() + .metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)) + .spawn(move |_task_id| async move { task.await }, schedule); runnable.schedule(); //try to run immediately task.detach(); }); } - fn block_on(&self, future: F) -> F::Output - where - F: Future, - { + fn block_on(&self, task: F) -> F::Output { trace_span!("block_on").in_scope(|| { - // println!( - // "[{:?}] work stealing block on -- num tasks {:?} max tasks {:?} tasks executed {:?}", - // std::thread::current().id(), - // self.num_tasks.load(Ordering::Relaxed), - // self.max_tasks.load(Ordering::Relaxed), - // 0 - // ); - let num_tasks = self.num_tasks.clone(); - let max_tasks = self.max_tasks.clone(); - let future2 = move|_cur_task| async move { - num_tasks.fetch_add(1, Ordering::Relaxed); - max_tasks.fetch_add(1, Ordering::Relaxed); - // println!("[{:?}] block on task exec req {:?} {:?} TaskId: {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task); - - let res = future.await; - num_tasks.fetch_sub(1, Ordering::Relaxed); - // let mut reqs = OUTSTANDING_REQS.lock(); - // reqs.remove(cur_task); - // println!("[{:?}] block on task exec done {:?} {:?} TaskId: {:?} {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task, reqs); - res - }; let work_inj = self.work_inj.clone(); - // let schedule = move |runnable| work_inj.push(runnable); let schedule = move |runnable| work_inj.push(runnable); - - // let (runnable, mut task) = unsafe { async_task::spawn_unchecked(future, schedule) }; //safe //safe as contents are sync+send... 
may need to do something to enforce lifetime bounds - let (runnable, mut task) = unsafe { Builder::new().metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)).spawn_unchecked(future2, schedule) }; + let (runnable, mut task) = unsafe { + Builder::new() + .metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)) + .spawn_unchecked(move |_task_id| async move { task.await }, schedule) + }; let waker = runnable.waker(); - // *OUTSTANDING_REQS.lock().entry(*runnable.metadata()).or_insert(0) += 1; runnable.run(); //try to run immediately - // let mut s = std::time::Instant::now(); - // let mut cnt = 0; while !task.is_finished() { - self.exec_task(); - // if s.elapsed().as_secs() > 10 { - // println!( - // "[{:?}] work stealing block on timeout -- num tasks {:?} max tasks {:?} tasks executed {:?} task id{:?} {:?}", - // std::thread::current().id(), - // self.num_tasks.load(Ordering::Relaxed), - // self.max_tasks.load(Ordering::Relaxed), - // cnt, - // task.metadata(), - // OUTSTANDING_REQS.lock(), - // ); - // s = std::time::Instant::now(); - // break; - // } - // cnt += 1; - // std::thread::yield_now(); + self.exec_task(); //try to execute another task while this one is not ready } let cx = &mut async_std::task::Context::from_waker(&waker); if let async_std::task::Poll::Ready(output) = task.poll(cx) { - // println!( - // "[{:?}] work stealing block on done -- num tasks {:?} max tasks {:?} tasks executed {:?} task id{:?}", - // std::thread::current().id(), - // self.num_tasks.load(Ordering::Relaxed), - // self.max_tasks.load(Ordering::Relaxed), - // cnt, - // task.metadata() - // ); output } else { println!( - "[{:?}] work stealing block on failed -- num tasks {:?} max tasks {:?} task id{:?}", + "[{:?}] work stealing block on failed -- task id{:?}", std::thread::current().id(), - self.num_tasks.load(Ordering::Relaxed), - self.max_tasks.load(Ordering::Relaxed), task.metadata() ); panic!("task not ready"); } - }) } #[tracing::instrument(skip_all)] fn shutdown(&self) { - // println!("work stealing shuting down {:?}", self.active()); - self.active.store(FINISHED, Ordering::SeqCst); - // println!("work stealing shuting down {:?}",self.active()); - while self.panic.load(Ordering::SeqCst) == 0 - && (self.active_cnt.load(Ordering::Relaxed) > 0 //num active threads - || self.num_tasks.load(Ordering::Relaxed) > 2) + while self.panic.load(Ordering::SeqCst) == 0 && self.active_cnt.load(Ordering::Relaxed) > 0 { - //this should be the recvtask, and alloc_task + //num active threads + self.exec_task(); std::thread::yield_now() } - // println!( - // "work stealing shut down {:?} {:?} {:?}", - // self.active(), - // self.active_cnt.load(Ordering::Relaxed), - // self.active_cnt.load(Ordering::Relaxed) - // ); - } - - #[tracing::instrument(skip_all)] - fn shutdown_threads(&self) { - self.active.store(FINISHED, Ordering::SeqCst); } #[tracing::instrument(skip_all)] fn force_shutdown(&self) { - // println!("work stealing shuting down {:?}", self.active()); - self.active.store(PANIC, Ordering::SeqCst); - // println!("work stealing shuting down {:?}",self.active()); + // println!("work stealing shuting down {:?}", self.status()); + + // println!("work stealing shuting down {:?}",self.status()); let my_id = std::thread::current().id(); if self.threads.iter().any(|e| e.thread().id() == my_id) { - // while self.active_cnt.load(Ordering::Relaxed) > 1 {//num active threads -- wait for all but myself - // std::thread::yield_now() - // } self.active_cnt.fetch_sub(1, Ordering::SeqCst); // I paniced so I wont actually decrement } else { 
while self.active_cnt.load(Ordering::Relaxed) > 0 { @@ -505,7 +215,7 @@ impl AmeSchedulerQueue for WorkStealingInner { } // println!( // "work stealing shut down {:?} {:?} {:?}", - // self.active(), + // self.status(), // self.active_cnt.load(Ordering::Relaxed), // self.active_cnt.load(Ordering::Relaxed) // ); @@ -529,141 +239,46 @@ impl AmeSchedulerQueue for WorkStealingInner { } else { self.work_stealers[t.sample(&mut rng)].steal().success() } - }; if let Some(runnable) = ret { - // if LAST_PRINTED_TASKS.load(Ordering::Relaxed) != self.num_tasks.load(Ordering::Relaxed) { - // LAST_PRINTED_TASKS.store(self.num_tasks.load(Ordering::Relaxed), Ordering::Relaxed); - // let work_stealers_lens = self.work_stealers.iter().map(|x| x.len()).collect::>(); - // // println!("[{:?}] (exec_task) Executing task {:?}, num_tasks: {:?} {:?} {:?} {work_stealers_lens:?} {:?}", std::thread::current().id(), runnable.metadata(),self.num_tasks.load(Ordering::Relaxed), self.imm_inj.len(),self.work_inj.len(), OUTSTANDING_REQS.lock()); - // } - // *OUTSTANDING_REQS.lock().entry(*runnable.metadata()).or_insert(0) += 1; runnable.run(); } } - #[tracing::instrument(skip_all)] - fn active(&self) -> bool { - // println!("sched active {:?} {:?}",self.active.load(Ordering::SeqCst) , self.num_tasks.load(Ordering::SeqCst)); - self.active.load(Ordering::SeqCst) == ACTIVE || self.num_tasks.load(Ordering::SeqCst) > 3 - } -} - -impl SchedulerQueue for WorkStealing { - fn submit_am( - //unserialized request - &self, - am: Am, - ) { - self.inner.submit_am(self, self.ame.clone(), am); - } - - fn submit_am_immediate( - //unserialized request - &self, - am: Am, - ) { - self.inner.submit_am_immediate(self, self.ame.clone(), am); + fn set_max_workers(&mut self, num_workers: usize) { + self.max_num_threads = num_workers; } - // fn submit_return(&self, src, pe) - - fn submit_work(&self, data: SerializedData, lamellae: Arc) { - self.inner - .submit_work(self, self.ame.clone(), data, lamellae); - } - - fn submit_task(&self, future: F) - where - F: Future, - { - self.inner.submit_task(future); - } - - fn submit_immediate_task(&self, future: F) - where - F: Future, - { - self.inner.submit_immediate_task(future); - } - - fn submit_immediate_task2(&self, future: F) - where - F: Future, - { - self.inner.submit_immediate_task2(future); - } - - fn exec_task(&self) { - self.inner.exec_task(); - std::thread::yield_now(); - } - - fn submit_task_node(&self, future: F, _node: usize) - where - F: Future, - { - self.inner.submit_task(future); - } - - fn block_on(&self, future: F) -> F::Output - where - F: Future, - { - self.inner.block_on(future) - } - - fn shutdown(&self) { - self.inner.shutdown(); - } - - fn shutdown_threads(&self) { - self.inner.shutdown_threads(); - } - - fn force_shutdown(&self) { - self.inner.force_shutdown(); - } - fn active(&self) -> bool { - self.inner.active() - } fn num_workers(&self) -> usize { self.max_num_threads } } -impl WorkStealingInner { - #[tracing::instrument(skip_all)] +impl WorkStealing { pub(crate) fn new( - stall_mark: Arc, num_workers: usize, + status: Arc, panic: Arc, - my_pe: usize, - ) -> WorkStealingInner { + ) -> WorkStealing { // println!("new work stealing queue"); - - let mut sched = WorkStealingInner { + let mut ws = WorkStealing { + max_num_threads: num_workers, threads: Vec::new(), imm_inj: Arc::new(crossbeam::deque::Injector::new()), work_inj: Arc::new(crossbeam::deque::Injector::new()), work_stealers: Vec::new(), work_flag: Arc::new(AtomicU8::new(0)), - active: Arc::new(AtomicU8::new(ACTIVE)), + 
status: status, active_cnt: Arc::new(AtomicUsize::new(0)), - num_tasks: Arc::new(AtomicUsize::new(0)), - max_tasks: Arc::new(AtomicUsize::new(0)), - stall_mark: stall_mark, panic: panic, }; - sched.init(num_workers, my_pe); - sched + ws.init(); + ws } - #[tracing::instrument(skip_all)] - fn init(&mut self, num_workers: usize, my_pe: usize) { - let mut work_workers: std::vec::Vec>> = - vec![]; - for _i in 0..num_workers { + fn init(&mut self) { + let mut work_workers: std::vec::Vec>> = vec![]; + for _i in 0..self.max_num_threads { let work_worker: crossbeam::deque::Worker> = crossbeam::deque::Worker::new_fifo(); self.work_stealers.push(work_worker.stealer()); @@ -683,7 +298,7 @@ impl WorkStealingInner { } }; // println!("core_ids: {:?}",core_ids); - for i in 0..num_workers { + for i in 0..self.max_num_threads { let work_worker = work_workers.pop().unwrap(); let worker = WorkStealingThread { imm_inj: self.imm_inj.clone(), @@ -691,17 +306,14 @@ impl WorkStealingInner { work_stealers: self.work_stealers.clone(), work_q: work_worker, work_flag: self.work_flag.clone(), - active: self.active.clone(), + status: self.status.clone(), panic: self.panic.clone(), - // num_tasks: self.num_tasks.clone(), }; self.threads.push(WorkStealingThread::run( worker, self.active_cnt.clone(), - self.num_tasks.clone(), - self.max_tasks.clone(), + // self.num_tasks.clone(), core_ids[i % core_ids.len()], - my_pe, )); } while self.active_cnt.load(Ordering::SeqCst) != self.threads.len() { @@ -710,54 +322,7 @@ impl WorkStealingInner { } } -#[derive(Debug)] -pub(crate) struct WorkStealing { - inner: Arc, - ame: Arc, - max_num_threads: usize, //including the main thread -} -impl WorkStealing { - #[tracing::instrument(skip_all)] - pub(crate) fn new( - num_pes: usize, - num_workers: usize, - panic: Arc, - my_pe: usize, - // teams: Arc>>>, - ) -> WorkStealing { - // println!("new work stealing queue"); - let stall_mark = Arc::new(AtomicUsize::new(0)); - let inner = Arc::new(AmeScheduler::WorkStealingInner(WorkStealingInner::new( - stall_mark.clone(), - num_workers, - panic.clone(), - my_pe, - ))); - - let batcher = match std::env::var("LAMELLAR_BATCHER") { - Ok(n) => { - let n = n.parse::().unwrap(); - if n == 1 { - BatcherType::Simple(SimpleBatcher::new(num_pes, stall_mark.clone())) - } else { - BatcherType::TeamAm(TeamAmBatcher::new(num_pes, stall_mark.clone())) - } - } - Err(_) => BatcherType::TeamAm(TeamAmBatcher::new(num_pes, stall_mark.clone())), - }; - - let sched = WorkStealing { - inner: inner.clone(), - ame: Arc::new(ActiveMessageEngineType::RegisteredActiveMessages( - RegisteredActiveMessages::new(batcher), - )), - max_num_threads: num_workers, - }; - sched - } -} - -impl Drop for WorkStealingInner { +impl Drop for WorkStealing { //when is this called with respect to world? #[tracing::instrument(skip_all)] fn drop(&mut self) { diff --git a/tests/array/arithmetic_ops/add_test.rs b/tests/array/arithmetic_ops/add_test.rs index 42f3e53a..5bf47967 100644 --- a/tests/array/arithmetic_ops/add_test.rs +++ b/tests/array/arithmetic_ops/add_test.rs @@ -438,7 +438,7 @@ macro_rules! input_test{ // array.add(input_array.clone(),1); // check_results!($array,array,num_pes,"LocalLockArray"); // LocalLockArray------------------------------ - array.batch_add(&world.block_on(input_array.read_local_data()),1); + array.batch_add(&input_array.blocking_read_local_data(),1); check_results!($array,array,num_pes,"&LocalLockArray"); println!("passed &LocalLockArray"); @@ -447,7 +447,7 @@ macro_rules! 
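The test hunks in this file (and the matching fetch_add changes that follow) swap world.block_on(input_array.read_local_data()) for the new blocking_read_local_data() accessor, keeping the test bodies synchronous. For context, the two forms are intended as async and blocking counterparts; a hedged sketch mirroring the test line, assuming read_local_data() still returns an awaitable handle as it did before this change:

    // alternative 1, from an async context:
    let _req = array.batch_add(&input_array.read_local_data().await, 1);

    // alternative 2, from synchronous test code (the form used in these tests):
    let _req = array.batch_add(&input_array.blocking_read_local_data(), 1);
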
input_test{ // array.add(input_array.clone(),1); // check_results!($array,array,num_pes,"GlobalLockArray"); // GlobalLockArray------------------------------ - array.batch_add(&world.block_on(input_array.read_local_data()),1); + array.batch_add(&input_array.blocking_read_local_data(),1); check_results!($array,array,num_pes,"&GlobalLockArray"); println!("passed &GlobalLockArray"); } diff --git a/tests/array/arithmetic_ops/fetch_add_test.rs b/tests/array/arithmetic_ops/fetch_add_test.rs index 4ab2b23e..6c91e0fc 100644 --- a/tests/array/arithmetic_ops/fetch_add_test.rs +++ b/tests/array/arithmetic_ops/fetch_add_test.rs @@ -539,7 +539,7 @@ macro_rules! input_test{ // check_results!($array,array,num_pes,reqs,"LocalLockArray"); // LocalLockArray------------------------------ let mut reqs = vec![]; - reqs.push(array.batch_fetch_add(&world.block_on(input_array.read_local_data()),1)); + reqs.push(array.batch_fetch_add(&input_array.blocking_read_local_data(),1)); check_results!($array,array,num_pes,reqs,"&LocalLockArray"); // GlobalLockArray------------------------------ @@ -549,7 +549,7 @@ macro_rules! input_test{ // check_results!($array,array,num_pes,reqs,"GlobalLockArray"); // GlobalLockArray------------------------------ let mut reqs = vec![]; - reqs.push(array.batch_fetch_add(&world.block_on(input_array.read_local_data()),1)); + reqs.push(array.batch_fetch_add(&input_array.blocking_read_local_data(),1)); check_results!($array,array,num_pes,reqs,"&GlobalLockArray"); } } From 6372f0c860d3a59184582b4300c495f43f9313c9 Mon Sep 17 00:00:00 2001 From: "ryan.friese@pnnl.gov" Date: Thu, 8 Feb 2024 10:48:02 -0800 Subject: [PATCH 002/116] refactoring internal block_on calls to async calls --- impl/src/array_ops.rs | 18 +- impl/src/array_reduce.rs | 2 +- src/array.rs | 68 ++++ src/array/atomic.rs | 22 +- src/array/generic_atomic.rs | 30 +- src/array/generic_atomic/iteration.rs | 12 +- src/array/global_lock_atomic.rs | 24 +- src/array/global_lock_atomic/iteration.rs | 12 +- src/array/iterator/distributed_iterator.rs | 182 ++++----- .../distributed_iterator/consumer/collect.rs | 35 +- src/array/iterator/local_iterator.rs | 10 +- .../local_iterator/consumer/collect.rs | 24 +- src/array/local_lock_atomic.rs | 24 +- src/array/local_lock_atomic/iteration.rs | 12 +- src/array/local_only.rs | 11 + src/array/native_atomic.rs | 26 +- src/array/native_atomic/iteration.rs | 12 +- src/array/read_only.rs | 22 +- src/array/read_only/iteration.rs | 12 +- src/array/unsafe.rs | 168 +++++++- src/array/unsafe/iteration/distributed.rs | 10 +- src/array/unsafe/iteration/local.rs | 6 +- src/darc.rs | 374 +++++++++--------- src/lamellar_task_group.rs | 21 + src/scheduler.rs | 10 +- src/scheduler/tokio.rs | 88 ----- 26 files changed, 778 insertions(+), 457 deletions(-) delete mode 100644 src/scheduler/tokio.rs diff --git a/impl/src/array_ops.rs b/impl/src/array_ops.rs index 125f93dd..8f6d2545 100644 --- a/impl/src/array_ops.rs +++ b/impl/src/array_ops.rs @@ -897,7 +897,7 @@ fn create_buf_ops2( #[allow(non_snake_case)] fn #dist_multi_val_multi_idx_am_buf_name(array: #lamellar::array::LamellarByteArray, op: #lamellar::array::ArrayOpCmd>, idx_vals: Vec, index_size: u8) -> Arc{ Arc::new(#multi_val_multi_idx_am_buf_name{ - data: array.into(), + data: Into::into(array), op: op.into(), idx_vals: idx_vals, index_size: index_size, @@ -964,7 +964,7 @@ fn create_buf_ops2( let val_slice = unsafe {std::slice::from_raw_parts(val.as_ptr() as *const #typeident, std::mem::size_of::<#typeident>())}; let val = val_slice[0]; 
Arc::new(#single_val_multi_idx_am_buf_name{ - data: array.into(), + data: Into::into(array), op: op.into(), val: val, indices: indicies, @@ -1001,7 +1001,7 @@ fn create_buf_ops2( #[allow(non_snake_case)] fn #dist_multi_val_single_idx_am_buf_name(array: #lamellar::array::LamellarByteArray, op: #lamellar::array::ArrayOpCmd>, vals: Vec, index: usize) -> Arc{ Arc::new(#multi_val_single_idx_am_buf_name{ - data: array.into(), + data: Into::into(array), op: op.into(), vals: vals, index: index, @@ -1070,7 +1070,7 @@ fn create_buf_ops2( #[allow(non_snake_case)] fn #dist_multi_val_multi_idx_am_buf_result_name(array: #lamellar::array::LamellarByteArray, op: #lamellar::array::ArrayOpCmd>, idx_vals: Vec, index_size: u8) -> Arc{ Arc::new(#multi_val_multi_idx_am_buf_result_name{ - data: array.into(), + data: Into::into(array), op: op.into(), idx_vals: idx_vals, index_size: index_size, @@ -1139,7 +1139,7 @@ fn create_buf_ops2( let val_slice = unsafe {std::slice::from_raw_parts(val.as_ptr() as *const #typeident, std::mem::size_of::<#typeident>())}; let val = val_slice[0]; Arc::new(#single_val_multi_idx_am_buf_result_name{ - data: array.into(), + data: Into::into(array), op: op.into(), val: val, indices: indicies, @@ -1178,7 +1178,7 @@ fn create_buf_ops2( #[allow(non_snake_case)] fn #dist_multi_val_single_idx_am_buf_result_name(array: #lamellar::array::LamellarByteArray, op: #lamellar::array::ArrayOpCmd>, vals: Vec, index: usize) -> Arc{ Arc::new(#multi_val_single_idx_am_buf_result_name{ - data: array.into(), + data: Into::into(array), op: op.into(), vals: vals, index: index, @@ -1251,7 +1251,7 @@ fn create_buf_ops2( #[allow(non_snake_case)] fn #dist_multi_val_multi_idx_am_buf_fetch_name(array: #lamellar::array::LamellarByteArray, op: #lamellar::array::ArrayOpCmd>, idx_vals: Vec,index_usize: u8) -> Arc{ Arc::new(#multi_val_multi_idx_am_buf_fetch_name{ - data: array.into(), + data: Into::into(array), op: op.into(), idx_vals: idx_vals, index_size: index_usize, @@ -1320,7 +1320,7 @@ fn create_buf_ops2( let val_slice = unsafe {std::slice::from_raw_parts(val.as_ptr() as *const #typeident, std::mem::size_of::<#typeident>())}; let val = val_slice[0]; Arc::new(#single_val_multi_idx_am_buf_fetch_name{ - data: array.into(), + data: Into::into(array), op: op.into(), val: val, indices: indicies, @@ -1363,7 +1363,7 @@ fn create_buf_ops2( #[allow(non_snake_case)] fn #dist_multi_val_single_idx_am_buf_fetch_name(array: #lamellar::array::LamellarByteArray, op: #lamellar::array::ArrayOpCmd>, vals: Vec, index: usize) -> Arc{ Arc::new(#multi_val_single_idx_am_buf_fetch_name{ - data: array.into(), + data: Into::into(array), op: op.into(), vals: vals, index: index, diff --git a/impl/src/array_reduce.rs b/impl/src/array_reduce.rs index d059f96f..ee7629e6 100644 --- a/impl/src/array_reduce.rs +++ b/impl/src/array_reduce.rs @@ -49,7 +49,7 @@ fn create_reduction( gen_match_stmts.extend(quote!{ #lamellar::array::LamellarByteArray::#array_type(inner) => std::sync::Arc::new(#reduction_name{ - data: unsafe {inner.clone().into()} , start_pe: 0, end_pe: num_pes-1}), + data: unsafe {Into::into(inner.clone())} , start_pe: 0, end_pe: num_pes-1}), }); let iter_chain = if array_type == "AtomicArray" diff --git a/src/array.rs b/src/array.rs index 5fe5eed4..386a2aaa 100644 --- a/src/array.rs +++ b/src/array.rs @@ -503,12 +503,63 @@ impl TeamTryFrom<(&Vec, Distribution)> for Vec { } } +#[async_trait] +/// Provides the same abstraction as the `From` trait in the standard language, but with a `team` parameter so that lamellar memory regions can be 
allocated +/// and to be used within an async context +pub(crate) trait AsyncInto: Sized { + async fn async_into(self) -> T; +} + +#[async_trait] +/// Provides the same abstraction as the `From` trait in the standard language, but with a `team` parameter so that lamellar memory regions can be allocated +/// and to be used within an async context +pub(crate) trait AsyncFrom: Sized { + async fn async_from(val: T) -> Self; +} + +// AsyncFrom implies AsyncInto +#[async_trait] +impl AsyncInto for T +where + T: Send, + U: AsyncFrom, +{ + /// Calls `U::from(self).await`. + /// + /// That is, this conversion is whatever the implementation of + /// [AsyncFrom]<T> for U chooses to do. + #[inline] + async fn async_into(self) -> U { + U::async_from(self).await + } +} + +// AsyncFrom (and thus Into) is reflexive +// #[async_trait] +// impl AsyncFrom for T +// where +// T: Send, +// { +// /// Returns the argument unchanged. +// #[inline(always)] +// async fn async_from(t: T) -> T { +// t +// } +// } + /// Provides the same abstraction as the `From` trait in the standard language, but with a `team` parameter so that lamellar memory regions can be allocated pub trait TeamFrom { /// Converts to this type from the input type fn team_from(val: T, team: &Pin>) -> Self; } +#[async_trait] +/// Provides the same abstraction as the `From` trait in the standard language, but with a `team` parameter so that lamellar memory regions can be allocated +/// and to be used within an async context +pub trait AsyncTeamFrom: TeamFrom { + async fn team_from(val: T, team: &Pin>) -> Self; +} + /// Provides the same abstraction as the `TryFrom` trait in the standard language, but with a `team` parameter so that lamellar memory regions can be allocated pub trait TeamTryFrom { /// Trys to convert to this type from the input type @@ -522,6 +573,13 @@ pub trait TeamInto { fn team_into(self, team: &Pin>) -> T; } +/// Provides the same abstraction as the `Into` trait in the standard language, but with a `team` parameter so that lamellar memory regions can be allocated to be used within an async context +#[async_trait] +pub trait AsyncTeamInto { + /// converts this type into the (usually inferred) input type + async fn team_into(self, team: &Pin>) -> T; +} + /// Provides the same abstraction as the `TryInto` trait in the standard language, but with a `team` parameter so that lamellar memory regions can be allocated pub trait TeamTryInto: Sized { @@ -538,6 +596,16 @@ where } } +#[async_trait] +impl AsyncTeamInto for T +where + U: AsyncTeamFrom, +{ + async fn team_into(self, team: &Pin>) -> U { + >::team_from(self, team).await + } +} + impl TeamTryInto for T where U: TeamTryFrom, diff --git a/src/array/atomic.rs b/src/array/atomic.rs index 646288ed..493ebd28 100644 --- a/src/array/atomic.rs +++ b/src/array/atomic.rs @@ -1080,11 +1080,19 @@ impl TeamFrom<(Vec, Distribution)> for AtomicArray { fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { let (vals, distribution) = input; let input = (&vals, distribution); - let array: UnsafeArray = input.team_into(team); + let array: UnsafeArray = TeamInto::team_into(input, team); array.into() } } +#[async_trait] +impl AsyncTeamFrom<(Vec, Distribution)> for AtomicArray { + async fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { + let array: UnsafeArray = AsyncTeamInto::team_into(input, team).await; + array.async_into().await + } +} + impl From> for AtomicArray { fn from(array: UnsafeArray) -> Self { // println!("Converting from UnsafeArray to AtomicArray"); @@ -1096,6 
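The AsyncFrom/AsyncInto pair introduced in src/array.rs above mirrors std's From/Into, but the conversion itself is a future, which is what lets the AtomicArray, GenericAtomicArray, and GlobalLockArray conversions in this patch await outstanding operations (await_on_outstanding) instead of blocking. A toy, illustrative sketch of the mechanics; the traits are pub(crate), so this only makes sense inside the crate, and the temperature types are hypothetical:

    use async_trait::async_trait;
    // AsyncFrom / AsyncInto here are the crate-internal traits defined above

    struct Celsius(f64);
    struct Fahrenheit(f64);

    #[async_trait]
    impl AsyncFrom<Celsius> for Fahrenheit {
        async fn async_from(c: Celsius) -> Self {
            // a real impl (e.g. AsyncFrom<UnsafeArray<T>>) would await outstanding work here
            Fahrenheit(c.0 * 9.0 / 5.0 + 32.0)
        }
    }

    async fn demo(c: Celsius) -> Fahrenheit {
        // provided for free by the blanket impl of AsyncInto<U> for T where U: AsyncFrom<T>
        c.async_into().await
    }
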
+1104,18 @@ impl From> for AtomicArray { } } +#[async_trait] +impl AsyncFrom> for AtomicArray { + async fn async_from(array: UnsafeArray) -> Self { + // println!("Converting from UnsafeArray to AtomicArray"); + if NATIVE_ATOMICS.contains(&TypeId::of::()) { + NativeAtomicArray::async_from(array).await.into() + } else { + GenericAtomicArray::async_from(array).await.into() + } + } +} + // impl From> for AtomicArray { // fn from(array: LocalOnlyArray) -> Self { // // println!("Converting from LocalOnlyArray to AtomicArray"); diff --git a/src/array/generic_atomic.rs b/src/array/generic_atomic.rs index 382059a4..e051719b 100644 --- a/src/array/generic_atomic.rs +++ b/src/array/generic_atomic.rs @@ -580,11 +580,19 @@ impl TeamFrom<(Vec, Distribution)> for GenericAtomicArray fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { let (vals, distribution) = input; let input = (&vals, distribution); - let array: UnsafeArray = input.team_into(team); + let array: UnsafeArray = TeamInto::team_into(input, team); array.into() } } +#[async_trait] +impl AsyncTeamFrom<(Vec, Distribution)> for GenericAtomicArray { + async fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { + let array: UnsafeArray = AsyncTeamInto::team_into(input, team).await; + array.async_into().await + } +} + impl From> for GenericAtomicArray { fn from(array: UnsafeArray) -> Self { // println!("generic from unsafe array"); @@ -602,6 +610,26 @@ impl From> for GenericAtomicArray { } } +#[async_trait] +impl AsyncFrom> for GenericAtomicArray { + async fn async_from(array: UnsafeArray) -> Self { + // println!("generic from unsafe array"); + array + .await_on_outstanding(DarcMode::GenericAtomicArray) + .await; + let mut vec = vec![]; + for _i in 0..array.num_elems_local() { + vec.push(Mutex::new(())); + } + let locks = Darc::new(array.team_rt(), vec).unwrap(); + + GenericAtomicArray { + locks: locks, + array: array, + } + } +} + impl From> for GenericAtomicByteArray { fn from(array: GenericAtomicArray) -> Self { GenericAtomicByteArray { diff --git a/src/array/generic_atomic/iteration.rs b/src/array/generic_atomic/iteration.rs index 6f5bfbe1..73980420 100644 --- a/src/array/generic_atomic/iteration.rs +++ b/src/array/generic_atomic/iteration.rs @@ -297,7 +297,7 @@ impl DistIteratorLauncher for GenericAtomicArray { where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect(&self.array, iter, d) } @@ -311,7 +311,7 @@ impl DistIteratorLauncher for GenericAtomicArray { where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) } @@ -324,7 +324,7 @@ impl DistIteratorLauncher for GenericAtomicArray { I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect_async(&self.array, iter, d) } @@ -339,7 +339,7 @@ impl DistIteratorLauncher for GenericAtomicArray { I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: 
AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect_async_with_schedule(&self.array, sched, iter, d) } @@ -485,7 +485,7 @@ impl LocalIteratorLauncher for GenericAtomicArray { where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { LocalIteratorLauncher::collect(&self.array, iter, d) } @@ -499,7 +499,7 @@ impl LocalIteratorLauncher for GenericAtomicArray { where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { LocalIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) } diff --git a/src/array/global_lock_atomic.rs b/src/array/global_lock_atomic.rs index 6b9ff9ef..48bf357b 100644 --- a/src/array/global_lock_atomic.rs +++ b/src/array/global_lock_atomic.rs @@ -664,11 +664,19 @@ impl TeamFrom<(Vec, Distribution)> for GlobalLockArray fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { let (vals, distribution) = input; let input = (&vals, distribution); - let array: UnsafeArray = input.team_into(team); + let array: UnsafeArray = TeamInto::team_into(input, team); array.into() } } +#[async_trait] +impl AsyncTeamFrom<(Vec, Distribution)> for GlobalLockArray { + async fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { + let array: UnsafeArray = AsyncTeamInto::team_into(input, team).await; + array.async_into().await + } +} + impl From> for GlobalLockArray { fn from(array: UnsafeArray) -> Self { // println!("GlobalLock from unsafe"); @@ -682,6 +690,20 @@ impl From> for GlobalLockArray { } } +#[async_trait] +impl AsyncFrom> for GlobalLockArray { + async fn async_from(array: UnsafeArray) -> Self { + // println!("GlobalLock from unsafe"); + array.await_on_outstanding(DarcMode::GlobalLockArray).await; + let lock = GlobalRwDarc::new(array.team_rt(), ()).unwrap(); + + GlobalLockArray { + lock: lock, + array: array, + } + } +} + // impl From> for GlobalLockArray { // fn from(array: LocalOnlyArray) -> Self { // // println!("GlobalLock from localonly"); diff --git a/src/array/global_lock_atomic/iteration.rs b/src/array/global_lock_atomic/iteration.rs index 70c4db61..37d5d25a 100644 --- a/src/array/global_lock_atomic/iteration.rs +++ b/src/array/global_lock_atomic/iteration.rs @@ -453,7 +453,7 @@ impl DistIteratorLauncher for GlobalLockArray { where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect(&self.array, iter, d) } @@ -467,7 +467,7 @@ impl DistIteratorLauncher for GlobalLockArray { where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) } @@ -480,7 +480,7 @@ impl DistIteratorLauncher for GlobalLockArray { I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect_async(&self.array, iter, d) } @@ -495,7 +495,7 @@ 
impl DistIteratorLauncher for GlobalLockArray { I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect_async_with_schedule(&self.array, sched, iter, d) } @@ -641,7 +641,7 @@ impl LocalIteratorLauncher for GlobalLockArray { where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { LocalIteratorLauncher::collect(&self.array, iter, d) } @@ -655,7 +655,7 @@ impl LocalIteratorLauncher for GlobalLockArray { where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { LocalIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) } diff --git a/src/array/iterator/distributed_iterator.rs b/src/array/iterator/distributed_iterator.rs index 84f457b9..d41fdf1f 100644 --- a/src/array/iterator/distributed_iterator.rs +++ b/src/array/iterator/distributed_iterator.rs @@ -39,8 +39,8 @@ pub(crate) use consumer::*; use crate::array::iterator::one_sided_iterator::OneSidedIterator; use crate::array::iterator::{IterRequest, Schedule}; use crate::array::{ - operations::ArrayOps, AtomicArray, Distribution, GenericAtomicArray, LamellarArray, - LamellarArrayPut, NativeAtomicArray, TeamFrom, UnsafeArray, + operations::ArrayOps, AsyncTeamFrom, AtomicArray, Distribution, GenericAtomicArray, + LamellarArray, LamellarArrayPut, NativeAtomicArray, TeamFrom, UnsafeArray, }; use crate::lamellar_request::LamellarRequest; use crate::memregion::Dist; @@ -55,10 +55,10 @@ use std::marker::PhantomData; use std::pin::Pin; use std::sync::Arc; -#[doc(hidden)] -pub struct DistIterForEachHandle { - pub(crate) reqs: Vec>>, -} +// #[doc(hidden)] +// pub struct DistIterForEachHandle { +// pub(crate) reqs: Vec>>, +// } // impl Drop for DistIterForEachHandle { // fn drop(&mut self) { @@ -66,87 +66,87 @@ pub struct DistIterForEachHandle { // } // } -#[doc(hidden)] -#[async_trait] -impl IterRequest for DistIterForEachHandle { - type Output = (); - async fn into_future(mut self: Box) -> Self::Output { - for req in self.reqs.drain(..) { - req.into_future().await; - } - } - fn wait(mut self: Box) -> Self::Output { - for req in self.reqs.drain(..) { - req.get(); - } - } -} - -#[doc(hidden)] -pub struct DistIterCollectHandle> + SyncSend> { - pub(crate) reqs: Vec>>>, - pub(crate) distribution: Distribution, - pub(crate) team: Pin>, - pub(crate) _phantom: PhantomData, -} +// #[doc(hidden)] +// #[async_trait] +// impl IterRequest for DistIterForEachHandle { +// type Output = (); +// async fn into_future(mut self: Box) -> Self::Output { +// for req in self.reqs.drain(..) { +// req.into_future().await; +// } +// } +// fn wait(mut self: Box) -> Self::Output { +// for req in self.reqs.drain(..) 
{ +// req.get(); +// } +// } +// } -impl> + SyncSend> DistIterCollectHandle { - fn create_array(&self, local_vals: &Vec) -> A { - self.team.tasking_barrier(); - let local_sizes = - UnsafeArray::::new(self.team.clone(), self.team.num_pes, Distribution::Block); - unsafe { - local_sizes.local_as_mut_slice()[0] = local_vals.len(); - } - local_sizes.barrier(); - // local_sizes.print(); - let mut size = 0; - let mut my_start = 0; - let my_pe = self.team.team_pe.expect("pe not part of team"); - // local_sizes.print(); - unsafe { - local_sizes - .onesided_iter() - .into_iter() - .enumerate() - .for_each(|(i, local_size)| { - size += local_size; - if i < my_pe { - my_start += local_size; - } - }); - } - // println!("my_start {} size {}", my_start, size); - let array = UnsafeArray::::new(self.team.clone(), size, self.distribution); //implcit barrier +// #[doc(hidden)] +// pub struct DistIterCollectHandle> + SyncSend> { +// pub(crate) reqs: Vec>>>, +// pub(crate) distribution: Distribution, +// pub(crate) team: Pin>, +// pub(crate) _phantom: PhantomData, +// } - // safe because only a single reference to array on each PE - // we calculate my_start so that each pes local vals are guaranteed to not overwrite another pes values. - unsafe { array.put(my_start, local_vals) }; - array.into() - } -} -#[async_trait] -impl> + SyncSend> IterRequest - for DistIterCollectHandle -{ - type Output = A; - async fn into_future(mut self: Box) -> Self::Output { - let mut local_vals = vec![]; - for req in self.reqs.drain(0..) { - let v = req.into_future().await; - local_vals.extend(v); - } - self.create_array(&local_vals) - } - fn wait(mut self: Box) -> Self::Output { - let mut local_vals = vec![]; - for req in self.reqs.drain(0..) { - let v = req.get(); - local_vals.extend(v); - } - self.create_array(&local_vals) - } -} +// impl> + SyncSend> DistIterCollectHandle { +// fn create_array(&self, local_vals: &Vec) -> A { +// self.team.tasking_barrier(); +// let local_sizes = +// UnsafeArray::::new(self.team.clone(), self.team.num_pes, Distribution::Block); +// unsafe { +// local_sizes.local_as_mut_slice()[0] = local_vals.len(); +// } +// local_sizes.barrier(); +// // local_sizes.print(); +// let mut size = 0; +// let mut my_start = 0; +// let my_pe = self.team.team_pe.expect("pe not part of team"); +// // local_sizes.print(); +// unsafe { +// local_sizes +// .onesided_iter() +// .into_iter() +// .enumerate() +// .for_each(|(i, local_size)| { +// size += local_size; +// if i < my_pe { +// my_start += local_size; +// } +// }); +// } +// // println!("my_start {} size {}", my_start, size); +// let array = UnsafeArray::::new(self.team.clone(), size, self.distribution); //implcit barrier + +// // safe because only a single reference to array on each PE +// // we calculate my_start so that each pes local vals are guaranteed to not overwrite another pes values. +// unsafe { array.put(my_start, local_vals) }; +// array.into() +// } +// } +// #[async_trait] +// impl> + SyncSend> IterRequest +// for DistIterCollectHandle +// { +// type Output = A; +// async fn into_future(mut self: Box) -> Self::Output { +// let mut local_vals = vec![]; +// for req in self.reqs.drain(0..) { +// let v = req.into_future().await; +// local_vals.extend(v); +// } +// self.create_array(&local_vals) +// } +// fn wait(mut self: Box) -> Self::Output { +// let mut local_vals = vec![]; +// for req in self.reqs.drain(0..) 
{ +// let v = req.get(); +// local_vals.extend(v); +// } +// self.create_array(&local_vals) +// } +// } #[doc(hidden)] #[enum_dispatch] @@ -212,7 +212,7 @@ pub trait DistIteratorLauncher { where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static; + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static; fn collect_with_schedule( &self, @@ -223,7 +223,7 @@ pub trait DistIteratorLauncher { where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static; + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static; fn collect_async( &self, @@ -234,7 +234,7 @@ pub trait DistIteratorLauncher { I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static; + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static; fn collect_async_with_schedule( &self, @@ -246,7 +246,7 @@ pub trait DistIteratorLauncher { I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static; + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static; fn count(&self, iter: &I) -> Pin + Send>> where @@ -670,7 +670,7 @@ pub trait DistributedIterator: SyncSend + Clone + 'static { where // &'static Self: DistributedIterator + 'static, Self::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { self.array().collect(self, d) } @@ -716,7 +716,7 @@ pub trait DistributedIterator: SyncSend + Clone + 'static { // &'static Self: DistributedIterator + 'static, T: Dist + ArrayOps, Self::Item: Future + Send + 'static, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { self.array().collect_async(self, d) } diff --git a/src/array/iterator/distributed_iterator/consumer/collect.rs b/src/array/iterator/distributed_iterator/consumer/collect.rs index a9ec30b4..31486893 100644 --- a/src/array/iterator/distributed_iterator/consumer/collect.rs +++ b/src/array/iterator/distributed_iterator/consumer/collect.rs @@ -3,7 +3,7 @@ use crate::array::iterator::consumer::*; use crate::array::iterator::distributed_iterator::{DistributedIterator, Monotonic}; use crate::array::iterator::IterRequest; use crate::array::operations::ArrayOps; -use crate::array::{Distribution, TeamFrom, TeamInto}; +use crate::array::{AsyncTeamFrom, AsyncTeamInto, Distribution, TeamFrom, TeamInto}; use crate::lamellar_request::LamellarRequest; use crate::lamellar_team::LamellarTeamRT; use crate::memregion::Dist; @@ -25,7 +25,7 @@ impl IterConsumer for Collect where I: DistributedIterator, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { type AmOutput = Vec<(usize, I::Item)>; type Output = A; @@ -75,7 +75,7 @@ where I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { type AmOutput = Vec<(usize, B)>; type Output = A; @@ -118,7 +118,7 @@ where I: DistributedIterator, I::Item: Future + Send + 
'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { fn clone(&self) -> Self { CollectAsync { @@ -132,7 +132,7 @@ where #[doc(hidden)] pub struct DistIterCollectHandle< T: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend, > { pub(crate) reqs: Vec>>>, pub(crate) distribution: Distribution, @@ -140,16 +140,23 @@ pub struct DistIterCollectHandle< pub(crate) _phantom: PhantomData, } -impl TeamFrom<(&'a Vec, Distribution)> + SyncSend> +impl, Distribution)> + SyncSend> DistIterCollectHandle { - fn create_array(&self, local_vals: &Vec) -> A { + async fn async_create_array(&self, local_vals: Vec) -> A { let input = (local_vals, self.distribution); - input.team_into(&self.team) + let array: A = AsyncTeamInto::team_into(input, &self.team).await; + array + } + + fn create_array(&self, local_vals: Vec) -> A { + let input = (local_vals, self.distribution); + let array: A = TeamInto::team_into(input, &self.team); + array } } #[async_trait] -impl TeamFrom<(&'a Vec, Distribution)> + SyncSend> IterRequest +impl, Distribution)> + SyncSend> IterRequest for DistIterCollectHandle { type Output = A; @@ -161,7 +168,7 @@ impl TeamFrom<(&'a Vec, Distribution)> + SyncS } temp_vals.sort_by(|a, b| a.0.cmp(&b.0)); let local_vals = temp_vals.into_iter().map(|v| v.1).collect::>(); - self.create_array(&local_vals) + self.async_create_array(local_vals).await } fn wait(mut self: Box) -> Self::Output { // let mut num_local_vals = 0; @@ -172,7 +179,7 @@ impl TeamFrom<(&'a Vec, Distribution)> + SyncS } temp_vals.sort_by(|a, b| a.0.cmp(&b.0)); let local_vals = temp_vals.into_iter().map(|v| v.1).collect(); - self.create_array(&local_vals) + self.create_array(local_vals) } } @@ -187,7 +194,7 @@ impl LamellarAm for CollectAm where I: DistributedIterator, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { async fn exec(&self) -> Vec { let iter = self.schedule.init_iter(self.iter.clone()); @@ -201,7 +208,7 @@ where I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { pub(crate) iter: CollectAsync, pub(crate) schedule: IterSchedule, @@ -213,7 +220,7 @@ where I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { async fn exec(&self) -> Vec<(usize, B)> { let mut iter = self.schedule.init_iter(self.iter.clone()); diff --git a/src/array/iterator/local_iterator.rs b/src/array/iterator/local_iterator.rs index 7b177843..ff857846 100644 --- a/src/array/iterator/local_iterator.rs +++ b/src/array/iterator/local_iterator.rs @@ -35,7 +35,7 @@ use zip::*; pub(crate) use consumer::*; use crate::array::iterator::Schedule; -use crate::array::{operations::ArrayOps, AtomicArray, Distribution, LamellarArray, TeamFrom}; +use crate::array::{operations::ArrayOps, AsyncTeamFrom, AtomicArray, Distribution, LamellarArray}; use crate::memregion::Dist; use crate::LamellarTeamRT; @@ -125,7 +125,7 @@ pub trait LocalIteratorLauncher { where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, - A: 
for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static; + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static; fn collect_with_schedule( &self, @@ -136,7 +136,7 @@ pub trait LocalIteratorLauncher { where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static; + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static; // fn collect_async( // &self, @@ -579,7 +579,7 @@ pub trait LocalIterator: SyncSend + Clone + 'static { where // &'static Self: LocalIterator + 'static, Self::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { self.array().collect(self, d) } @@ -607,7 +607,7 @@ pub trait LocalIterator: SyncSend + Clone + 'static { where // &'static Self: LocalIterator + 'static, Self::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { self.array().collect_with_schedule(sched, self, d) } diff --git a/src/array/iterator/local_iterator/consumer/collect.rs b/src/array/iterator/local_iterator/consumer/collect.rs index 0aabcade..df16f948 100644 --- a/src/array/iterator/local_iterator/consumer/collect.rs +++ b/src/array/iterator/local_iterator/consumer/collect.rs @@ -3,7 +3,7 @@ use crate::array::iterator::consumer::*; use crate::array::iterator::local_iterator::{LocalIterator, Monotonic}; use crate::array::iterator::IterRequest; use crate::array::operations::ArrayOps; -use crate::array::{Distribution, TeamFrom, TeamInto}; +use crate::array::{AsyncTeamFrom, AsyncTeamInto, Distribution, TeamFrom, TeamInto}; use crate::lamellar_request::LamellarRequest; use crate::lamellar_team::LamellarTeamRT; use crate::memregion::Dist; @@ -24,7 +24,7 @@ impl IterConsumer for Collect where I: LocalIterator, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { type AmOutput = Vec<(usize, I::Item)>; type Output = A; @@ -114,7 +114,7 @@ where #[doc(hidden)] pub struct LocalIterCollectHandle< T: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend, > { pub(crate) reqs: Vec>>>, pub(crate) distribution: Distribution, @@ -122,16 +122,20 @@ pub struct LocalIterCollectHandle< pub(crate) _phantom: PhantomData, } -impl TeamFrom<(&'a Vec, Distribution)> + SyncSend> +impl, Distribution)> + SyncSend> LocalIterCollectHandle { - fn create_array(&self, local_vals: &Vec) -> A { + async fn async_create_array(&self, local_vals: Vec) -> A { let input = (local_vals, self.distribution); - input.team_into(&self.team) + AsyncTeamInto::team_into(input, &self.team).await + } + fn create_array(&self, local_vals: Vec) -> A { + let input = (local_vals, self.distribution); + TeamInto::team_into(input, &self.team) } } #[async_trait] -impl TeamFrom<(&'a Vec, Distribution)> + SyncSend> IterRequest +impl, Distribution)> + SyncSend> IterRequest for LocalIterCollectHandle { type Output = A; @@ -143,7 +147,7 @@ impl TeamFrom<(&'a Vec, Distribution)> + SyncS } temp_vals.sort_by(|a, b| a.0.cmp(&b.0)); let local_vals = temp_vals.into_iter().map(|v| v.1).collect(); - self.create_array(&local_vals) + self.async_create_array(local_vals).await } fn wait(mut self: Box) -> Self::Output { // let mut num_local_vals = 0; 
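The collect-handle hunks above and below swap the blocking for<'a> TeamFrom<(&'a Vec<T>, Distribution)> bound for the new AsyncTeamFrom<(Vec<T>, Distribution)>, so the into_future path can build the output array by awaiting instead of blocking a worker thread, while wait keeps the synchronous conversion. A minimal, self-contained sketch of that split follows; the Team, Distribution, and CollectHandle types here are simplified stand-ins rather than the crate's real definitions, and the async-trait crate is assumed:

use std::marker::PhantomData;
use async_trait::async_trait;

// Simplified stand-ins; the real methods take a &Pin<Arc<LamellarTeamRT>>.
#[derive(Clone, Copy)]
pub enum Distribution { Block, Cyclic }
pub struct Team;

// Blocking construction of a result container from locally collected values.
pub trait TeamFrom<I> {
    fn team_from(input: I, team: &Team) -> Self;
}

// Async construction: may await barriers/puts internally instead of blocking.
#[async_trait]
pub trait AsyncTeamFrom<I>: Sized {
    async fn team_from(input: I, team: &Team) -> Self;
}

// A collect handle keeps the distribution and team so it can build the output
// either on the async (into_future) path or on the blocking (wait) path.
pub struct CollectHandle<T, A> {
    distribution: Distribution,
    team: Team,
    _marker: PhantomData<(T, A)>,
}

impl<T, A> CollectHandle<T, A>
where
    T: Send + 'static,
    A: AsyncTeamFrom<(Vec<T>, Distribution)> + TeamFrom<(Vec<T>, Distribution)> + 'static,
{
    // into_future path: await the async conversion.
    async fn async_create_array(&self, local_vals: Vec<T>) -> A {
        <A as AsyncTeamFrom<(Vec<T>, Distribution)>>::team_from(
            (local_vals, self.distribution),
            &self.team,
        )
        .await
    }

    // wait path: fall back to the blocking conversion.
    fn create_array(&self, local_vals: Vec<T>) -> A {
        <A as TeamFrom<(Vec<T>, Distribution)>>::team_from(
            (local_vals, self.distribution),
            &self.team,
        )
    }
}

Keeping both paths lets wait() stay usable outside an async context, while into_future() composes with whichever executor backend (work-stealing or tokio) this series makes the scheduler delegate to.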
@@ -154,7 +158,7 @@ impl TeamFrom<(&'a Vec, Distribution)> + SyncS } temp_vals.sort_by(|a, b| a.0.cmp(&b.0)); let local_vals = temp_vals.into_iter().map(|v| v.1).collect(); - self.create_array(&local_vals) + self.create_array(local_vals) } } @@ -169,7 +173,7 @@ impl LamellarAm for CollectAm where I: LocalIterator, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { async fn exec(&self) -> Vec { let iter = self.schedule.init_iter(self.iter.clone()); diff --git a/src/array/local_lock_atomic.rs b/src/array/local_lock_atomic.rs index d897e922..f6ce6559 100644 --- a/src/array/local_lock_atomic.rs +++ b/src/array/local_lock_atomic.rs @@ -578,11 +578,19 @@ impl TeamFrom<(Vec, Distribution)> for LocalLockArray fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { let (vals, distribution) = input; let input = (&vals, distribution); - let array: UnsafeArray = input.team_into(team); + let array: UnsafeArray = TeamInto::team_into(input, team); array.into() } } +#[async_trait] +impl AsyncTeamFrom<(Vec, Distribution)> for LocalLockArray { + async fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { + let array: UnsafeArray = AsyncTeamInto::team_into(input, team).await; + array.async_into().await + } +} + impl From> for LocalLockArray { fn from(array: UnsafeArray) -> Self { // println!("locallock from unsafe"); @@ -596,6 +604,20 @@ impl From> for LocalLockArray { } } +#[async_trait] +impl AsyncFrom> for LocalLockArray { + async fn async_from(array: UnsafeArray) -> Self { + // println!("locallock from unsafe"); + array.await_on_outstanding(DarcMode::LocalLockArray).await; + let lock = LocalRwDarc::new(array.team_rt(), ()).unwrap(); + + LocalLockArray { + lock: lock, + array: array, + } + } +} + // impl From> for LocalLockArray { // fn from(array: LocalOnlyArray) -> Self { // // println!("locallock from localonly"); diff --git a/src/array/local_lock_atomic/iteration.rs b/src/array/local_lock_atomic/iteration.rs index a1d4479c..53e26ed6 100644 --- a/src/array/local_lock_atomic/iteration.rs +++ b/src/array/local_lock_atomic/iteration.rs @@ -457,7 +457,7 @@ impl DistIteratorLauncher for LocalLockArray { where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect(&self.array, iter, d) } @@ -471,7 +471,7 @@ impl DistIteratorLauncher for LocalLockArray { where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) } @@ -484,7 +484,7 @@ impl DistIteratorLauncher for LocalLockArray { I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect_async(&self.array, iter, d) } @@ -499,7 +499,7 @@ impl DistIteratorLauncher for LocalLockArray { I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { 
DistIteratorLauncher::collect_async_with_schedule(&self.array, sched, iter, d) } @@ -645,7 +645,7 @@ impl LocalIteratorLauncher for LocalLockArray { where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { LocalIteratorLauncher::collect(&self.array, iter, d) } @@ -659,7 +659,7 @@ impl LocalIteratorLauncher for LocalLockArray { where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { LocalIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) } diff --git a/src/array/local_only.rs b/src/array/local_only.rs index 5b931ad2..258d7856 100644 --- a/src/array/local_only.rs +++ b/src/array/local_only.rs @@ -99,6 +99,17 @@ impl From> for LocalOnlyArray { } } +#[async_trait] +impl AsyncFrom> for LocalOnlyArray { + async fn async_from(array: UnsafeArray) -> Self { + array.await_on_outstanding(DarcMode::LocalOnlyArray).await; + LocalOnlyArray { + array: array, + _unsync: PhantomData, + } + } +} + impl From> for LocalOnlyArray { fn from(array: ReadOnlyArray) -> Self { unsafe { array.into_inner().into() } diff --git a/src/array/native_atomic.rs b/src/array/native_atomic.rs index 590f9b48..9fc0e785 100644 --- a/src/array/native_atomic.rs +++ b/src/array/native_atomic.rs @@ -995,11 +995,19 @@ impl TeamFrom<(Vec, Distribution)> for NativeAtomicArray< fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { let (vals, distribution) = input; let input = (&vals, distribution); - let array: UnsafeArray = input.team_into(team); + let array: UnsafeArray = TeamInto::team_into(input, team); array.into() } } +#[async_trait] +impl AsyncTeamFrom<(Vec, Distribution)> for NativeAtomicArray { + async fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { + let array: UnsafeArray = AsyncTeamInto::team_into(input, team).await; + array.async_into().await + } +} + #[doc(hidden)] impl From> for NativeAtomicArray { fn from(array: UnsafeArray) -> Self { @@ -1013,6 +1021,22 @@ impl From> for NativeAtomicArray { } } +#[doc(hidden)] +#[async_trait] +impl AsyncFrom> for NativeAtomicArray { + async fn async_from(array: UnsafeArray) -> Self { + // println!("native from unsafe"); + array + .await_on_outstanding(DarcMode::NativeAtomicArray) + .await; + + NativeAtomicArray { + array: array, + orig_t: NativeAtomicType::from::(), + } + } +} + #[doc(hidden)] impl From> for NativeAtomicByteArray { fn from(array: NativeAtomicArray) -> Self { diff --git a/src/array/native_atomic/iteration.rs b/src/array/native_atomic/iteration.rs index b1775322..56caafb5 100644 --- a/src/array/native_atomic/iteration.rs +++ b/src/array/native_atomic/iteration.rs @@ -273,7 +273,7 @@ impl DistIteratorLauncher for NativeAtomicArray { where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect(&self.array, iter, d) } @@ -287,7 +287,7 @@ impl DistIteratorLauncher for NativeAtomicArray { where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect_with_schedule(&self.array, 
sched, iter, d) } @@ -300,7 +300,7 @@ impl DistIteratorLauncher for NativeAtomicArray { I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect_async(&self.array, iter, d) } @@ -315,7 +315,7 @@ impl DistIteratorLauncher for NativeAtomicArray { I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect_async_with_schedule(&self.array, sched, iter, d) } @@ -461,7 +461,7 @@ impl LocalIteratorLauncher for NativeAtomicArray { where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { LocalIteratorLauncher::collect(&self.array, iter, d) } @@ -475,7 +475,7 @@ impl LocalIteratorLauncher for NativeAtomicArray { where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { LocalIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) } diff --git a/src/array/read_only.rs b/src/array/read_only.rs index 942c2fad..b1fc8be5 100644 --- a/src/array/read_only.rs +++ b/src/array/read_only.rs @@ -385,14 +385,22 @@ impl TeamFrom<(Vec, Distribution)> for ReadOnlyArray { fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { let (vals, distribution) = input; let input = (&vals, distribution); - let array: UnsafeArray = input.team_into(team); + let array: UnsafeArray = TeamInto::team_into(input, team); array.into() } } +#[async_trait] +impl AsyncTeamFrom<(Vec, Distribution)> for ReadOnlyArray { + async fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { + let array: UnsafeArray = AsyncTeamInto::team_into(input, team).await; + array.async_into().await + } +} + impl TeamFrom<(&Vec, Distribution)> for ReadOnlyArray { fn team_from(input: (&Vec, Distribution), team: &Pin>) -> Self { - let array: UnsafeArray = input.team_into(team); + let array: UnsafeArray = TeamInto::team_into(input, team); array.into() } } @@ -406,6 +414,16 @@ impl From> for ReadOnlyArray { } } +#[async_trait] +impl AsyncFrom> for ReadOnlyArray { + async fn async_from(array: UnsafeArray) -> Self { + // println!("readonly from UnsafeArray"); + array.await_on_outstanding(DarcMode::ReadOnlyArray).await; + + ReadOnlyArray { array: array } + } +} + // impl From> for ReadOnlyArray { // fn from(array: LocalOnlyArray) -> Self { // // println!("readonly from LocalOnlyArray"); diff --git a/src/array/read_only/iteration.rs b/src/array/read_only/iteration.rs index e6f68976..af59f35a 100644 --- a/src/array/read_only/iteration.rs +++ b/src/array/read_only/iteration.rs @@ -116,7 +116,7 @@ impl DistIteratorLauncher for ReadOnlyArray { where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect(&self.array, iter, d) } @@ -130,7 +130,7 @@ impl DistIteratorLauncher for ReadOnlyArray { where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> 
TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) } @@ -143,7 +143,7 @@ impl DistIteratorLauncher for ReadOnlyArray { I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect_async(&self.array, iter, d) } @@ -158,7 +158,7 @@ impl DistIteratorLauncher for ReadOnlyArray { I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect_async_with_schedule(&self.array, sched, iter, d) } @@ -304,7 +304,7 @@ impl LocalIteratorLauncher for ReadOnlyArray { where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { LocalIteratorLauncher::collect(&self.array, iter, d) } @@ -318,7 +318,7 @@ impl LocalIteratorLauncher for ReadOnlyArray { where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { LocalIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) } diff --git a/src/array/unsafe.rs b/src/array/unsafe.rs index de48d4f5..fc15981e 100644 --- a/src/array/unsafe.rs +++ b/src/array/unsafe.rs @@ -200,6 +200,76 @@ impl UnsafeArray { // println!("after buffered ops"); // array.inner.data.print(); } + + async fn async_new>( + team: U, + array_size: usize, + distribution: Distribution, + ) -> UnsafeArray { + let team = team.into().team.clone(); + team.async_barrier().await; + let task_group = LamellarTaskGroup::new(team.clone()); + let my_pe = team.team_pe_id().unwrap(); + let num_pes = team.num_pes(); + let full_array_size = std::cmp::max(array_size, num_pes); + + let elem_per_pe = full_array_size as f64 / num_pes as f64; + let per_pe_size = (full_array_size as f64 / num_pes as f64).ceil() as usize; //we do ceil to ensure enough space an each pe + // println!("new unsafe array {:?} {:?} {:?}", elem_per_pe, num_elems_local, per_pe_size); + let rmr = MemoryRegion::new( + per_pe_size * std::mem::size_of::(), + team.lamellae.clone(), + AllocationType::Global, + ); + unsafe { + for elem in rmr.as_mut_slice().expect("data should exist on pe") { + *elem = 0; + } + } + + let data = Darc::try_new_with_drop( + team.clone(), + UnsafeArrayData { + mem_region: rmr, + array_counters: Arc::new(AMCounters::new()), + team: team.clone(), + task_group: Arc::new(task_group), + my_pe: my_pe, + num_pes: num_pes, + req_cnt: Arc::new(AtomicUsize::new(0)), + }, + crate::darc::DarcMode::UnsafeArray, + None, + ) + .expect("trying to create array on non team member"); + let array = UnsafeArray { + inner: UnsafeArrayInner { + data: data, + distribution: distribution.clone(), + // wait: wait, + orig_elem_per_pe: elem_per_pe, + elem_size: std::mem::size_of::(), + offset: 0, //relative to size of T + size: full_array_size, //relative to size of T + }, + phantom: PhantomData, + }; + // println!("new unsafe"); + // unsafe {println!("size {:?} bytes {:?}",array.inner.size, 
array.inner.data.mem_region.as_mut_slice().unwrap().len())}; + // println!("elem per pe {:?}", elem_per_pe); + // for i in 0..num_pes{ + // println!("pe: {:?} {:?}",i,array.inner.num_elems_pe(i)); + // } + // array.inner.data.print(); + if full_array_size != array_size { + println!("WARNING: Array size {array_size} is less than number of pes {full_array_size}, each PE will not contain data"); + array.sub_array(0..array_size) + } else { + array + } + // println!("after buffered ops"); + // array.inner.data.print(); + } } impl UnsafeArray { #[doc(alias("One-sided", "onesided"))] @@ -361,6 +431,47 @@ impl UnsafeArray { self.inner.data.team.clone() } + pub(crate) async fn await_all(&self) { + let mut temp_now = Instant::now(); + // let mut first = true; + while self + .inner + .data + .array_counters + .outstanding_reqs + .load(Ordering::SeqCst) + > 0 + || self.inner.data.req_cnt.load(Ordering::SeqCst) > 0 + { + // std::thread::yield_now(); + // self.inner.data.team.flush(); + // self.inner.data.team.scheduler.exec_task(); //mmight as well do useful work while we wait + async_std::task::yield_now().await; + if temp_now.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { + //|| first{ + println!( + "in array await_all mype: {:?} cnt: {:?} {:?} {:?}", + self.inner.data.team.world_pe, + self.inner + .data + .array_counters + .send_req_cnt + .load(Ordering::SeqCst), + self.inner + .data + .array_counters + .outstanding_reqs + .load(Ordering::SeqCst), + self.inner.data.req_cnt.load(Ordering::SeqCst) + ); + temp_now = Instant::now(); + // first = false; + } + } + self.inner.data.task_group.await_all().await; + // println!("done in wait all {:?}",std::time::SystemTime::now()); + } + pub(crate) fn block_on_outstanding(&self, mode: DarcMode) { self.wait_all(); // println!("block on outstanding"); @@ -371,6 +482,15 @@ impl UnsafeArray { .block_on(array_darc.block_on_outstanding(mode, 1)); //one for this instance of the array } + pub(crate) async fn await_on_outstanding(&self, mode: DarcMode) { + self.await_all().await; + // println!("block on outstanding"); + // self.inner.data.print(); + // let the_array: UnsafeArray = self.clone(); + let array_darc = self.inner.data.clone(); + array_darc.block_on_outstanding(mode, 1).await; + } + #[doc(alias = "Collective")] /// Convert this UnsafeArray into a (safe) [ReadOnlyArray][crate::array::ReadOnlyArray] /// @@ -570,7 +690,44 @@ impl TeamFrom<(Vec, Distribution)> for UnsafeArray { fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { let (vals, distribution) = input; let input = (&vals, distribution); - input.team_into(team) + TeamInto::team_into(input, team) + } +} + +#[async_trait] +impl AsyncTeamFrom<(Vec, Distribution)> for UnsafeArray { + async fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { + let (local_vals, distribution) = input; + // println!("local_vals len: {:?}", local_vals.len()); + team.async_barrier().await; + let local_sizes = + UnsafeArray::::async_new(team.clone(), team.num_pes, Distribution::Block).await; + unsafe { + local_sizes.local_as_mut_slice()[0] = local_vals.len(); + } + team.async_barrier().await; + // local_sizes.barrier(); + let mut size = 0; + let mut my_start = 0; + let my_pe = team.team_pe.expect("pe not part of team"); + unsafe { + local_sizes + .buffered_onesided_iter(team.num_pes) + .into_iter() + .enumerate() + .for_each(|(i, local_size)| { + size += local_size; + if i < my_pe { + my_start += local_size; + } + }); + } + let array = UnsafeArray::::async_new(team.clone(), size, distribution).await; + 
if local_vals.len() > 0 { + unsafe { array.put(my_start, local_vals).await }; + } + team.async_barrier().await; + array } } @@ -611,8 +768,6 @@ impl TeamFrom<(&Vec, Distribution)> for UnsafeArray { impl From> for UnsafeArray { fn from(array: AtomicArray) -> Self { - // println!("unsafe from atomic"); - // array.into_unsafe() match array { AtomicArray::NativeAtomicArray(array) => UnsafeArray::::from(array), AtomicArray::GenericAtomicArray(array) => UnsafeArray::::from(array), @@ -622,8 +777,6 @@ impl From> for UnsafeArray { impl From> for UnsafeArray { fn from(array: NativeAtomicArray) -> Self { - // println!("unsafe from native atomic"); - // let array = array.into_data(); array.array.block_on_outstanding(DarcMode::UnsafeArray); array.array } @@ -631,8 +784,6 @@ impl From> for UnsafeArray { impl From> for UnsafeArray { fn from(array: GenericAtomicArray) -> Self { - // println!("unsafe from generic atomic"); - // let array = array.into_data(); array.array.block_on_outstanding(DarcMode::UnsafeArray); array.array } @@ -640,7 +791,6 @@ impl From> for UnsafeArray { impl From> for UnsafeArray { fn from(array: LocalLockArray) -> Self { - // println!("unsafe from local lock atomic"); array.array.block_on_outstanding(DarcMode::UnsafeArray); array.array } @@ -648,7 +798,6 @@ impl From> for UnsafeArray { impl From> for UnsafeArray { fn from(array: GlobalLockArray) -> Self { - // println!("unsafe from global lock atomic"); array.array.block_on_outstanding(DarcMode::UnsafeArray); array.array } @@ -656,7 +805,6 @@ impl From> for UnsafeArray { impl From> for UnsafeArray { fn from(array: ReadOnlyArray) -> Self { - // println!("unsafe from read only"); array.array.block_on_outstanding(DarcMode::UnsafeArray); array.array } diff --git a/src/array/unsafe/iteration/distributed.rs b/src/array/unsafe/iteration/distributed.rs index a677969a..4a9668a0 100644 --- a/src/array/unsafe/iteration/distributed.rs +++ b/src/array/unsafe/iteration/distributed.rs @@ -1,7 +1,7 @@ use crate::active_messaging::SyncSend; use crate::array::iterator::distributed_iterator::*; use crate::array::r#unsafe::UnsafeArray; -use crate::array::{ArrayOps, Distribution, LamellarArray, TeamFrom}; +use crate::array::{ArrayOps, AsyncTeamFrom, AsyncTeamInto, Distribution, LamellarArray, TeamFrom}; use crate::array::iterator::Schedule; use crate::lamellar_team::LamellarTeamRT; @@ -141,7 +141,7 @@ impl DistIteratorLauncher for UnsafeArray { where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { self.collect_with_schedule(Schedule::Static, iter, d) } @@ -155,7 +155,7 @@ impl DistIteratorLauncher for UnsafeArray { where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { let collect = Collect { iter: iter.clone().monotonic(), @@ -180,7 +180,7 @@ impl DistIteratorLauncher for UnsafeArray { I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { self.collect_async_with_schedule(Schedule::Static, iter, d) } @@ -195,7 +195,7 @@ impl DistIteratorLauncher for UnsafeArray { I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a 
Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { let collect = CollectAsync { iter: iter.clone().monotonic(), diff --git a/src/array/unsafe/iteration/local.rs b/src/array/unsafe/iteration/local.rs index 1ad136ee..16151573 100644 --- a/src/array/unsafe/iteration/local.rs +++ b/src/array/unsafe/iteration/local.rs @@ -1,7 +1,7 @@ use crate::active_messaging::SyncSend; use crate::array::iterator::local_iterator::*; use crate::array::r#unsafe::UnsafeArray; -use crate::array::{ArrayOps, Distribution, TeamFrom}; +use crate::array::{ArrayOps, AsyncTeamFrom, Distribution}; use crate::array::iterator::Schedule; use crate::lamellar_team::LamellarTeamRT; @@ -162,7 +162,7 @@ impl LocalIteratorLauncher for UnsafeArray { where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { self.collect_with_schedule(Schedule::Static, iter, d) } @@ -176,7 +176,7 @@ impl LocalIteratorLauncher for UnsafeArray { where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { let collect = Collect { iter: iter.clone().monotonic(), diff --git a/src/darc.rs b/src/darc.rs index b2d57d70..817aec31 100644 --- a/src/darc.rs +++ b/src/darc.rs @@ -416,204 +416,213 @@ impl DarcInner { // } async fn block_on_outstanding(inner: WrappedInner, state: DarcMode, extra_cnt: usize) { - let mut outstanding_refs = true; let team = inner.team(); - let mode_refs = - unsafe { std::slice::from_raw_parts_mut(inner.mode_addr as *mut u8, inner.num_pes) }; - let mut prev_ref_cnts = vec![0usize; inner.num_pes]; - let mut barrier_id = 1usize; + if team.num_pes() == 1 { + while inner.local_cnt.load(Ordering::SeqCst) > 1 + extra_cnt { + async_std::task::yield_now().await; + } + } else { + let mut outstanding_refs = true; + let mode_refs = unsafe { + std::slice::from_raw_parts_mut(inner.mode_addr as *mut u8, inner.num_pes) + }; + let mut prev_ref_cnts = vec![0usize; inner.num_pes]; + let mut barrier_id = 1usize; - let barrier_ref_cnt_slice = unsafe { - std::slice::from_raw_parts_mut(inner.mode_ref_cnt_addr as *mut usize, inner.num_pes) - }; - let barrier_slice = unsafe { - std::slice::from_raw_parts_mut(inner.mode_barrier_addr as *mut usize, inner.num_pes) - }; + let barrier_ref_cnt_slice = unsafe { + std::slice::from_raw_parts_mut(inner.mode_ref_cnt_addr as *mut usize, inner.num_pes) + }; + let barrier_slice = unsafe { + std::slice::from_raw_parts_mut(inner.mode_barrier_addr as *mut usize, inner.num_pes) + }; - let ref_cnts_slice = unsafe { - std::slice::from_raw_parts_mut(inner.total_ref_cnt_addr as *mut usize, inner.num_pes) - }; + let ref_cnts_slice = unsafe { + std::slice::from_raw_parts_mut( + inner.total_ref_cnt_addr as *mut usize, + inner.num_pes, + ) + }; - // let rel_addr = inner.inner.as_ptr() as *const _ as usize - team.lamellae.base_addr(); + // let rel_addr = inner.inner.as_ptr() as *const _ as usize - team.lamellae.base_addr(); - // println!( - // "[{:?}] entering initial block_on barrier()", - // std::thread::current().id() - // ); - let barrier_fut = unsafe { inner.barrier.as_ref().unwrap().async_barrier() }; - barrier_fut.await; - // println!( - // "[{:?}] leaving initial block_on barrier()", - // std::thread::current().id() - // ); + // println!( + // "[{:?}] entering 
initial block_on barrier()", + // std::thread::current().id() + // ); + let barrier_fut = unsafe { inner.barrier.as_ref().unwrap().async_barrier() }; + barrier_fut.await; + // println!( + // "[{:?}] leaving initial block_on barrier()", + // std::thread::current().id() + // ); - while outstanding_refs { - outstanding_refs = false; - let old_barrier_id = barrier_id; //we potentially will set barrier_id to 0 but want to maintiain the previously highest value - while inner.local_cnt.load(Ordering::SeqCst) > 1 + extra_cnt { - async_std::task::yield_now().await; - } - inner.send_finished(); + while outstanding_refs { + outstanding_refs = false; + let old_barrier_id = barrier_id; //we potentially will set barrier_id to 0 but want to maintiain the previously highest value + while inner.local_cnt.load(Ordering::SeqCst) > 1 + extra_cnt { + async_std::task::yield_now().await; + } + inner.send_finished(); - let mut old_ref_cnts = ref_cnts_slice.to_vec(); - let old_local_cnt = inner.total_local_cnt.load(Ordering::SeqCst); - let old_dist_cnt = inner.total_dist_cnt.load(Ordering::SeqCst); + let mut old_ref_cnts = ref_cnts_slice.to_vec(); + let old_local_cnt = inner.total_local_cnt.load(Ordering::SeqCst); + let old_dist_cnt = inner.total_dist_cnt.load(Ordering::SeqCst); - let rdma = &team.lamellae; - // let mut dist_cnts_changed = false; - for pe in 0..inner.num_pes { - let ref_cnt_u8 = unsafe { - std::slice::from_raw_parts_mut( - &mut old_ref_cnts[pe] as *mut usize as *mut u8, - std::mem::size_of::(), - ) - }; - if prev_ref_cnts[pe] != old_ref_cnts[pe] { + let rdma = &team.lamellae; + // let mut dist_cnts_changed = false; + for pe in 0..inner.num_pes { + let ref_cnt_u8 = unsafe { + std::slice::from_raw_parts_mut( + &mut old_ref_cnts[pe] as *mut usize as *mut u8, + std::mem::size_of::(), + ) + }; + if prev_ref_cnts[pe] != old_ref_cnts[pe] { + let send_pe = team.arch.single_iter(pe).next().unwrap(); + // println!( + // "[{:?}] {rel_addr:x} sending {:?} to pe {:?} at {:x} + {:?} ({:x}) ", + // std::thread::current().id(), + // old_ref_cnts[pe], + // pe, + // inner.mode_ref_cnt_addr, + // inner.my_pe * std::mem::size_of::(), + // inner.mode_ref_cnt_addr + inner.my_pe * std::mem::size_of::() + // ); + rdma.put( + send_pe, + ref_cnt_u8, + inner.mode_ref_cnt_addr + inner.my_pe * std::mem::size_of::(), //this is barrier_ref_cnt_slice + ); + // dist_cnts_changed = true; + outstanding_refs = true; + barrier_id = 0; + } + } + rdma.flush(); + let barrier_fut = unsafe { inner.barrier.as_ref().unwrap().async_barrier() }; + barrier_fut.await; + outstanding_refs |= old_local_cnt != inner.total_local_cnt.load(Ordering::SeqCst); + // if outstanding_refs { + // println!( + // "[{:?}] {rel_addr:x} total local cnt changed", + // std::thread::current().id() + // ); + // } + outstanding_refs |= old_dist_cnt != inner.total_dist_cnt.load(Ordering::SeqCst); + // if outstanding_refs { + // println!( + // "[{:?}] {rel_addr:x} total dist cnt changed", + // std::thread::current().id() + // ); + // } + + let mut barrier_sum = 0; + for pe in 0..inner.num_pes { + outstanding_refs |= old_ref_cnts[pe] != ref_cnts_slice[pe]; + // if outstanding_refs { + // println!( + // "[{:?}] {rel_addr:x} refs changed for pe {pe}", + // std::thread::current().id() + // ); + // } + // dist_cnts_changed |= old_ref_cnts[pe] != ref_cnts_slice[pe]; + barrier_sum += barrier_ref_cnt_slice[pe]; + } + outstanding_refs |= barrier_sum != old_dist_cnt; + // if outstanding_refs { + // println!( + // "[{:?}] {rel_addr:x} sum of cnts != dist ref cnt {:?} {:?}", + // 
std::thread::current().id(), + // barrier_ref_cnt_slice, + // old_ref_cnts + // ); + // } + if outstanding_refs { + // println!("reseting barrier_id"); + barrier_id = 0; + } + rdma.flush(); + let barrier_fut = unsafe { inner.barrier.as_ref().unwrap().async_barrier() }; + barrier_fut.await; + + for pe in 0..inner.num_pes { let send_pe = team.arch.single_iter(pe).next().unwrap(); // println!( - // "[{:?}] {rel_addr:x} sending {:?} to pe {:?} at {:x} + {:?} ({:x}) ", + // "[{:?}] {rel_addr:x} sending {barrier_id} ({barrier_id_slice:?}) to pe {pe} ", // std::thread::current().id(), - // old_ref_cnts[pe], - // pe, - // inner.mode_ref_cnt_addr, - // inner.my_pe * std::mem::size_of::(), - // inner.mode_ref_cnt_addr + inner.my_pe * std::mem::size_of::() // ); + let barrier_id_slice = unsafe { + std::slice::from_raw_parts_mut( + &mut barrier_id as *mut usize as *mut u8, + std::mem::size_of::(), + ) + }; rdma.put( send_pe, - ref_cnt_u8, - inner.mode_ref_cnt_addr + inner.my_pe * std::mem::size_of::(), //this is barrier_ref_cnt_slice + barrier_id_slice, + inner.mode_barrier_addr + inner.my_pe * std::mem::size_of::(), ); - // dist_cnts_changed = true; - outstanding_refs = true; - barrier_id = 0; } - } - rdma.flush(); - let barrier_fut = unsafe { inner.barrier.as_ref().unwrap().async_barrier() }; - barrier_fut.await; - outstanding_refs |= old_local_cnt != inner.total_local_cnt.load(Ordering::SeqCst); - // if outstanding_refs { - // println!( - // "[{:?}] {rel_addr:x} total local cnt changed", - // std::thread::current().id() - // ); - // } - outstanding_refs |= old_dist_cnt != inner.total_dist_cnt.load(Ordering::SeqCst); - // if outstanding_refs { - // println!( - // "[{:?}] {rel_addr:x} total dist cnt changed", - // std::thread::current().id() - // ); - // } - - let mut barrier_sum = 0; - for pe in 0..inner.num_pes { - outstanding_refs |= old_ref_cnts[pe] != ref_cnts_slice[pe]; + rdma.flush(); + let barrier_fut = unsafe { inner.barrier.as_ref().unwrap().async_barrier() }; + barrier_fut.await; + for id in &*barrier_slice { + outstanding_refs |= *id == 0; + } // if outstanding_refs { - // println!( - // "[{:?}] {rel_addr:x} refs changed for pe {pe}", - // std::thread::current().id() - // ); + // println!("[{:?}] {rel_addr:x} not all pes ready mode_refs: {mode_refs:?} prev_ref_cnts: {prev_ref_cnts:?} barrier_id: {barrier_id:?} barrier_id_slice: {barrier_id_slice:?} barrier_ref_cnt_slice: {barrier_ref_cnt_slice:?} + // barrier_slice: {barrier_slice:?} ref_cnts_slice: {ref_cnts_slice:?} old_ref_cnts: {old_ref_cnts:?} old_local_cnt: {old_local_cnt:?} local_cnt: {:?} old_dist_cnt: {old_dist_cnt:?} dist_cnt: {:?} + // dist_cnts_changed: {dist_cnts_changed:?} barrier_sum: {barrier_sum:?} old_barrier_id: {old_barrier_id:?} ", std::thread::current().id(),inner.total_local_cnt.load(Ordering::SeqCst), inner.total_dist_cnt.load(Ordering::SeqCst)); // } - // dist_cnts_changed |= old_ref_cnts[pe] != ref_cnts_slice[pe]; - barrier_sum += barrier_ref_cnt_slice[pe]; - } - outstanding_refs |= barrier_sum != old_dist_cnt; - // if outstanding_refs { - // println!( - // "[{:?}] {rel_addr:x} sum of cnts != dist ref cnt {:?} {:?}", - // std::thread::current().id(), - // barrier_ref_cnt_slice, - // old_ref_cnts - // ); - // } - if outstanding_refs { - // println!("reseting barrier_id"); - barrier_id = 0; + // if dist_cnts_changed || !outstanding_refs { + // println!("[{:?}] {rel_addr:x} mode_refs: {mode_refs:?} prev_ref_cnts: {prev_ref_cnts:?} barrier_id: {barrier_id:?} barrier_id_slice: {barrier_id_slice:?} 
barrier_ref_cnt_slice: {barrier_ref_cnt_slice:?} + // barrier_slice: {barrier_slice:?} ref_cnts_slice: {ref_cnts_slice:?} old_ref_cnts: {old_ref_cnts:?} old_local_cnt: {old_local_cnt:?} local_cnt: {:?} old_dist_cnt: {old_dist_cnt:?} dist_cnt: {:?} + // dist_cnts_changed: {dist_cnts_changed:?} barrier_sum: {barrier_sum:?} old_barrier_id: {old_barrier_id:?} ", std::thread::current().id(), inner.total_local_cnt.load(Ordering::SeqCst), inner.total_dist_cnt.load(Ordering::SeqCst)); + // } + barrier_id = old_barrier_id + 1; + if outstanding_refs { + // println!( + // "[{:?}] still outstanding, exec a task!", + // std::thread::current().id() + // ); + // team.scheduler.exec_task(); + async_std::task::yield_now().await; + } + prev_ref_cnts = old_ref_cnts; } - rdma.flush(); - let barrier_fut = unsafe { inner.barrier.as_ref().unwrap().async_barrier() }; - barrier_fut.await; + // println!( + // "[{:?}] {rel_addr:x} all outstanding refs are resolved", + // std::thread::current().id() + // ); + // inner.debug_print(); + // println!("[{:?}] {:?}", std::thread::current().id(), inner); - for pe in 0..inner.num_pes { - let send_pe = team.arch.single_iter(pe).next().unwrap(); - // println!( - // "[{:?}] {rel_addr:x} sending {barrier_id} ({barrier_id_slice:?}) to pe {pe} ", - // std::thread::current().id(), - // ); - let barrier_id_slice = unsafe { - std::slice::from_raw_parts_mut( - &mut barrier_id as *mut usize as *mut u8, - std::mem::size_of::(), - ) - }; + unsafe { + (*(((&mut mode_refs[inner.my_pe]) as *mut u8) as *mut AtomicU8)) //this should be fine given that DarcMode uses Repr(u8) + .store(state as u8, Ordering::SeqCst) + }; + let rdma = &team.lamellae; + for pe in team.arch.team_iter() { rdma.put( - send_pe, - barrier_id_slice, - inner.mode_barrier_addr + inner.my_pe * std::mem::size_of::(), + pe, + &mode_refs[inner.my_pe..=inner.my_pe], + inner.mode_addr + inner.my_pe * std::mem::size_of::(), ); } - rdma.flush(); - let barrier_fut = unsafe { inner.barrier.as_ref().unwrap().async_barrier() }; - barrier_fut.await; - for id in &*barrier_slice { - outstanding_refs |= *id == 0; - } - // if outstanding_refs { - // println!("[{:?}] {rel_addr:x} not all pes ready mode_refs: {mode_refs:?} prev_ref_cnts: {prev_ref_cnts:?} barrier_id: {barrier_id:?} barrier_id_slice: {barrier_id_slice:?} barrier_ref_cnt_slice: {barrier_ref_cnt_slice:?} - // barrier_slice: {barrier_slice:?} ref_cnts_slice: {ref_cnts_slice:?} old_ref_cnts: {old_ref_cnts:?} old_local_cnt: {old_local_cnt:?} local_cnt: {:?} old_dist_cnt: {old_dist_cnt:?} dist_cnt: {:?} - // dist_cnts_changed: {dist_cnts_changed:?} barrier_sum: {barrier_sum:?} old_barrier_id: {old_barrier_id:?} ", std::thread::current().id(),inner.total_local_cnt.load(Ordering::SeqCst), inner.total_dist_cnt.load(Ordering::SeqCst)); - // } - // if dist_cnts_changed || !outstanding_refs { - // println!("[{:?}] {rel_addr:x} mode_refs: {mode_refs:?} prev_ref_cnts: {prev_ref_cnts:?} barrier_id: {barrier_id:?} barrier_id_slice: {barrier_id_slice:?} barrier_ref_cnt_slice: {barrier_ref_cnt_slice:?} - // barrier_slice: {barrier_slice:?} ref_cnts_slice: {ref_cnts_slice:?} old_ref_cnts: {old_ref_cnts:?} old_local_cnt: {old_local_cnt:?} local_cnt: {:?} old_dist_cnt: {old_dist_cnt:?} dist_cnt: {:?} - // dist_cnts_changed: {dist_cnts_changed:?} barrier_sum: {barrier_sum:?} old_barrier_id: {old_barrier_id:?} ", std::thread::current().id(), inner.total_local_cnt.load(Ordering::SeqCst), inner.total_dist_cnt.load(Ordering::SeqCst)); - // } - barrier_id = old_barrier_id + 1; - if outstanding_refs { 
- // println!( - // "[{:?}] still outstanding, exec a task!", - // std::thread::current().id() - // ); - // team.scheduler.exec_task(); - async_std::task::yield_now().await; - } - prev_ref_cnts = old_ref_cnts; - } - // println!( - // "[{:?}] {rel_addr:x} all outstanding refs are resolved", - // std::thread::current().id() - // ); - // inner.debug_print(); - // println!("[{:?}] {:?}", std::thread::current().id(), inner); - - unsafe { - (*(((&mut mode_refs[inner.my_pe]) as *mut u8) as *mut AtomicU8)) //this should be fine given that DarcMode uses Repr(u8) - .store(state as u8, Ordering::SeqCst) - }; - let rdma = &team.lamellae; - for pe in team.arch.team_iter() { - rdma.put( - pe, - &mode_refs[inner.my_pe..=inner.my_pe], - inner.mode_addr + inner.my_pe * std::mem::size_of::(), - ); - } - for pe in mode_refs.iter() { - let mut timer = std::time::Instant::now(); - while *pe != state as u8 { - if inner.local_cnt.load(Ordering::SeqCst) == 1 + extra_cnt { - inner.send_finished(); - } - if timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { - let ref_cnts_slice = unsafe { - std::slice::from_raw_parts_mut( - inner.ref_cnt_addr as *mut usize, - inner.num_pes, - ) - }; - println!("[{:?}][WARNING] -- Potential deadlock detected.\n\ + for pe in mode_refs.iter() { + let mut timer = std::time::Instant::now(); + while *pe != state as u8 { + if inner.local_cnt.load(Ordering::SeqCst) == 1 + extra_cnt { + inner.send_finished(); + } + if timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { + let ref_cnts_slice = unsafe { + std::slice::from_raw_parts_mut( + inner.ref_cnt_addr as *mut usize, + inner.num_pes, + ) + }; + println!("[{:?}][WARNING] -- Potential deadlock detected.\n\ The runtime is currently waiting for all remaining references to this distributed object to be dropped.\n\ The object is likely a {:?} with {:?} remaining local references and {:?} remaining remote references, ref cnts by pe {ref_cnts_slice:?}\n\ An example where this can occur can be found at https://docs.rs/lamellar/latest/lamellar/array/struct.ReadOnlyArray.html#method.into_local_lock\n\ @@ -629,16 +638,17 @@ impl DarcInner { *crate::DEADLOCK_TIMEOUT, std::backtrace::Backtrace::capture() ); - timer = std::time::Instant::now(); + timer = std::time::Instant::now(); + } + async_std::task::yield_now().await; } - async_std::task::yield_now().await; } - } - // self.debug_print(); - // println!("{rel_addr:x} {:?}", self); - let barrier_fut = unsafe { inner.barrier.as_ref().unwrap().async_barrier() }; - barrier_fut.await; + // self.debug_print(); + // println!("{rel_addr:x} {:?}", self); + let barrier_fut = unsafe { inner.barrier.as_ref().unwrap().async_barrier() }; + barrier_fut.await; + } // self.debug_print(); } diff --git a/src/lamellar_task_group.rs b/src/lamellar_task_group.rs index 26e45ec0..d699370c 100644 --- a/src/lamellar_task_group.rs +++ b/src/lamellar_task_group.rs @@ -552,6 +552,27 @@ impl LamellarTaskGroup { } } + pub(crate) async fn await_all(&self) { + let mut temp_now = Instant::now(); + while self.counters.outstanding_reqs.load(Ordering::SeqCst) > 0 { + // self.team.flush(); + // self.team.scheduler.exec_task(); + async_std::task::yield_now().await; + if temp_now.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { + println!( + "in task group wait_all mype: {:?} cnt: {:?} {:?}", + self.team.world_pe, + self.team.team_counters.send_req_cnt.load(Ordering::SeqCst), + self.team + .team_counters + .outstanding_reqs + .load(Ordering::SeqCst), + ); + temp_now = Instant::now(); + } + } + } + pub(crate) fn 
exec_am_all_inner( &self, am: F, diff --git a/src/scheduler.rs b/src/scheduler.rs index 97d85179..0ba82e24 100755 --- a/src/scheduler.rs +++ b/src/scheduler.rs @@ -14,9 +14,9 @@ pub(crate) mod work_stealing; use work_stealing::WorkStealing; #[cfg(feature = "tokio-executor")] -pub(crate) mod tokio; +pub(crate) mod tokio_executor; #[cfg(feature = "tokio-executor")] -use tokio::TokioRt; +use tokio_executor::TokioRt; // ACTIVE ENUM // since atomic enums would be another dependecy @@ -223,6 +223,12 @@ impl Scheduler { } pub(crate) fn block_on(&self, task: F) -> F::Output { + if std::thread::current().id() != *crate::MAIN_THREAD { + println!( + "trying to call block on within a worker thread {:?}", + std::backtrace::Backtrace::capture() + ) + } self.executor.block_on(task) } diff --git a/src/scheduler/tokio.rs b/src/scheduler/tokio.rs deleted file mode 100644 index f9e14ac1..00000000 --- a/src/scheduler/tokio.rs +++ /dev/null @@ -1,88 +0,0 @@ -use crate::scheduler::{LamellarExecutor, SchedulerStatus}; - -use tokio::runtime::Runtime; - -use tracing::*; - -use async_task::{Builder, Runnable}; -use core_affinity::CoreId; -use crossbeam::deque::Worker; -use futures::Future; -use futures_lite::FutureExt; -use rand::prelude::*; -use std::panic; -use std::process; -use std::sync::atomic::{AtomicU8, AtomicUsize, Ordering}; -use std::sync::Arc; //, Weak}; -use std::thread; - -static TASK_ID: AtomicUsize = AtomicUsize::new(0); - -#[derive(Debug)] -pub(crate) struct TokioRt { - max_num_threads: usize, - rt: Runtime, -} - -impl LamellarExecutor for TokioRt { - fn submit_task(&self, task: F) - where - F: Future + Send + 'static, - F::Output: Send, - { - trace_span!("submit_task").in_scope(|| { - self.rt.spawn(async move { task.await }); - }); - } - - fn submit_immediate_task(&self, task: F) - where - F: Future + Send + 'static, - F::Output: Send, - { - trace_span!("submit_task").in_scope(|| { - self.rt.spawn(async move { task.await }); - }); - } - - fn block_on(&self, task: F) -> F::Output { - trace_span!("block_on").in_scope(|| self.rt.block_on(task)) - } - - #[tracing::instrument(skip_all)] - fn shutdown(&self) { - // i think we just let tokio do this on drop - } - - #[tracing::instrument(skip_all)] - fn force_shutdown(&self) { - // i think we just let tokio do this on drop - } - - #[tracing::instrument(skip_all)] - fn exec_task(&self) { - // I dont think tokio has a way to do this - } - - fn set_max_workers(&mut self, num_workers: usize) { - self.max_num_threads = num_workers; - } - - fn num_workers(&self) -> usize { - self.max_num_threads - } -} - -impl TokioRt { - pub(crate) fn new(num_workers: usize) -> TokioRt { - // println!("New TokioRT with {} workers", num_workers); - TokioRt { - max_num_threads: num_workers + 1, //LAMELLAR_THREADS = num_workers + 1, so for tokio runtime, we actually want num_workers + 1 worker threads as block_on will not do anywork on the main thread (i think)... 
- rt: tokio::runtime::Builder::new_multi_thread() - .worker_threads(num_workers + 1) - .enable_all() - .build() - .unwrap(), - } - } -} From d189573e453120a247dcfe0b7e8aa97b3038c785 Mon Sep 17 00:00:00 2001 From: "ryan.friese@pnnl.gov" Date: Thu, 8 Feb 2024 10:49:15 -0800 Subject: [PATCH 003/116] renaming tokio.rs -> tokio_executor.rs --- src/scheduler/tokio_executor.rs | 88 +++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 src/scheduler/tokio_executor.rs diff --git a/src/scheduler/tokio_executor.rs b/src/scheduler/tokio_executor.rs new file mode 100644 index 00000000..f9e14ac1 --- /dev/null +++ b/src/scheduler/tokio_executor.rs @@ -0,0 +1,88 @@ +use crate::scheduler::{LamellarExecutor, SchedulerStatus}; + +use tokio::runtime::Runtime; + +use tracing::*; + +use async_task::{Builder, Runnable}; +use core_affinity::CoreId; +use crossbeam::deque::Worker; +use futures::Future; +use futures_lite::FutureExt; +use rand::prelude::*; +use std::panic; +use std::process; +use std::sync::atomic::{AtomicU8, AtomicUsize, Ordering}; +use std::sync::Arc; //, Weak}; +use std::thread; + +static TASK_ID: AtomicUsize = AtomicUsize::new(0); + +#[derive(Debug)] +pub(crate) struct TokioRt { + max_num_threads: usize, + rt: Runtime, +} + +impl LamellarExecutor for TokioRt { + fn submit_task(&self, task: F) + where + F: Future + Send + 'static, + F::Output: Send, + { + trace_span!("submit_task").in_scope(|| { + self.rt.spawn(async move { task.await }); + }); + } + + fn submit_immediate_task(&self, task: F) + where + F: Future + Send + 'static, + F::Output: Send, + { + trace_span!("submit_task").in_scope(|| { + self.rt.spawn(async move { task.await }); + }); + } + + fn block_on(&self, task: F) -> F::Output { + trace_span!("block_on").in_scope(|| self.rt.block_on(task)) + } + + #[tracing::instrument(skip_all)] + fn shutdown(&self) { + // i think we just let tokio do this on drop + } + + #[tracing::instrument(skip_all)] + fn force_shutdown(&self) { + // i think we just let tokio do this on drop + } + + #[tracing::instrument(skip_all)] + fn exec_task(&self) { + // I dont think tokio has a way to do this + } + + fn set_max_workers(&mut self, num_workers: usize) { + self.max_num_threads = num_workers; + } + + fn num_workers(&self) -> usize { + self.max_num_threads + } +} + +impl TokioRt { + pub(crate) fn new(num_workers: usize) -> TokioRt { + // println!("New TokioRT with {} workers", num_workers); + TokioRt { + max_num_threads: num_workers + 1, //LAMELLAR_THREADS = num_workers + 1, so for tokio runtime, we actually want num_workers + 1 worker threads as block_on will not do anywork on the main thread (i think)... + rt: tokio::runtime::Builder::new_multi_thread() + .worker_threads(num_workers + 1) + .enable_all() + .build() + .unwrap(), + } + } +} From 31ee58f5d7d9ad9154050ddae215602b61ddbc09 Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Fri, 26 Jan 2024 22:05:55 -0800 Subject: [PATCH 004/116] refactoring to suppport different executor backends + tokio backend --- Cargo.toml | 4 +- .../async_comparison.rs | 8 +- examples/array_examples/array_put_get.rs | 2 +- examples/array_examples/global_lock_array.rs | 6 +- .../global_lock_atomic_array_put_bw.rs | 3 +- .../local_lock_atomic_array_put_bw.rs | 4 +- .../bandwidths/task_group_futures_am_bw.rs | 2 +- examples/darc_examples/darc.rs | 6 +- examples/darc_examples/string_darc.rs | 29 +- examples/kernels/dft_proxy.rs | 12 +- .../safe_parallel_blocked_array_gemm.rs | 2 +- src/active_messaging.rs | 90 ++- src/active_messaging/batching.rs | 80 ++- .../batching/simple_batcher.rs | 195 +++--- .../batching/team_am_batcher.rs | 331 +++++----- .../registered_active_message.rs | 100 ++- src/array.rs | 34 +- src/array/atomic.rs | 10 +- src/array/generic_atomic.rs | 15 +- src/array/global_lock_atomic.rs | 362 +++++------ src/array/global_lock_atomic/iteration.rs | 15 +- .../distributed_iterator/consumer/count.rs | 4 +- .../distributed_iterator/consumer/reduce.rs | 2 +- .../iterator/one_sided_iterator/buffered.rs | 4 - src/array/local_lock_atomic.rs | 354 ++++------- src/array/local_lock_atomic/iteration.rs | 21 +- src/array/native_atomic.rs | 15 +- src/array/operations.rs | 69 +-- src/array/read_only.rs | 15 +- src/array/unsafe.rs | 21 +- src/array/unsafe/operations.rs | 103 ++-- src/barrier.rs | 29 +- src/darc.rs | 28 +- src/darc/global_rw_darc.rs | 407 ++++++------- src/darc/local_rw_darc.rs | 337 +++++------ src/lamellae/command_queues.rs | 5 +- src/lamellae/rofi_lamellae.rs | 2 +- src/lamellae/shmem_lamellae.rs | 2 +- src/lamellar_request.rs | 2 +- src/lamellar_task_group.rs | 7 +- src/lamellar_team.rs | 12 +- src/lamellar_world.rs | 65 +- src/lib.rs | 2 +- src/scheduler.rs | 337 ++++++++--- src/scheduler/numa_work_stealing.rs | 7 +- src/scheduler/numa_work_stealing2.rs | 2 +- src/scheduler/tokio.rs | 88 +++ src/scheduler/work_stealing.rs | 567 ++---------------- tests/array/arithmetic_ops/add_test.rs | 4 +- tests/array/arithmetic_ops/fetch_add_test.rs | 4 +- 50 files changed, 1681 insertions(+), 2144 deletions(-) create mode 100644 src/scheduler/tokio.rs diff --git a/Cargo.toml b/Cargo.toml index 4401c70f..022c8fb1 100755 --- a/Cargo.toml +++ b/Cargo.toml @@ -53,6 +53,7 @@ itertools = "0.10.5" serde_with = "3.0.0" pin-weak = "1.1.0" async-lock = "2.8.0" +tokio = { version = "1.35.1", features = ["full"] , optional = true} [dev-dependencies] @@ -74,13 +75,14 @@ members = ["impl"] #features are strictly additive.... 
can't have mutual exclusitivity [features] enable-rofi=["rofisys", "libc"] +tokio-executor=["tokio"] slurm-test=[] default=[] [profile.release] opt-level=3 -lto=true +lto=false codegen-units=1 debug = true diff --git a/examples/active_message_examples/async_comparison.rs b/examples/active_message_examples/async_comparison.rs index cd97397b..510e68cc 100644 --- a/examples/active_message_examples/async_comparison.rs +++ b/examples/active_message_examples/async_comparison.rs @@ -93,7 +93,9 @@ fn main() { for _i in 0..10 { std_am_group.add_am_all(std_am.clone()); //launch multiple tasks asyncronously } - world.block_on(std_am_group.exec()); + world.block_on(async move { + std_am_group.exec().await; + }); println!( "time for std sleep tasks: {:?}", timer.elapsed().as_secs_f64() @@ -106,7 +108,9 @@ fn main() { for _i in 0..10 { async_am_group.add_am_all(async_am.clone()); //launch multiple tasks asyncronously } - world.block_on(async_am_group.exec()); + world.block_on(async move { + async_am_group.exec().await; + }); println!( "time for async sleep tasks: {:?}", timer.elapsed().as_secs_f64() diff --git a/examples/array_examples/array_put_get.rs b/examples/array_examples/array_put_get.rs index fceb7ec5..d162d171 100644 --- a/examples/array_examples/array_put_get.rs +++ b/examples/array_examples/array_put_get.rs @@ -20,7 +20,7 @@ fn initialize_mem_region(memregion: &LamellarMemoryRegion) { fn main() { let args: Vec = std::env::args().collect(); let world = lamellar::LamellarWorldBuilder::new().build(); - world.block_on(async { + world.clone().block_on(async move { let _num_pes = world.num_pes(); let my_pe = world.my_pe(); let total_len = args diff --git a/examples/array_examples/global_lock_array.rs b/examples/array_examples/global_lock_array.rs index 81c0420c..8b904396 100644 --- a/examples/array_examples/global_lock_array.rs +++ b/examples/array_examples/global_lock_array.rs @@ -9,7 +9,7 @@ fn main() { let array = GlobalLockArray::::new(&world, 100, Distribution::Block); let s = Instant::now(); - let local_data = array.block_on(array.read_local_data()); + let local_data = array.blocking_read_local_data(); println!( "PE{my_pe} time: {:?} {:?}", s.elapsed().as_secs_f64(), @@ -19,7 +19,7 @@ fn main() { drop(local_data); //release the lock world.barrier(); - let mut local_data = array.block_on(array.write_local_data()); + let mut local_data = array.blocking_write_local_data(); println!( "PE{my_pe} time: {:?} got write lock", s.elapsed().as_secs_f64() @@ -31,7 +31,7 @@ fn main() { array.print(); println!("PE{my_pe} time: {:?} done", s.elapsed().as_secs_f64()); - let mut local_data = array.block_on(array.collective_write_local_data()); + let mut local_data = array.blocking_collective_write_local_data(); println!( "PE{my_pe} time: {:?} got collective write lock", s.elapsed().as_secs_f64() diff --git a/examples/bandwidths/global_lock_atomic_array_put_bw.rs b/examples/bandwidths/global_lock_atomic_array_put_bw.rs index 7c185123..fa3f257e 100644 --- a/examples/bandwidths/global_lock_atomic_array_put_bw.rs +++ b/examples/bandwidths/global_lock_atomic_array_put_bw.rs @@ -66,9 +66,8 @@ fn main() { array.barrier(); let cur_t = timer.elapsed().as_secs_f64(); if my_pe == 0 { - // let array_slice = unsafe { array.read_local_data() }; //unlike for unsafe arrays, accessing the local data captures a read lock, this would prevent any writes from happening. 
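The example hunks above replace the `array.block_on(array.read_local_data())` pattern with dedicated blocking accessors (`blocking_read_local_data`, `blocking_write_local_data`, `blocking_collective_write_local_data`). A minimal sketch of the new pattern, assuming the `GlobalLockArray` API shown in these examples; the element type, array length, and prelude import are illustrative:

use lamellar::array::prelude::*; // assumed prelude import, following the crate's examples

fn main() {
    let world = lamellar::LamellarWorldBuilder::new().build();
    let my_pe = world.my_pe();
    let array = GlobalLockArray::<usize>::new(&world, 100, Distribution::Block);

    // Acquire the global read lock without an explicit block_on; the call
    // blocks the calling thread until the lock is held.
    let local_data = array.blocking_read_local_data();
    println!("PE{my_pe} first local element: {:?}", local_data.first());
    drop(local_data); // release the read lock before requesting the write lock

    // Same pattern for exclusive access to this PE's portion of the array.
    let mut local_data = array.blocking_write_local_data();
    for elem in local_data.iter_mut() {
        *elem = my_pe;
    }
    drop(local_data); // release the write lock so other PEs can make progress
    world.barrier();
}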
for j in (0..2_u64.pow(exp) as usize).step_by(num_bytes as usize) { - let local_data = array.block_on(array.read_local_data()); + let local_data = array.blocking_read_local_data(); while *(&local_data[(j + num_bytes as usize) - 1]) == 255 as u8 { println!( "this should not happen {:?}", diff --git a/examples/bandwidths/local_lock_atomic_array_put_bw.rs b/examples/bandwidths/local_lock_atomic_array_put_bw.rs index 3ebefdfb..18fa1078 100644 --- a/examples/bandwidths/local_lock_atomic_array_put_bw.rs +++ b/examples/bandwidths/local_lock_atomic_array_put_bw.rs @@ -66,9 +66,9 @@ fn main() { array.barrier(); let cur_t = timer.elapsed().as_secs_f64(); if my_pe == num_pes - 1 { - // let array_slice = unsafe { array.read_local_data() }; //unlike for unsafe arrays, accessing the local data captures a read lock, this would prevent any writes from happening. for j in (0..2_u64.pow(exp) as usize).step_by(num_bytes as usize) { - let local_data = array.block_on(array.read_local_data()); + let array_clone = array.clone(); + let local_data = array.blocking_read_local_data(); while *(&local_data[(j + num_bytes as usize) - 1]) == 255 as u8 { println!( "this should not happen {:?}", diff --git a/examples/bandwidths/task_group_futures_am_bw.rs b/examples/bandwidths/task_group_futures_am_bw.rs index 2dce157e..d07d0abc 100644 --- a/examples/bandwidths/task_group_futures_am_bw.rs +++ b/examples/bandwidths/task_group_futures_am_bw.rs @@ -64,7 +64,7 @@ fn main() { cnt += 1; } println!("issue time: {:?}", timer.elapsed().as_secs_f64() - sub_time); - world.block_on(task_group.exec()); + world.block_on(async move { task_group.exec().await }); } world.barrier(); diff --git a/examples/darc_examples/darc.rs b/examples/darc_examples/darc.rs index 6f7a981a..75bc18f3 100644 --- a/examples/darc_examples/darc.rs +++ b/examples/darc_examples/darc.rs @@ -61,10 +61,10 @@ fn main() { let global_darc = GlobalRwDarc::new(world.team(), 0).unwrap(); println!("here 2"); - let read_lock = world.block_on(global_darc.read()); + let read_lock = global_darc.blocking_read(); println!("I have the read lock!!!! {:?}", my_pe); drop(read_lock); - let write_lock = world.block_on(global_darc.write()); + let write_lock = global_darc.blocking_write(); println!("I have the write lock!!!! 
{:?}", my_pe); std::thread::sleep(std::time::Duration::from_secs(1)); drop(write_lock); @@ -112,7 +112,7 @@ fn main() { println!("here 8"); } else { // println!("here"); - *(*world.block_on(local_darc.write())) += 1; + *local_darc.blocking_write() += 1; } } // -------- diff --git a/examples/darc_examples/string_darc.rs b/examples/darc_examples/string_darc.rs index 84cc74c2..37bf7cbb 100644 --- a/examples/darc_examples/string_darc.rs +++ b/examples/darc_examples/string_darc.rs @@ -19,19 +19,22 @@ impl LamellarAm for StringDarcAm { fn main() { let world = lamellar::LamellarWorldBuilder::new().build(); let my_pe = world.my_pe(); - let string_data = LocalRwDarc::new(&world, format!("Orig String on PE: {}", my_pe)).unwrap(); + world.clone().block_on(async move { + let string_data = + LocalRwDarc::new(&world, format!("Orig String on PE: {}", my_pe)).unwrap(); - println!("[PE: {}] {}", my_pe, world.block_on(string_data.read())); + println!("[PE: {}] {}", my_pe, string_data.read().await); - if my_pe == 0 { - world.block_on(world.exec_am_pe( - 1, - StringDarcAm { - new_data: String::from("Modified string from 0"), - data: string_data.clone(), - }, - )); - } - world.barrier(); - println!("[PE: {}] {}", my_pe, world.block_on(string_data.read())); + if my_pe == 0 { + world.block_on(world.exec_am_pe( + 1, + StringDarcAm { + new_data: String::from("Modified string from 0"), + data: string_data.clone(), + }, + )); + } + world.barrier(); + println!("[PE: {}] {}", my_pe, string_data.read().await); + }); } diff --git a/examples/kernels/dft_proxy.rs b/examples/kernels/dft_proxy.rs index 0b2189ac..f0357a0a 100644 --- a/examples/kernels/dft_proxy.rs +++ b/examples/kernels/dft_proxy.rs @@ -203,6 +203,7 @@ fn dft_lamellar_am_group( ); } let spec = spectrum.clone(); + let world_clone = world.clone(); pe_groups.push_back(async move { let res = local_sum_group.exec().await; let vec = (0..local_len) @@ -214,7 +215,7 @@ fn dft_lamellar_am_group( } }) .collect::>(); - world + world_clone .exec_am_pe( pe, RemoteSumAM { @@ -225,7 +226,7 @@ fn dft_lamellar_am_group( .await; }); } - world.block_on(pe_groups.collect::>()); + world.block_on(async move { pe_groups.collect::>().await }); world.barrier(); let time = timer.elapsed().as_secs_f64(); @@ -261,6 +262,7 @@ fn dft_lamellar_am_group_static( ); } let spec = spectrum.clone(); + let world_clone = world.clone(); pe_groups.push_back(async move { let res = local_sum_group.exec().await; let vec = (0..local_len) @@ -272,7 +274,7 @@ fn dft_lamellar_am_group_static( } }) .collect::>(); - world + world_clone .exec_am_pe( pe, RemoteSumAM { @@ -283,7 +285,9 @@ fn dft_lamellar_am_group_static( .await; }); } - world.block_on(pe_groups.collect::>()); + world.block_on(async move { + pe_groups.collect::>().await; + }); world.barrier(); let time = timer.elapsed().as_secs_f64(); diff --git a/examples/kernels/safe_parallel_blocked_array_gemm.rs b/examples/kernels/safe_parallel_blocked_array_gemm.rs index dd171fdd..a5ed9544 100644 --- a/examples/kernels/safe_parallel_blocked_array_gemm.rs +++ b/examples/kernels/safe_parallel_blocked_array_gemm.rs @@ -197,7 +197,7 @@ fn main() { ); } - let mut c_slice = c.block_on(c.write_local_data()); //this locks the array + let mut c_slice = c.blocking_write_local_data(); //this locks the array for row in 0..blocksize { let row_offset = (i_blk * blocksize + row) * n; diff --git a/src/active_messaging.rs b/src/active_messaging.rs index ea0b37d5..afc8ab6c 100644 --- a/src/active_messaging.rs +++ b/src/active_messaging.rs @@ -638,7 +638,7 @@ use 
crate::lamellar_arch::IdError; use crate::lamellar_request::{InternalResult, LamellarRequestResult}; use crate::lamellar_team::{LamellarTeam, LamellarTeamRT}; use crate::memregion::one_sided::NetMemRegionHandle; -use crate::scheduler::{ReqId, SchedulerQueue}; +use crate::scheduler::{Executor, LamellarExecutor, ReqId}; // use log::trace; use async_trait::async_trait; use futures::Future; @@ -856,9 +856,6 @@ pub(crate) enum Am { Return(ReqMetaData, LamellarArcAm), //req data, am to return and execute Data(ReqMetaData, LamellarResultArc), //req data, data to return Unit(ReqMetaData), //req data - _BatchedReturn(ReqMetaData, LamellarArcAm, ReqId), //req data, am to return and execute, batch id - _BatchedData(ReqMetaData, LamellarResultArc, ReqId), //req data, data to return, batch id - _BatchedUnit(ReqMetaData, ReqId), //req data, batch id } impl std::fmt::Debug for Am { @@ -870,9 +867,6 @@ impl std::fmt::Debug for Am { Am::Return(_, _) => write!(f, "Return"), Am::Data(_, _) => write!(f, "Data"), Am::Unit(_) => write!(f, "Unit"), - Am::_BatchedReturn(_, _, _) => write!(f, "BatchedReturn"), - Am::_BatchedData(_, _, _) => write!(f, "BatchedData"), - Am::_BatchedUnit(_, _) => write!(f, "BatchedUnit"), } } } @@ -1178,27 +1172,25 @@ pub trait ActiveMessaging { /// world_clone.exec_am_all(Am{val: buf[0] as usize}).await; /// }); ///``` - fn block_on(&self, f: F) -> F::Output - where - F: Future; + fn block_on(&self, f: F) -> F::Output; } #[async_trait] pub(crate) trait ActiveMessageEngine { async fn process_msg( - &self, + self, am: Am, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, + scheduler: Arc, stall_mark: usize, immediate: bool, ); async fn exec_msg( - &self, + self, msg: Msg, ser_data: SerializedData, lamellae: Arc, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, + scheduler: Arc, ); fn get_team_and_world( @@ -1232,39 +1224,39 @@ pub(crate) trait ActiveMessageEngine { } } -#[derive(Debug)] -pub(crate) enum ActiveMessageEngineType { - RegisteredActiveMessages(Arc), -} +// #[derive(Debug)] +// pub(crate) enum ActiveMessageEngineType { +// RegisteredActiveMessages(RegisteredActiveMessages), +// } -#[async_trait] -impl ActiveMessageEngine for ActiveMessageEngineType { - async fn process_msg( - &self, - am: Am, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - stall_mark: usize, - immediate: bool, - ) { - match self { - ActiveMessageEngineType::RegisteredActiveMessages(remote_am) => { - remote_am - .process_msg(am, scheduler, stall_mark, immediate) - .await; - } - } - } - async fn exec_msg( - &self, - msg: Msg, - ser_data: SerializedData, - lamellae: Arc, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - ) { - match self { - ActiveMessageEngineType::RegisteredActiveMessages(remote_am) => { - remote_am.exec_msg(msg, ser_data, lamellae, scheduler).await; - } - } - } -} +// #[async_trait] +// impl ActiveMessageEngine for ActiveMessageEngineType { +// async fn process_msg( +// self, +// am: Am, +// executor: Arc, +// stall_mark: usize, +// immediate: bool, +// ) { +// match self { +// ActiveMessageEngineType::RegisteredActiveMessages(remote_am) => { +// remote_am +// .process_msg(am, executor, stall_mark, immediate) +// .await; +// } +// } +// } +// async fn exec_msg( +// self, +// msg: Msg, +// ser_data: SerializedData, +// lamellae: Arc, +// executor: Arc, +// ) { +// match self { +// ActiveMessageEngineType::RegisteredActiveMessages(remote_am) => { +// 
remote_am.exec_msg(msg, ser_data, lamellae, executor).await; +// } +// } +// } +// } diff --git a/src/active_messaging/batching.rs b/src/active_messaging/batching.rs index 1face148..6bbe638d 100644 --- a/src/active_messaging/batching.rs +++ b/src/active_messaging/batching.rs @@ -30,50 +30,41 @@ impl std::fmt::Debug for LamellarData { #[async_trait] pub(crate) trait Batcher { - fn add_remote_am_to_batch( + async fn add_remote_am_to_batch( &self, req_data: ReqMetaData, am: LamellarArcAm, am_id: AmId, am_size: usize, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, stall_mark: usize, ); - fn add_return_am_to_batch( + async fn add_return_am_to_batch( &self, req_data: ReqMetaData, am: LamellarArcAm, am_id: AmId, am_size: usize, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, stall_mark: usize, ); - fn add_data_am_to_batch( + async fn add_data_am_to_batch( &self, req_data: ReqMetaData, data: LamellarResultArc, data_size: usize, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - stall_mark: usize, - ); - fn add_unit_am_to_batch( - &self, - req_data: ReqMetaData, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, stall_mark: usize, ); + async fn add_unit_am_to_batch(&self, req_data: ReqMetaData, stall_mark: usize); async fn exec_batched_msg( &self, msg: Msg, ser_data: SerializedData, lamellae: Arc, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - ame: &Arc, - ); + ame: &RegisteredActiveMessages, + ) -> Vec; } -#[derive(Debug)] +#[derive(Debug, Clone)] pub(crate) enum BatcherType { Simple(SimpleBatcher), TeamAm(TeamAmBatcher), @@ -82,74 +73,78 @@ pub(crate) enum BatcherType { #[async_trait] impl Batcher for BatcherType { #[tracing::instrument(skip_all)] - fn add_remote_am_to_batch( + async fn add_remote_am_to_batch( &self, req_data: ReqMetaData, am: LamellarArcAm, am_id: AmId, am_size: usize, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, stall_mark: usize, ) { match self { BatcherType::Simple(batcher) => { - batcher.add_remote_am_to_batch(req_data, am, am_id, am_size, scheduler, stall_mark) + batcher + .add_remote_am_to_batch(req_data, am, am_id, am_size, stall_mark) + .await } BatcherType::TeamAm(batcher) => { - batcher.add_remote_am_to_batch(req_data, am, am_id, am_size, scheduler, stall_mark) + batcher + .add_remote_am_to_batch(req_data, am, am_id, am_size, stall_mark) + .await } } } #[tracing::instrument(skip_all)] - fn add_return_am_to_batch( + async fn add_return_am_to_batch( &self, req_data: ReqMetaData, am: LamellarArcAm, am_id: AmId, am_size: usize, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, stall_mark: usize, ) { match self { BatcherType::Simple(batcher) => { - batcher.add_return_am_to_batch(req_data, am, am_id, am_size, scheduler, stall_mark) + batcher + .add_return_am_to_batch(req_data, am, am_id, am_size, stall_mark) + .await } BatcherType::TeamAm(batcher) => { - batcher.add_return_am_to_batch(req_data, am, am_id, am_size, scheduler, stall_mark) + batcher + .add_return_am_to_batch(req_data, am, am_id, am_size, stall_mark) + .await } } } #[tracing::instrument(skip_all)] - fn add_data_am_to_batch( + async fn add_data_am_to_batch( &self, req_data: ReqMetaData, data: LamellarResultArc, data_size: usize, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, stall_mark: usize, ) { match self { 
BatcherType::Simple(batcher) => { - batcher.add_data_am_to_batch(req_data, data, data_size, scheduler, stall_mark) + batcher + .add_data_am_to_batch(req_data, data, data_size, stall_mark) + .await } BatcherType::TeamAm(batcher) => { - batcher.add_data_am_to_batch(req_data, data, data_size, scheduler, stall_mark) + batcher + .add_data_am_to_batch(req_data, data, data_size, stall_mark) + .await } } } #[tracing::instrument(skip_all)] - fn add_unit_am_to_batch( - &self, - req_data: ReqMetaData, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - stall_mark: usize, - ) { + async fn add_unit_am_to_batch(&self, req_data: ReqMetaData, stall_mark: usize) { match self { BatcherType::Simple(batcher) => { - batcher.add_unit_am_to_batch(req_data, scheduler, stall_mark) + batcher.add_unit_am_to_batch(req_data, stall_mark).await } BatcherType::TeamAm(batcher) => { - batcher.add_unit_am_to_batch(req_data, scheduler, stall_mark) + batcher.add_unit_am_to_batch(req_data, stall_mark).await } } } @@ -159,19 +154,14 @@ impl Batcher for BatcherType { msg: Msg, ser_data: SerializedData, lamellae: Arc, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - ame: &Arc, - ) { + ame: &RegisteredActiveMessages, + ) -> Vec { match self { BatcherType::Simple(batcher) => { - batcher - .exec_batched_msg(msg, ser_data, lamellae, scheduler, ame) - .await; + batcher.exec_batched_msg(msg, ser_data, lamellae, ame).await } BatcherType::TeamAm(batcher) => { - batcher - .exec_batched_msg(msg, ser_data, lamellae, scheduler, ame) - .await; + batcher.exec_batched_msg(msg, ser_data, lamellae, ame).await } } } diff --git a/src/active_messaging/batching/simple_batcher.rs b/src/active_messaging/batching/simple_batcher.rs index bdbe2dfe..bfb099c7 100644 --- a/src/active_messaging/batching/simple_batcher.rs +++ b/src/active_messaging/batching/simple_batcher.rs @@ -52,7 +52,7 @@ impl SimpleBatcherInner { } } -#[derive(Debug)] +#[derive(Debug, Clone)] pub(crate) struct SimpleBatcher { batched_ams: Arc>, stall_mark: Arc, @@ -61,13 +61,12 @@ pub(crate) struct SimpleBatcher { #[async_trait] impl Batcher for SimpleBatcher { #[tracing::instrument(skip_all)] - fn add_remote_am_to_batch( + async fn add_remote_am_to_batch( &self, req_data: ReqMetaData, am: LamellarArcAm, am_id: AmId, am_size: usize, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, mut stall_mark: usize, ) { // println!("add_remote_am_to_batch"); @@ -93,37 +92,34 @@ impl Batcher for SimpleBatcher { // "[{:?}] add_remote_am_to_batch submit task", // std::thread::current().id() // ); - scheduler.submit_task(async move { - while stall_mark != cur_stall_mark.load(Ordering::SeqCst) - && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE - && batch_id == batch.batch_id.load(Ordering::SeqCst) - { - stall_mark = cur_stall_mark.load(Ordering::Relaxed); - async_std::task::yield_now().await; - } - if batch_id == batch.batch_id.load(Ordering::SeqCst) { - //this batch is still valid - SimpleBatcher::create_tx_task(batch).await; - } - }); + while stall_mark != cur_stall_mark.load(Ordering::SeqCst) + && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE + && batch_id == batch.batch_id.load(Ordering::SeqCst) + { + stall_mark = cur_stall_mark.load(Ordering::Relaxed); + async_std::task::yield_now().await; + } + if batch_id == batch.batch_id.load(Ordering::SeqCst) { + //this batch is still valid + SimpleBatcher::create_tx_task(batch).await; + } } else if size >= MAX_BATCH_SIZE { // println!("remote size: 
{:?} {dst:?}",size); // println!( // "[{:?}] add_remote_am_to_batch submit imm task", // std::thread::current().id() // ); - scheduler.submit_immediate_task(SimpleBatcher::create_tx_task(batch)); + SimpleBatcher::create_tx_task(batch).await; } } #[tracing::instrument(skip_all)] - fn add_return_am_to_batch( + async fn add_return_am_to_batch( &self, req_data: ReqMetaData, am: LamellarArcAm, am_id: AmId, am_size: usize, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, mut stall_mark: usize, ) { // println!("add_return_am_to_batch"); @@ -149,36 +145,33 @@ impl Batcher for SimpleBatcher { // "[{:?}] add_rerturn_am_to_batch submit task", // std::thread::current().id() // ); - scheduler.submit_task(async move { - while stall_mark != cur_stall_mark.load(Ordering::SeqCst) - && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE - && batch_id == batch.batch_id.load(Ordering::SeqCst) - { - stall_mark = cur_stall_mark.load(Ordering::Relaxed); - async_std::task::yield_now().await; - } - if batch_id == batch.batch_id.load(Ordering::SeqCst) { - //this batch is still valid - SimpleBatcher::create_tx_task(batch).await; - } - }); + while stall_mark != cur_stall_mark.load(Ordering::SeqCst) + && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE + && batch_id == batch.batch_id.load(Ordering::SeqCst) + { + stall_mark = cur_stall_mark.load(Ordering::Relaxed); + async_std::task::yield_now().await; + } + if batch_id == batch.batch_id.load(Ordering::SeqCst) { + //this batch is still valid + SimpleBatcher::create_tx_task(batch).await; + } } else if size >= MAX_BATCH_SIZE { // println!("return size: {:?} {dst:?}",size); // println!( // "[{:?}] add_return_am_to_batch submit imm task", // std::thread::current().id() // ); - scheduler.submit_immediate_task(SimpleBatcher::create_tx_task(batch)); + SimpleBatcher::create_tx_task(batch).await; } } #[tracing::instrument(skip_all)] - fn add_data_am_to_batch( + async fn add_data_am_to_batch( &self, req_data: ReqMetaData, data: LamellarResultArc, data_size: usize, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, mut stall_mark: usize, ) { // println!("add_data_am_to_batch"); @@ -207,36 +200,29 @@ impl Batcher for SimpleBatcher { // "[{:?}] add_data_am_to_batch submit task", // std::thread::current().id() // ); - scheduler.submit_task(async move { - while stall_mark != cur_stall_mark.load(Ordering::SeqCst) - && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE - && batch_id == batch.batch_id.load(Ordering::SeqCst) - { - stall_mark = cur_stall_mark.load(Ordering::Relaxed); - async_std::task::yield_now().await; - } - if batch_id == batch.batch_id.load(Ordering::SeqCst) { - //this batch is still valid - SimpleBatcher::create_tx_task(batch).await; - } - }); + while stall_mark != cur_stall_mark.load(Ordering::SeqCst) + && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE + && batch_id == batch.batch_id.load(Ordering::SeqCst) + { + stall_mark = cur_stall_mark.load(Ordering::Relaxed); + async_std::task::yield_now().await; + } + if batch_id == batch.batch_id.load(Ordering::SeqCst) { + //this batch is still valid + SimpleBatcher::create_tx_task(batch).await; + } } else if size >= MAX_BATCH_SIZE { // println!("data size: {:?} {dst:?}",size); // println!( // "[{:?}] add_data_am_to_batch submit imm task", // std::thread::current().id() // ); - scheduler.submit_immediate_task(SimpleBatcher::create_tx_task(batch)); + SimpleBatcher::create_tx_task(batch).await; } } #[tracing::instrument(skip_all)] - fn 
add_unit_am_to_batch( - &self, - req_data: ReqMetaData, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - mut stall_mark: usize, - ) { + async fn add_unit_am_to_batch(&self, req_data: ReqMetaData, mut stall_mark: usize) { // println!("add_unit_am_to_batch"); //let dst =req_data.dst; let batch = match req_data.dst { @@ -256,26 +242,24 @@ impl Batcher for SimpleBatcher { // "[{:?}] add_unit_am_to_batch submit task", // std::thread::current().id() // ); - scheduler.submit_task(async move { - while stall_mark != cur_stall_mark.load(Ordering::SeqCst) - && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE - && batch_id == batch.batch_id.load(Ordering::SeqCst) - { - stall_mark = cur_stall_mark.load(Ordering::Relaxed); - async_std::task::yield_now().await; - } - if batch_id == batch.batch_id.load(Ordering::SeqCst) { - //this batch is still valid - SimpleBatcher::create_tx_task(batch).await; - } - }); + while stall_mark != cur_stall_mark.load(Ordering::SeqCst) + && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE + && batch_id == batch.batch_id.load(Ordering::SeqCst) + { + stall_mark = cur_stall_mark.load(Ordering::Relaxed); + async_std::task::yield_now().await; + } + if batch_id == batch.batch_id.load(Ordering::SeqCst) { + //this batch is still valid + SimpleBatcher::create_tx_task(batch).await; + } } else if size >= MAX_BATCH_SIZE { // println!("unit size: {:?} {dst:?}",size); // println!( // "[{:?}] add_unit_am_to_batch submit imm task", // std::thread::current().id() // ); - scheduler.submit_immediate_task(SimpleBatcher::create_tx_task(batch)); + SimpleBatcher::create_tx_task(batch).await; } } @@ -285,28 +269,29 @@ impl Batcher for SimpleBatcher { msg: Msg, ser_data: SerializedData, lamellae: Arc, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - ame: &Arc, - ) { + ame: &RegisteredActiveMessages, + ) -> Vec { let data = ser_data.data_as_bytes(); let mut i = 0; // println!("executing batched msg {:?}", data.len()); - + let mut return_ams = Vec::new(); while i < data.len() { let cmd: Cmd = crate::deserialize(&data[i..i + *CMD_LEN], false).unwrap(); i += *CMD_LEN; // let temp_i = i; // println!("cmd {:?}", cmd); match cmd { - Cmd::Am => self.exec_am(&msg, data, &mut i, &lamellae, scheduler.clone(), ame), + Cmd::Am => return_ams.push(self.exec_am(&msg, data, &mut i, &lamellae, ame).await), Cmd::ReturnAm => { - self.exec_return_am(&msg, data, &mut i, &lamellae, scheduler.clone(), ame) + self.exec_return_am(&msg, data, &mut i, &lamellae, ame) + .await } Cmd::Data => ame.exec_data_am(&msg, data, &mut i, &ser_data).await, Cmd::Unit => ame.exec_unit_am(&msg, data, &mut i).await, Cmd::BatchedMsg => panic!("should not recieve a batched msg within a batched msg"), } } + return_ams } } @@ -521,15 +506,14 @@ impl SimpleBatcher { } #[tracing::instrument(skip_all)] - fn exec_am( + async fn exec_am( &self, msg: &Msg, data: &[u8], i: &mut usize, lamellae: &Arc, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - ame: &Arc, - ) { + ame: &RegisteredActiveMessages, + ) -> Am { // println!("exec_am"); let am_header: AmHeader = crate::deserialize(&data[*i..*i + *AM_HEADER_LEN], false).unwrap(); @@ -550,39 +534,35 @@ impl SimpleBatcher { team_addr: team.team.remote_ptr_addr, }; // println!("[{:?}] exec_am submit task", std::thread::current().id()); - let scheduler_clone = scheduler.clone(); - let ame_clone = ame.clone(); - scheduler.submit_task(async move { - let am = match am - .exec( - team.team.world_pe, - 
team.team.num_world_pes, - false, - world.clone(), - team.clone(), - ) - .await - { - LamellarReturn::Unit => Am::Unit(req_data), - LamellarReturn::RemoteData(data) => Am::Data(req_data, data), - LamellarReturn::RemoteAm(am) => Am::Return(req_data, am), - LamellarReturn::LocalData(_) | LamellarReturn::LocalAm(_) => { - panic!("Should not be returning local data or AM from remote am"); - } - }; - ame_clone.process_msg(am, scheduler_clone, 0, false).await; - }); + let am = match am + .exec( + team.team.world_pe, + team.team.num_world_pes, + false, + world.clone(), + team.clone(), + ) + .await + { + LamellarReturn::Unit => Am::Unit(req_data), + LamellarReturn::RemoteData(data) => Am::Data(req_data, data), + LamellarReturn::RemoteAm(am) => Am::Return(req_data, am), + LamellarReturn::LocalData(_) | LamellarReturn::LocalAm(_) => { + panic!("Should not be returning local data or AM from remote am"); + } + }; + // ame.process_msg(am, 0, false).await; + am } #[tracing::instrument(skip_all)] - fn exec_return_am( + async fn exec_return_am( &self, msg: &Msg, data: &[u8], i: &mut usize, lamellae: &Arc, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - ame: &Arc, + ame: &RegisteredActiveMessages, ) { // println!("exec_return_am"); let am_header: AmHeader = @@ -606,9 +586,8 @@ impl SimpleBatcher { // "[{:?}] exec_return_am submit task", // std::thread::current().id() // ); - scheduler.submit_task( - ame.clone() - .exec_local_am(req_data, am.as_local(), world, team), - ); + ame.clone() + .exec_local_am(req_data, am.as_local(), world, team) + .await; } } diff --git a/src/active_messaging/batching/team_am_batcher.rs b/src/active_messaging/batching/team_am_batcher.rs index d7c07916..60473bb7 100644 --- a/src/active_messaging/batching/team_am_batcher.rs +++ b/src/active_messaging/batching/team_am_batcher.rs @@ -167,7 +167,7 @@ impl TeamAmBatcherInner { } } -#[derive(Debug)] +#[derive(Debug, Clone)] pub(crate) struct TeamAmBatcher { batched_ams: Arc>, stall_mark: Arc, @@ -176,13 +176,12 @@ pub(crate) struct TeamAmBatcher { #[async_trait] impl Batcher for TeamAmBatcher { #[tracing::instrument(skip_all)] - fn add_remote_am_to_batch( + async fn add_remote_am_to_batch( &self, req_data: ReqMetaData, am: LamellarArcAm, am_id: AmId, am_size: usize, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, mut stall_mark: usize, ) { let batch = match req_data.dst { @@ -198,49 +197,43 @@ impl Batcher for TeamAmBatcher { let batch_id = batch.batch_id.load(Ordering::SeqCst); // println!("remote batch_id {batch_id} created"); let cur_stall_mark = self.stall_mark.clone(); - scheduler.submit_task(async move { - while stall_mark != cur_stall_mark.load(Ordering::SeqCst) - && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE - && batch_id == batch.batch_id.load(Ordering::SeqCst) - { - stall_mark = cur_stall_mark.load(Ordering::Relaxed); - async_std::task::yield_now().await; - } - if batch_id == batch.batch_id.load(Ordering::SeqCst) { - //this batch is still valid - TeamAmBatcher::create_tx_task( - batch, - // stall_mark, - // scheduler, - req_data.lamellae.clone(), - req_data.team.arch.clone(), - req_data.team.world_pe, - ) - .await; - } - }); + while stall_mark != cur_stall_mark.load(Ordering::SeqCst) + && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE + && batch_id == batch.batch_id.load(Ordering::SeqCst) + { + stall_mark = cur_stall_mark.load(Ordering::Relaxed); + async_std::task::yield_now().await; + } + if batch_id == batch.batch_id.load(Ordering::SeqCst) 
{ + //this batch is still valid + TeamAmBatcher::create_tx_task( + batch, + req_data.lamellae.clone(), + req_data.team.arch.clone(), + req_data.team.world_pe, + ) + .await; + } } else if size >= MAX_BATCH_SIZE { //batch is full, transfer now // println!("remote size: {:?}",size); - scheduler.submit_immediate_task(TeamAmBatcher::create_tx_task( + TeamAmBatcher::create_tx_task( batch, - // stall_mark, - // scheduler, req_data.lamellae.clone(), req_data.team.arch.clone(), req_data.team.world_pe, - )); + ) + .await; } } #[tracing::instrument(skip_all)] - fn add_return_am_to_batch( + async fn add_return_am_to_batch( &self, req_data: ReqMetaData, am: LamellarArcAm, am_id: AmId, am_size: usize, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, mut stall_mark: usize, ) { let batch = match req_data.dst { @@ -256,48 +249,43 @@ impl Batcher for TeamAmBatcher { let batch_id = batch.batch_id.load(Ordering::SeqCst); // println!("return batch_id {batch_id} created"); let cur_stall_mark = self.stall_mark.clone(); - scheduler.submit_task(async move { - while stall_mark != cur_stall_mark.load(Ordering::SeqCst) - && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE - && batch_id == batch.batch_id.load(Ordering::SeqCst) - { - stall_mark = cur_stall_mark.load(Ordering::Relaxed); - async_std::task::yield_now().await; - } - if batch_id == batch.batch_id.load(Ordering::SeqCst) { - //this batch is still valid - TeamAmBatcher::create_tx_task( - batch, - // stall_mark, - // scheduler, - req_data.lamellae.clone(), - req_data.team.arch.clone(), - req_data.team.world_pe, - ) - .await; - } - }); + while stall_mark != cur_stall_mark.load(Ordering::SeqCst) + && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE + && batch_id == batch.batch_id.load(Ordering::SeqCst) + { + stall_mark = cur_stall_mark.load(Ordering::Relaxed); + async_std::task::yield_now().await; + } + if batch_id == batch.batch_id.load(Ordering::SeqCst) { + //this batch is still valid + TeamAmBatcher::create_tx_task( + batch, + req_data.lamellae.clone(), + req_data.team.arch.clone(), + req_data.team.world_pe, + ) + .await; + } } else if size >= MAX_BATCH_SIZE { //batch is full, transfer now // println!("return size: {:?}",size); - scheduler.submit_immediate_task(TeamAmBatcher::create_tx_task( + + TeamAmBatcher::create_tx_task( batch, - // stall_mark, - // scheduler, req_data.lamellae.clone(), req_data.team.arch.clone(), req_data.team.world_pe, - )); + ) + .await; } } #[tracing::instrument(skip_all)] - fn add_data_am_to_batch( + async fn add_data_am_to_batch( &self, req_data: ReqMetaData, data: LamellarResultArc, data_size: usize, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, mut stall_mark: usize, ) { let batch = match req_data.dst { @@ -320,48 +308,38 @@ impl Batcher for TeamAmBatcher { let batch_id = batch.batch_id.load(Ordering::SeqCst); // println!("data batch_id {batch_id} created"); let cur_stall_mark = self.stall_mark.clone(); - scheduler.submit_task(async move { - while stall_mark != cur_stall_mark.load(Ordering::SeqCst) - && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE - && batch_id == batch.batch_id.load(Ordering::SeqCst) - { - stall_mark = cur_stall_mark.load(Ordering::Relaxed); - async_std::task::yield_now().await; - } - if batch_id == batch.batch_id.load(Ordering::SeqCst) { - //this batch is still valid - TeamAmBatcher::create_tx_task( - batch, - // stall_mark, - // scheduler, - req_data.lamellae.clone(), - req_data.team.arch.clone(), - req_data.team.world_pe, - ) - 
.await; - } - }); + while stall_mark != cur_stall_mark.load(Ordering::SeqCst) + && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE + && batch_id == batch.batch_id.load(Ordering::SeqCst) + { + stall_mark = cur_stall_mark.load(Ordering::Relaxed); + async_std::task::yield_now().await; + } + if batch_id == batch.batch_id.load(Ordering::SeqCst) { + //this batch is still valid + TeamAmBatcher::create_tx_task( + batch, + req_data.lamellae.clone(), + req_data.team.arch.clone(), + req_data.team.world_pe, + ) + .await; + } } else if size >= MAX_BATCH_SIZE { //batch is full, transfer now // println!("data size: {:?}",size); - scheduler.submit_immediate_task(TeamAmBatcher::create_tx_task( + TeamAmBatcher::create_tx_task( batch, - // stall_mark, - // scheduler, req_data.lamellae.clone(), req_data.team.arch.clone(), req_data.team.world_pe, - )); + ) + .await; } } #[tracing::instrument(skip_all)] - fn add_unit_am_to_batch( - &self, - req_data: ReqMetaData, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - mut stall_mark: usize, - ) { + async fn add_unit_am_to_batch(&self, req_data: ReqMetaData, mut stall_mark: usize) { let batch = match req_data.dst { Some(dst) => self.batched_ams[dst].clone(), None => self.batched_ams.last().unwrap().clone(), @@ -375,38 +353,33 @@ impl Batcher for TeamAmBatcher { let batch_id = batch.batch_id.load(Ordering::SeqCst); // println!("unit batch_id {batch_id} created"); let cur_stall_mark = self.stall_mark.clone(); - scheduler.submit_task(async move { - while stall_mark != cur_stall_mark.load(Ordering::SeqCst) - && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE - && batch_id == batch.batch_id.load(Ordering::SeqCst) - { - stall_mark = cur_stall_mark.load(Ordering::Relaxed); - async_std::task::yield_now().await; - } - if batch_id == batch.batch_id.load(Ordering::SeqCst) { - //this batch is still valid - TeamAmBatcher::create_tx_task( - batch, - // stall_mark, - // scheduler, - req_data.lamellae.clone(), - req_data.team.arch.clone(), - req_data.team.world_pe, - ) - .await; - } - }); + while stall_mark != cur_stall_mark.load(Ordering::SeqCst) + && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE + && batch_id == batch.batch_id.load(Ordering::SeqCst) + { + stall_mark = cur_stall_mark.load(Ordering::Relaxed); + async_std::task::yield_now().await; + } + if batch_id == batch.batch_id.load(Ordering::SeqCst) { + //this batch is still valid + TeamAmBatcher::create_tx_task( + batch, + req_data.lamellae.clone(), + req_data.team.arch.clone(), + req_data.team.world_pe, + ) + .await; + } } else if size >= MAX_BATCH_SIZE { //batch is full, transfer now // println!("unit size: {:?}",size); - scheduler.submit_immediate_task(TeamAmBatcher::create_tx_task( + TeamAmBatcher::create_tx_task( batch, - // stall_mark, - // scheduler, req_data.lamellae.clone(), req_data.team.arch.clone(), req_data.team.world_pe, - )); + ) + .await; } } @@ -416,9 +389,8 @@ impl Batcher for TeamAmBatcher { msg: Msg, ser_data: SerializedData, lamellae: Arc, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - ame: &Arc, - ) { + ame: &RegisteredActiveMessages, + ) -> Vec { let data = ser_data.data_as_bytes(); let mut i = 0; // println!("i: {:?} dl {:?} cl {:?}", i, data.len(), *CMD_LEN); @@ -435,18 +407,12 @@ impl Batcher for TeamAmBatcher { Cmd::Data => ame.exec_data_am(&msg, data, &mut i, &ser_data).await, Cmd::Unit => ame.exec_unit_am(&msg, data, &mut i).await, Cmd::BatchedMsg => { - self.exec_batched_am( - &msg, - batch.cnt, - data, - &mut i, 
- &lamellae, - scheduler.clone(), - &ame, - ); + self.exec_batched_am(&msg, batch.cnt, data, &mut i, &lamellae, &ame) + .await; } } } + Vec::new() } } @@ -707,16 +673,16 @@ impl TeamAmBatcher { } #[tracing::instrument(skip_all)] - fn exec_batched_am( + async fn exec_batched_am( &self, msg: &Msg, batch_cnt: usize, data: &[u8], i: &mut usize, lamellae: &Arc, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - ame: &Arc, - ) { + ame: &RegisteredActiveMessages, + ) -> Vec { + let mut return_ams = Vec::new(); for _team in 0..batch_cnt { let team_header: TeamHeader = crate::deserialize(&data[*i..*i + *TEAM_HEADER_LEN], false).unwrap(); @@ -733,48 +699,50 @@ impl TeamAmBatcher { for _am in 0..batched_am_header.am_cnt { // println!("am cmd: {:?}", batched_am_header.cmd); match batched_am_header.cmd { - Cmd::Am => self.exec_am( - msg, - data, - i, - lamellae, - scheduler.clone(), - ame, - batched_am_header.am_id, - world.clone(), - team.clone(), - ), - Cmd::ReturnAm => self.exec_return_am( - msg, - data, - i, - lamellae, - scheduler.clone(), - ame, - batched_am_header.am_id, - world.clone(), - team.clone(), + Cmd::Am => return_ams.push( + self.exec_am( + msg, + data, + i, + lamellae, + batched_am_header.am_id, + world.clone(), + team.clone(), + ) + .await, ), + Cmd::ReturnAm => { + self.exec_return_am( + msg, + data, + i, + lamellae, + ame, + batched_am_header.am_id, + world.clone(), + team.clone(), + ) + .await + } _ => panic!("unhandled cmd"), } } } } + return_ams } #[tracing::instrument(skip_all)] - fn exec_am( + async fn exec_am( &self, msg: &Msg, data: &[u8], i: &mut usize, lamellae: &Arc, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - ame: &Arc, am_id: AmId, world: Arc, team: Arc, - ) { + ) -> Am { let req_id = crate::deserialize(&data[*i..*i + *REQ_ID_LEN], false).unwrap(); *i += *REQ_ID_LEN; let am = AMS_EXECS.get(&am_id).unwrap()(&data[*i..], team.team.team_pe); @@ -789,39 +757,36 @@ impl TeamAmBatcher { team: team.team.clone(), team_addr: team.team.remote_ptr_addr, }; - let scheduler_clone = scheduler.clone(); - let ame_clone = ame.clone(); - scheduler.submit_task(async move { - let am = match am - .exec( - team.team.world_pe, - team.team.num_world_pes, - false, - world.clone(), - team.clone(), - ) - .await - { - LamellarReturn::Unit => Am::Unit(req_data), - LamellarReturn::RemoteData(data) => Am::Data(req_data, data), - LamellarReturn::RemoteAm(am) => Am::Return(req_data, am), - LamellarReturn::LocalData(_) | LamellarReturn::LocalAm(_) => { - panic!("Should not be returning local data or AM from remote am"); - } - }; - ame_clone.process_msg(am, scheduler_clone, 0, false).await; - }); + + let am = match am + .exec( + team.team.world_pe, + team.team.num_world_pes, + false, + world.clone(), + team.clone(), + ) + .await + { + LamellarReturn::Unit => Am::Unit(req_data), + LamellarReturn::RemoteData(data) => Am::Data(req_data, data), + LamellarReturn::RemoteAm(am) => Am::Return(req_data, am), + LamellarReturn::LocalData(_) | LamellarReturn::LocalAm(_) => { + panic!("Should not be returning local data or AM from remote am"); + } + }; + am + // ame.process_msg(am, 0, false).await; } #[tracing::instrument(skip_all)] - fn exec_return_am( + async fn exec_return_am( &self, msg: &Msg, data: &[u8], i: &mut usize, lamellae: &Arc, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - ame: &Arc, + ame: &RegisteredActiveMessages, am_id: AmId, world: Arc, team: Arc, @@ -840,9 +805,9 @@ impl 
TeamAmBatcher { team: team.team.clone(), team_addr: team.team.remote_ptr_addr, }; - scheduler.submit_task( - ame.clone() - .exec_local_am(req_data, am.as_local(), world, team), - ); + + ame.clone() + .exec_local_am(req_data, am.as_local(), world, team) + .await; } } diff --git a/src/active_messaging/registered_active_message.rs b/src/active_messaging/registered_active_message.rs index 3a71cea0..fce885ec 100644 --- a/src/active_messaging/registered_active_message.rs +++ b/src/active_messaging/registered_active_message.rs @@ -6,7 +6,6 @@ use crate::lamellae::{ SerializedData, SubData, }; -use crate::scheduler::SchedulerQueue; use async_recursion::async_recursion; // use log::trace; use std::sync::Arc; @@ -62,7 +61,7 @@ pub struct RegisteredAm { } crate::inventory::collect!(RegisteredAm); -#[derive(Debug)] +#[derive(Debug, Clone)] pub(crate) struct RegisteredActiveMessages { batcher: BatcherType, } @@ -100,9 +99,9 @@ pub(crate) struct UnitHeader { impl ActiveMessageEngine for Arc { #[tracing::instrument(skip_all)] async fn process_msg( - &self, + self, am: Am, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, + executor: Arc, stall_mark: usize, immediate: bool, ) { @@ -118,14 +117,15 @@ impl ActiveMessageEngine for Arc { { // println!(" {} {} {}, {}, {}",req_data.team.lamellae.backend() != Backend::Local,req_data.team.num_pes() > 1, req_data.team.team_pe_id().is_err(),(req_data.team.num_pes() > 1 || req_data.team.team_pe_id().is_err()),req_data.team.lamellae.backend() != Backend::Local && (req_data.team.num_pes() > 1 || req_data.team.team_pe_id().is_err()) ); if am_size < crate::active_messaging::BATCH_AM_SIZE && !immediate { - self.batcher.add_remote_am_to_batch( - req_data.clone(), - am.clone(), - am_id, - am_size, - scheduler, - stall_mark, - ); + self.batcher + .add_remote_am_to_batch( + req_data.clone(), + am.clone(), + am_id, + am_size, + stall_mark, + ) + .await; } else { self.send_am(req_data.clone(), am.clone(), am_id, am_size, Cmd::Am) .await; @@ -150,9 +150,9 @@ impl ActiveMessageEngine for Arc { let am_id = *(AMS_IDS.get(&am.get_id()).unwrap()); let am_size = am.serialized_size(); if am_size < crate::active_messaging::BATCH_AM_SIZE && !immediate { - self.batcher.add_remote_am_to_batch( - req_data, am, am_id, am_size, scheduler, stall_mark, - ); + self.batcher + .add_remote_am_to_batch(req_data, am, am_id, am_size, stall_mark) + .await; } else { self.send_am(req_data, am, am_id, am_size, Cmd::Am).await; } @@ -168,9 +168,9 @@ impl ActiveMessageEngine for Arc { let am_id = *(AMS_IDS.get(&am.get_id()).unwrap()); let am_size = am.serialized_size(); if am_size < crate::active_messaging::BATCH_AM_SIZE && !immediate { - self.batcher.add_return_am_to_batch( - req_data, am, am_id, am_size, scheduler, stall_mark, - ); + self.batcher + .add_return_am_to_batch(req_data, am, am_id, am_size, stall_mark) + .await; } else { self.send_am(req_data, am, am_id, am_size, Cmd::ReturnAm) .await; @@ -181,7 +181,8 @@ impl ActiveMessageEngine for Arc { let data_size = data.serialized_size(); if data_size < crate::active_messaging::BATCH_AM_SIZE && !immediate { self.batcher - .add_data_am_to_batch(req_data, data, data_size, scheduler, stall_mark); + .add_data_am_to_batch(req_data, data, data_size, stall_mark) + .await; } else { self.send_data_am(req_data, data, data_size).await; } @@ -189,60 +190,31 @@ impl ActiveMessageEngine for Arc { Am::Unit(req_data) => { if *UNIT_HEADER_LEN < crate::active_messaging::BATCH_AM_SIZE && !immediate { self.batcher - 
.add_unit_am_to_batch(req_data, scheduler, stall_mark); + .add_unit_am_to_batch(req_data, stall_mark) + .await; } else { self.send_unit_am(req_data).await; } } - Am::_BatchedReturn(_req_data, _func, _batch_id) => { - // let func_id = *(AMS_IDS.get(&func.get_id()).unwrap()); - // let func_size = func.serialized_size(); - // if func_size <= crate::active_messaging::BATCH_AM_SIZE { - // self.batcher - // .add_batched_return_am_to_batch( - // req_data, func, func_id, func_size, batch_id, scheduler,stall_mark - // ) - // .await; - // } else { - // self.send_batched_return_am( - // req_data, func, func_id, func_size, batch_id, scheduler, - // ) - // .await; - // } - } - Am::_BatchedData(_req_data, _data, _batch_id) => { - // let data_size = data.serialized_size(); - // if data_size <= crate::active_messaging::BATCH_AM_SIZE { - // self.add_batched_data_am_to_batch( - // req_data, data, data_size, batch_id, scheduler,stall_mark - // ) - // .await; - // } else { - // self.send_batched_data_am(req_data, data, data_size, batch_id, scheduler) - // .await; - // } - } - Am::_BatchedUnit(_req_data, _batch_id) => { - // self.add_batched_unit_am_to_batch(req_data, batch_id, scheduler,stall_mark) - // .await; - } } } #[tracing::instrument(skip_all)] async fn exec_msg( - &self, + self, msg: Msg, ser_data: SerializedData, lamellae: Arc, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, + executor: Arc, ) { // println!("exec_msg"); let data = ser_data.data_as_bytes(); let mut i = 0; match msg.cmd { Cmd::Am => { - self.exec_am(&msg, data, &mut i, &lamellae, scheduler).await; + let return_am = self.exec_am(&msg, data, &mut i, &lamellae).await; + let process_task = self.process_msg(return_am, executor.clone(), 0, false); + executor.submit_task(process_task); } Cmd::ReturnAm => { self.exec_return_am(&msg, data, &mut i, &lamellae).await; @@ -254,9 +226,15 @@ impl ActiveMessageEngine for Arc { self.exec_unit_am(&msg, data, &mut i).await; } Cmd::BatchedMsg => { - self.batcher - .exec_batched_msg(msg, ser_data, lamellae, scheduler, self) + let ams = self + .batcher + .exec_batched_msg(msg, ser_data, lamellae, &self) .await; + let am_tasks = futures::stream::FuturesUnordered::new(); + for am in ams.into_iter() { + am_tasks.push(self.clone().process_msg(am, executor.clone(), 0, false)); + } + executor.submit_task(futures::future::join_all(am_tasks)); } } } @@ -459,8 +437,7 @@ impl RegisteredActiveMessages { data: &[u8], i: &mut usize, lamellae: &Arc, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - ) { + ) -> Am { // println!("exec_am"); let am_header: AmHeader = crate::deserialize(&data[*i..*i + *AM_HEADER_LEN], false).unwrap(); @@ -498,9 +475,8 @@ impl RegisteredActiveMessages { panic!("Should not be returning local data or AM from remote am"); } }; - self.process_msg(am, scheduler, 0, false).await; //0 just means we will force a stall_count loop - // scheduler.submit_am(am); - //TODO: compare against: scheduler.submit_am(ame, am).await; + am + // self.process_msg(am, 0, false).await; //0 just means we will force a stall_count loop } #[tracing::instrument(skip_all)] diff --git a/src/array.rs b/src/array.rs index b50861df..5fe5eed4 100644 --- a/src/array.rs +++ b/src/array.rs @@ -663,7 +663,7 @@ impl crate::active_messaging::DarcSerde for LamellarWriteArra } impl LamellarArrayReduce for LamellarReadArray { - fn reduce(&self, reduction: &str) -> Pin>> { + fn reduce(&self, reduction: &str) -> Pin + Send>> { match self { 
LamellarReadArray::UnsafeArray(array) => unsafe { array.reduce(reduction) }, LamellarReadArray::AtomicArray(array) => array.reduce(reduction), @@ -677,7 +677,7 @@ impl LamellarArrayReduce for LamellarReadArray impl LamellarArrayArithmeticReduce for LamellarReadArray { - fn sum(&self) -> Pin>> { + fn sum(&self) -> Pin + Send>> { match self { LamellarReadArray::UnsafeArray(array) => unsafe { array.sum() }, LamellarReadArray::AtomicArray(array) => array.sum(), @@ -686,7 +686,7 @@ impl LamellarArrayArithmeticR LamellarReadArray::ReadOnlyArray(array) => array.sum(), } } - fn prod(&self) -> Pin>> { + fn prod(&self) -> Pin + Send>> { match self { LamellarReadArray::UnsafeArray(array) => unsafe { array.prod() }, LamellarReadArray::AtomicArray(array) => array.prod(), @@ -699,7 +699,7 @@ impl LamellarArrayArithmeticR impl LamellarArrayCompareReduce for LamellarReadArray { - fn max(&self) -> Pin>> { + fn max(&self) -> Pin + Send>> { match self { LamellarReadArray::UnsafeArray(array) => unsafe { array.max() }, LamellarReadArray::AtomicArray(array) => array.max(), @@ -708,7 +708,7 @@ impl LamellarArrayCompa LamellarReadArray::ReadOnlyArray(array) => array.max(), } } - fn min(&self) -> Pin>> { + fn min(&self) -> Pin + Send>> { match self { LamellarReadArray::UnsafeArray(array) => unsafe { array.min() }, LamellarReadArray::AtomicArray(array) => array.min(), @@ -720,7 +720,7 @@ impl LamellarArrayCompa } impl LamellarArrayReduce for LamellarWriteArray { - fn reduce(&self, reduction: &str) -> Pin>> { + fn reduce(&self, reduction: &str) -> Pin + Send>> { match self { LamellarWriteArray::UnsafeArray(array) => unsafe { array.reduce(reduction) }, LamellarWriteArray::AtomicArray(array) => array.reduce(reduction), @@ -732,7 +732,7 @@ impl LamellarArrayReduce for LamellarWriteArray LamellarArrayArithmeticReduce for LamellarWriteArray { - fn sum(&self) -> Pin>> { + fn sum(&self) -> Pin + Send>> { match self { LamellarWriteArray::UnsafeArray(array) => unsafe { array.sum() }, LamellarWriteArray::AtomicArray(array) => array.sum(), @@ -740,7 +740,7 @@ impl LamellarArrayArithmeticR LamellarWriteArray::GlobalLockArray(array) => array.sum(), } } - fn prod(&self) -> Pin>> { + fn prod(&self) -> Pin + Send>> { match self { LamellarWriteArray::UnsafeArray(array) => unsafe { array.prod() }, LamellarWriteArray::AtomicArray(array) => array.prod(), @@ -753,7 +753,7 @@ impl LamellarArrayArithmeticR impl LamellarArrayCompareReduce for LamellarWriteArray { - fn max(&self) -> Pin>> { + fn max(&self) -> Pin + Send>> { match self { LamellarWriteArray::UnsafeArray(array) => unsafe { array.max() }, LamellarWriteArray::AtomicArray(array) => array.max(), @@ -761,7 +761,7 @@ impl LamellarArrayCompa LamellarWriteArray::GlobalLockArray(array) => array.max(), } } - fn min(&self) -> Pin>> { + fn min(&self) -> Pin + Send>> { match self { LamellarWriteArray::UnsafeArray(array) => unsafe { array.min() }, LamellarWriteArray::AtomicArray(array) => array.min(), @@ -991,9 +991,7 @@ pub trait LamellarArray: private::LamellarArrayPrivate { /// let result = array.block_on(request); //block until am has executed /// // we also could have used world.block_on() or team.block_on() ///``` - fn block_on(&self, f: F) -> F::Output - where - F: Future; + fn block_on(&self, f: F) -> F::Output; #[doc(alias("One-sided", "onesided"))] /// Given a global index, calculate the PE and offset on that PE where the element actually resides. 
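The recurring signature change in these array.rs hunks adds a `Send` bound to the boxed futures returned by the reduction interfaces (`reduce`, `sum`, `prod`, `max`, `min`). That matters for the executor refactor above: the tokio-backed `submit_task` requires `Send` futures, so a reduction can now be moved into an async block and driven by the runtime instead of only being awaited directly on the calling thread. A rough sketch of the enabled pattern, assuming an `AtomicArray` constructed as in the crate's doc examples; element type, length, and the prelude import are illustrative:

use lamellar::array::prelude::*; // assumed prelude import, following the crate's examples

fn main() {
    let world = lamellar::LamellarWorldBuilder::new().build();
    let array = AtomicArray::<usize>::new(&world, 100, Distribution::Block);
    world.barrier();

    // Because the boxed reduction future is now `Send`, it can be moved into
    // an `async move` block, the same block_on pattern this patch series
    // adopts throughout the examples.
    let sum = world.clone().block_on(async move { array.sum().await });
    println!("PE{} observed sum: {}", world.my_pe(), sum);
}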
@@ -1610,7 +1608,7 @@ where /// let sum = array.block_on(array.reduce("sum")); // equivalent to calling array.sum() /// assert_eq!(array.len()*num_pes,sum); ///``` - fn reduce(&self, reduction: &str) -> Pin>>; + fn reduce(&self, reduction: &str) -> Pin + Send>>; } /// Interface for common arithmetic based reductions @@ -1643,7 +1641,7 @@ where /// let sum = array.block_on(array.sum()); /// assert_eq!(array.len()*num_pes,sum); ///``` - fn sum(&self) -> Pin>>; + fn sum(&self) -> Pin + Send>>; #[doc(alias("One-sided", "onesided"))] /// Perform a product reduction on the entire distributed array, returning the value to the calling PE. @@ -1668,7 +1666,7 @@ where /// let prod = array.block_on(array.prod()); /// assert_eq!((1..=array.len()).product::(),prod); ///``` - fn prod(&self) -> Pin>>; + fn prod(&self) -> Pin + Send>>; } /// Interface for common compare based reductions @@ -1696,7 +1694,7 @@ where /// let max = array.block_on(array.max()); /// assert_eq!((array.len()-1)*2,max); ///``` - fn max(&self) -> Pin>>; + fn max(&self) -> Pin + Send>>; #[doc(alias("One-sided", "onesided"))] /// Find the min element in the entire distributed array, returning to the calling PE @@ -1718,7 +1716,7 @@ where /// let min = array.block_on(array.min()); /// assert_eq!(0,min); ///``` - fn min(&self) -> Pin>>; + fn min(&self) -> Pin + Send>>; } /// This procedural macro is used to enable the execution of user defined reductions on LamellarArrays. diff --git a/src/array/atomic.rs b/src/array/atomic.rs index 9a4a68f4..896876b0 100644 --- a/src/array/atomic.rs +++ b/src/array/atomic.rs @@ -1160,7 +1160,7 @@ impl From for AtomicArray { } impl LamellarArrayReduce for AtomicArray { - fn reduce(&self, reduction: &str) -> Pin>> { + fn reduce(&self, reduction: &str) -> Pin + Send>> { match self { AtomicArray::NativeAtomicArray(array) => array.reduce(reduction), AtomicArray::GenericAtomicArray(array) => array.reduce(reduction), @@ -1171,13 +1171,13 @@ impl LamellarArrayReduce for AtomicArray { impl LamellarArrayArithmeticReduce for AtomicArray { - fn sum(&self) -> Pin>> { + fn sum(&self) -> Pin + Send>> { match self { AtomicArray::NativeAtomicArray(array) => array.sum(), AtomicArray::GenericAtomicArray(array) => array.sum(), } } - fn prod(&self) -> Pin>> { + fn prod(&self) -> Pin + Send>> { match self { AtomicArray::NativeAtomicArray(array) => array.prod(), AtomicArray::GenericAtomicArray(array) => array.prod(), @@ -1187,13 +1187,13 @@ impl LamellarArrayArithmeticR impl LamellarArrayCompareReduce for AtomicArray { - fn max(&self) -> Pin>> { + fn max(&self) -> Pin + Send>> { match self { AtomicArray::NativeAtomicArray(array) => array.max(), AtomicArray::GenericAtomicArray(array) => array.max(), } } - fn min(&self) -> Pin>> { + fn min(&self) -> Pin + Send>> { match self { AtomicArray::NativeAtomicArray(array) => array.min(), AtomicArray::GenericAtomicArray(array) => array.min(), diff --git a/src/array/generic_atomic.rs b/src/array/generic_atomic.rs index 2924f8d8..382059a4 100644 --- a/src/array/generic_atomic.rs +++ b/src/array/generic_atomic.rs @@ -712,10 +712,7 @@ impl LamellarArray for GenericAtomicArray { self.array.wait_all() // println!("done in wait all {:?}",std::time::SystemTime::now()); } - fn block_on(&self, f: F) -> F::Output - where - F: Future, - { + fn block_on(&self, f: F) -> F::Output { self.array.block_on(f) } fn pe_and_offset_for_global_index(&self, index: usize) -> Option<(usize, usize)> { @@ -795,7 +792,7 @@ impl ArrayPrint for GenericAtomicArray { } impl LamellarArrayReduce for
GenericAtomicArray { - fn reduce(&self, op: &str) -> Pin>> { + fn reduce(&self, op: &str) -> Pin + Send>> { self.array .reduce_data(op, self.clone().into()) .into_future() @@ -804,20 +801,20 @@ impl LamellarArrayReduce for GenericAtomicArray LamellarArrayArithmeticReduce for GenericAtomicArray { - fn sum(&self) -> Pin>> { + fn sum(&self) -> Pin + Send>> { self.reduce("sum") } - fn prod(&self) -> Pin>> { + fn prod(&self) -> Pin + Send>> { self.reduce("prod") } } impl LamellarArrayCompareReduce for GenericAtomicArray { - fn max(&self) -> Pin>> { + fn max(&self) -> Pin + Send>> { self.reduce("max") } - fn min(&self) -> Pin>> { + fn min(&self) -> Pin + Send>> { self.reduce("min") } } diff --git a/src/array/global_lock_atomic.rs b/src/array/global_lock_atomic.rs index 9f766681..6b9ff9ef 100644 --- a/src/array/global_lock_atomic.rs +++ b/src/array/global_lock_atomic.rs @@ -25,7 +25,7 @@ use std::ops::{Deref, DerefMut}; /// Direct RDMA operations can occur if the appropriate lock is held. #[lamellar_impl::AmDataRT(Clone, Debug)] pub struct GlobalLockArray { - lock: GlobalRwDarc<()>, + pub(crate) lock: GlobalRwDarc<()>, pub(crate) array: UnsafeArray, } @@ -70,27 +70,26 @@ impl GlobalLockByteArrayWeak { /// /// When the instance is dropped the lock is released. #[derive(Debug)] -pub struct GlobalLockMutLocalData<'a, T: Dist> { - data: &'a mut [T], - _index: usize, +pub struct GlobalLockMutLocalData { + pub(crate) array: GlobalLockArray, _lock_guard: GlobalRwDarcWriteGuard<()>, } -// impl Drop for GlobalLockMutLocalData<'_, T>{ +// impl Drop for GlobalLockMutLocalData{ // fn drop(&mut self){ // println!("release lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); // } // } -impl Deref for GlobalLockMutLocalData<'_, T> { +impl Deref for GlobalLockMutLocalData { type Target = [T]; fn deref(&self) -> &Self::Target { - self.data + unsafe { self.array.array.local_as_mut_slice() } } } -impl DerefMut for GlobalLockMutLocalData<'_, T> { +impl DerefMut for GlobalLockMutLocalData { fn deref_mut(&mut self) -> &mut Self::Target { - self.data + unsafe { self.array.array.local_as_mut_slice() } } } @@ -103,27 +102,26 @@ impl DerefMut for GlobalLockMutLocalData<'_, T> { /// /// When each PE drops its instance, the lock is release. #[derive(Debug)] -pub struct GlobalLockCollectiveMutLocalData<'a, T: Dist> { - data: &'a mut [T], - _index: usize, +pub struct GlobalLockCollectiveMutLocalData { + pub(crate) array: GlobalLockArray, _lock_guard: GlobalRwDarcCollectiveWriteGuard<()>, } -// impl Drop for GlobalLockCollectiveMutLocalData<'_, T>{ +// impl Drop for GlobalLockCollectiveMutLocalData{ // fn drop(&mut self){ // println!("release lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); // } // } -impl Deref for GlobalLockCollectiveMutLocalData<'_, T> { +impl Deref for GlobalLockCollectiveMutLocalData { type Target = [T]; fn deref(&self) -> &Self::Target { - self.data + unsafe { self.array.array.local_as_mut_slice() } } } -impl DerefMut for GlobalLockCollectiveMutLocalData<'_, T> { +impl DerefMut for GlobalLockCollectiveMutLocalData { fn deref_mut(&mut self) -> &mut Self::Target { - self.data + unsafe { self.array.array.local_as_mut_slice() } } } @@ -135,33 +133,29 @@ impl DerefMut for GlobalLockCollectiveMutLocalData<'_, T> { /// (allowing for the safe deref into `&[T]`), preventing any local or remote write access. /// /// When the instance is dropped the lock is released. 
-pub struct GlobalLockLocalData<'a, T: Dist> { +pub struct GlobalLockLocalData { pub(crate) array: GlobalLockArray, - pub(crate) data: &'a [T], - index: usize, lock: GlobalRwDarc<()>, lock_guard: GlobalRwDarcReadGuard<()>, } -impl<'a, T: Dist + std::fmt::Debug> std::fmt::Debug for GlobalLockLocalData<'a, T> { +impl std::fmt::Debug for GlobalLockLocalData { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{:?}", self.data) + write!(f, "{:?}", self.deref()) } } -impl<'a, T: Dist> Clone for GlobalLockLocalData<'a, T> { +impl Clone for GlobalLockLocalData { fn clone(&self) -> Self { GlobalLockLocalData { array: self.array.clone(), - data: self.data, - index: self.index, lock: self.lock.clone(), lock_guard: self.lock_guard.clone(), } } } -impl<'a, T: Dist> GlobalLockLocalData<'a, T> { +impl GlobalLockLocalData { /// Convert into a smaller sub range of the local data, the original read lock is transferred to the new sub data to maintain safety guarantees /// /// # Examples ///``` /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); /// /// let sub_data = local_data.clone().into_sub_data(10,20); // clone() essentially increases the references to the read lock by 1. /// assert_eq!(local_data[10],sub_data[0]); ///``` - pub fn into_sub_data(self, start: usize, end: usize) -> GlobalLockLocalData<'a, T> { + pub fn into_sub_data(self, start: usize, end: usize) -> GlobalLockLocalData { GlobalLockLocalData { - array: self.array.clone(), - data: &self.data[start..end], - index: 0, + array: self.array.sub_array(start..end), lock: self.lock, lock_guard: self.lock_guard, } } } -impl<'a, T: Dist + serde::Serialize> serde::Serialize for GlobalLockLocalData<'a, T> { +impl serde::Serialize for GlobalLockLocalData { fn serialize(&self, serializer: S) -> Result where S: serde::Serializer, { - self.data.serialize(serializer) + unsafe { self.array.array.local_as_mut_slice() }.serialize(serializer) } } -impl<'a, T: Dist> Iterator for GlobalLockLocalData<'a, T> { +pub struct GlobalLockLocalDataIter<'a, T: Dist> { + data: &'a [T], + index: usize, +} + +impl<'a, T: Dist> Iterator for GlobalLockLocalDataIter<'a, T> { type Item = &'a T; fn next(&mut self) -> Option { if self.index < self.data.len() { @@ -209,11 +206,22 @@ impl<'a, T: Dist> Iterator for GlobalLockLocalData<'a, T> { } } -impl Deref for GlobalLockLocalData<'_, T> { +impl<'a, T: Dist> IntoIterator for &'a GlobalLockLocalData { + type Item = &'a T; + type IntoIter = GlobalLockLocalDataIter<'a, T>; + fn into_iter(self) -> Self::IntoIter { + GlobalLockLocalDataIter { + data: unsafe { self.array.array.local_as_mut_slice() }, + index: 0, + } + } +} + +impl Deref for GlobalLockLocalData { type Target = [T]; fn deref(&self) -> &Self::Target { - self.data + unsafe { self.array.array.local_as_mut_slice() } } } @@ -268,36 +276,37 @@ impl GlobalLockArray { } } - // #[doc(alias("One-sided", "onesided"))] - // /// Return the calling PE's local data as a [GlobalLockLocalData], which allows safe immutable access to local elements. - // /// - // /// Calling this function will result in a local read lock being captured on the array - // /// - // /// This function is blocking and intended to be called from non asynchronous contexts. - // /// Calling within an asynchronous block may lead to deadlock.
- // /// - // /// # One-sided Operation - // /// Only returns local data on the calling PE - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); - // /// - // /// let local_data = array.read_local_data(); - // /// println!("PE{my_pe} data: {local_data:?}"); - // ///``` - // pub fn read_local_data(&self) -> GlobalLockLocalData<'_, T> { - // GlobalLockLocalData { - // array: self.clone(), - // data: unsafe { self.array.local_as_mut_slice() }, - // index: 0, - // lock: self.lock.clone(), - // lock_guard: self.lock.read(), - // } - // } + #[doc(alias("One-sided", "onesided"))] + /// Return the calling PE's local data as a [GlobalLockLocalData], which allows safe immutable access to local elements. + /// + /// Calling this function will result in a local read lock being captured on the array + /// + /// This function is blocking and intended to be called from non asynchronous contexts. + /// Calling within an asynchronous block may lead to deadlock. + /// + /// # One-sided Operation + /// Only returns local data on the calling PE + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// + /// let local_data = array.blocking_read_local_data(); + /// println!("PE{my_pe} data: {local_data:?}"); + ///``` + pub fn blocking_read_local_data(&self) -> GlobalLockLocalData { + let self_clone: GlobalLockArray = self.clone(); + self.block_on(async move { + GlobalLockLocalData { + array: self_clone.clone(), + lock: self_clone.lock.clone(), + lock_guard: self_clone.lock.read().await, + } + }) + } #[doc(alias("One-sided", "onesided"))] /// Return the calling PE's local data as a [GlobalLockLocalData], which allows safe immutable access to local elements. @@ -314,52 +323,54 @@ impl GlobalLockArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); + /// world.clone().block_on(async move { /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); /// - /// let local_data = array.block_on(array.read_local_data()); + /// let local_data = array.read_local_data().await; /// println!("PE{my_pe} data: {local_data:?}"); + /// }); ///``` - pub async fn read_local_data(&self) -> GlobalLockLocalData<'_, T> { + pub async fn read_local_data(&self) -> GlobalLockLocalData { GlobalLockLocalData { array: self.clone(), - data: unsafe { self.array.local_as_mut_slice() }, - index: 0, lock: self.lock.clone(), lock_guard: self.lock.read().await, } } - // #[doc(alias("One-sided", "onesided"))] - // /// Return the calling PE's local data as a [GlobalLockMutLocalData], which allows safe mutable access to local elements. - // /// - // /// Calling this function will result in the global write lock being captured on the array. - // ///. - // /// This function is blocking and intended to be called from non asynchronous contexts. - // /// Calling within an asynchronous block may lead to deadlock. 
- // /// - // /// # One-sided Operation - // /// Only returns (mutable) local data on the calling PE - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); - // /// - // /// let local_data = array.write_local_data(); - // /// println!("PE{my_pe} data: {local_data:?}"); - // ///``` - // pub fn write_local_data(&self) -> GlobalLockMutLocalData<'_, T> { - // let lock = self.lock.write(); - // let data = GlobalLockMutLocalData { - // data: unsafe { self.array.local_as_mut_slice() }, - // _index: 0, - // _lock_guard: lock, - // }; - // // println!("got lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); - // data - // } + #[doc(alias("One-sided", "onesided"))] + /// Return the calling PE's local data as a [GlobalLockMutLocalData], which allows safe mutable access to local elements. + /// + /// Calling this function will result in the global write lock being captured on the array. + ///. + /// This function is blocking and intended to be called from non asynchronous contexts. + /// Calling within an asynchronous block may lead to deadlock. + /// + /// # One-sided Operation + /// Only returns (mutable) local data on the calling PE + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// + /// let local_data = array.blocking_write_local_data(); + /// println!("PE{my_pe} data: {local_data:?}"); + ///``` + pub fn blocking_write_local_data(&self) -> GlobalLockMutLocalData { + let self_clone: GlobalLockArray = self.clone(); + self.block_on(async move { + let lock = self_clone.lock.write().await; + let data = GlobalLockMutLocalData { + array: self_clone, + _lock_guard: lock, + }; + // println!("got lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); + data + }) + } #[doc(alias("One-sided", "onesided"))] /// Return the calling PE's local data as a [GlobalLockMutLocalData], which allows safe mutable access to local elements. @@ -376,23 +387,23 @@ impl GlobalLockArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// world.clone().block_on(async move { + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); /// - /// let local_data = array.block_on(array.write_local_data()); - /// println!("PE{my_pe} data: {local_data:?}"); + /// let local_data = array.write_local_data().await; + /// println!("PE{my_pe} data: {local_data:?}"); + /// }); ///``` - pub async fn write_local_data(&self) -> GlobalLockMutLocalData<'_, T> { + pub async fn write_local_data(&self) -> GlobalLockMutLocalData { let lock = self.lock.write().await; let data = GlobalLockMutLocalData { - data: unsafe { self.array.local_as_mut_slice() }, - _index: 0, + array: self.clone(), _lock_guard: lock, }; // println!("got lock! 
{:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); data } - #[doc(alias("Collective"))] /// Return the calling PE's local data as a [GlobalLockMutLocalData], which allows safe mutable access to local elements. /// /// Calling this function will result in the collective write lock being captured on the array @@ -409,97 +420,54 @@ impl GlobalLockArray { /// let my_pe = world.my_pe(); /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); /// - /// let local_data = array.block_on(array.collective_write_local_data()); + /// let local_data = array.blocking_collective_write_local_data(); /// println!("PE{my_pe} data: {local_data:?}"); ///``` - pub async fn collective_write_local_data(&self) -> GlobalLockCollectiveMutLocalData<'_, T> { + pub fn blocking_collective_write_local_data(&self) -> GlobalLockCollectiveMutLocalData { + let self_clone: GlobalLockArray = self.clone(); + self.block_on(async move { + let lock = self_clone.lock.collective_write().await; + let data = GlobalLockCollectiveMutLocalData { + array: self_clone, + _lock_guard: lock, + }; + // println!("got lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); + data + }) + } + + #[doc(alias("Collective"))] + /// Return the calling PE's local data as a [GlobalLockMutLocalData], which allows safe mutable access to local elements. + /// + /// Calling this function will result in the collective write lock being captured on the array + /// + /// # Collective Operation + /// All PEs associated with this array must enter the call, otherwise deadlock will occur. + /// Upon return every PE will hold a special collective write lock so that they can all access their local data simultaneous + /// This lock prevents any other access from occuring on the array until it is dropped on all the PEs. + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// world.clone().block_on(async move { + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// + /// let local_data = array.collective_write_local_data().await; + /// println!("PE{my_pe} data: {local_data:?}"); + /// }); + ///``` + pub async fn collective_write_local_data(&self) -> GlobalLockCollectiveMutLocalData { let lock = self.lock.collective_write().await; let data = GlobalLockCollectiveMutLocalData { - data: unsafe { self.array.local_as_mut_slice() }, - _index: 0, + array: self.clone(), _lock_guard: lock, }; // println!("got lock! 
{:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); data } - // #[doc(hidden)] //todo create a custom macro to emit a warning saying use read_local_slice/write_local_slice intead - // pub(crate) async fn local_as_slice(&self) -> GlobalLockLocalData<'_, T> { - // let the_lock = self.lock.read().await; - // GlobalLockLocalData { - // array: self.clone(), - // data: unsafe { self.array.local_as_mut_slice() }, - // index: 0, - // lock: self.lock.clone(), - // lock_guard: the_lock, - // } - // } - // #[doc(hidden)] - // pub unsafe fn local_as_mut_slice(&self) -> &mut [T] { - // self.array.local_as_mut_slice() - // } - - // #[doc(hidden)] - // pub(crate) async fn local_as_mut_slice(&self) -> GlobalLockMutLocalData<'_, T> { - // let the_lock = self.lock.write().await; - // let lock = GlobalLockMutLocalData { - // data: unsafe { self.array.local_as_mut_slice() }, - // _index: 0, - // _lock_guard: the_lock, - // }; - // // println!("have lla write lock"); - // // println!("got lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); - // lock - // } - - // #[doc(alias("One-sided", "onesided"))] - // /// Return the calling PE's local data as a [GlobalLockLocalData], which allows safe immutable access to local elements. - // /// - // /// Calling this function will result in a local read lock being captured on the array - // /// - // /// While this call is safe, it may be more clear to use the [read_local_data()][GlobalLockArray::read_local_data] function. - // /// - // /// # One-sided Operation - // /// Only returns local data on the calling PE - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); - // /// - // /// let local_data = array.block_on(array.local_data()); - // /// println!("PE{my_pe} data: {local_data:?}"); - // ///``` - // pub async fn local_data(&self) -> GlobalLockLocalData<'_, T> { - // self.local_as_slice().await - // } - - // #[doc(alias("One-sided", "onesided"))] - // /// Return the calling PE's local data as a [GlobalLockMutLocalData], which allows safe immutable access to local elements. - // /// - // /// Calling this function will result in a local read lock being captured on the array - // /// - // /// While this call is safe, it may be more clear to use the [write_local_data()][GlobalLockArray::write_local_data] function. 
- // /// - // /// # One-sided Operation - // /// Only returns local data on the calling PE - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); - // /// - // /// let local_data = array.block_on(array.mut_local_data()); - // /// println!("PE{my_pe} data: {local_data:?}"); - // ///``` - // pub async fn mut_local_data(&self) -> GlobalLockMutLocalData<'_, T> { - // self.local_as_mut_slice().await - // } - #[doc(hidden)] pub unsafe fn __local_as_slice(&self) -> &[T] { self.array.local_as_mut_slice() @@ -834,10 +802,7 @@ impl LamellarArray for GlobalLockArray { self.array.wait_all() // println!("done in wait all {:?}",std::time::SystemTime::now()); } - fn block_on(&self, f: F) -> F::Output - where - F: Future, - { + fn block_on(&self, f: F) -> F::Output { self.array.block_on(f) } fn pe_and_offset_for_global_index(&self, index: usize) -> Option<(usize, usize)> { @@ -938,8 +903,9 @@ impl LamellarRequest for GlobalLockArrayReduceHandle { } impl LamellarArrayReduce for GlobalLockArray { - fn reduce(&self, op: &str) -> Pin>> { - let lock = self.array.block_on(self.lock.read()); + fn reduce(&self, op: &str) -> Pin + Send>> { + let lock: GlobalRwDarc<()> = self.lock.clone(); + let lock = self.array.block_on(async move { lock.read().await }); Box::new(GlobalLockArrayReduceHandle { req: self.array.reduce_data(op, self.clone().into()), _lock_guard: lock, @@ -950,20 +916,20 @@ impl LamellarArrayReduce for GlobalLockArray { impl LamellarArrayArithmeticReduce for GlobalLockArray { - fn sum(&self) -> Pin>> { + fn sum(&self) -> Pin + Send>> { self.reduce("sum") } - fn prod(&self) -> Pin>> { + fn prod(&self) -> Pin + Send>> { self.reduce("prod") } } impl LamellarArrayCompareReduce for GlobalLockArray { - fn max(&self) -> Pin>> { + fn max(&self) -> Pin + Send>> { self.reduce("max") } - fn min(&self) -> Pin>> { + fn min(&self) -> Pin + Send>> { self.reduce("min") } } diff --git a/src/array/global_lock_atomic/iteration.rs b/src/array/global_lock_atomic/iteration.rs index 37a4c168..70c4db61 100644 --- a/src/array/global_lock_atomic/iteration.rs +++ b/src/array/global_lock_atomic/iteration.rs @@ -301,7 +301,8 @@ impl LamellarArrayIterators for GlobalLockArray { type OnesidedIter = OneSidedIter<'static, T, Self>; fn dist_iter(&self) -> Self::DistIter { - let lock = self.array.block_on(self.lock.read()); + let lock: GlobalRwDarc<()> = self.lock.clone(); + let lock = self.array.block_on(async move { lock.read().await }); self.barrier(); GlobalLockDistIter { data: self.clone(), @@ -313,7 +314,8 @@ impl LamellarArrayIterators for GlobalLockArray { } fn local_iter(&self) -> Self::LocalIter { - let lock = self.array.block_on(self.lock.read()); + let lock: GlobalRwDarc<()> = self.lock.clone(); + let lock = self.array.block_on(async move { lock.read().await }); GlobalLockLocalIter { data: self.clone(), lock: lock, @@ -341,7 +343,11 @@ impl LamellarArrayMutIterators for GlobalLockArray { type LocalIter = GlobalLockLocalIterMut; fn dist_iter_mut(&self) -> Self::DistIter { - let lock = Arc::new(self.array.block_on(self.lock.collective_write())); + let lock: GlobalRwDarc<()> = self.lock.clone(); + let lock = Arc::new( + self.array + .block_on(async move { lock.collective_write().await }), + ); self.barrier(); // println!("dist_iter thread {:?} got lock",std::thread::current().id()); GlobalLockDistIterMut { @@ -354,7 
+360,8 @@ impl LamellarArrayMutIterators for GlobalLockArray { } fn local_iter_mut(&self) -> Self::LocalIter { - let lock = Arc::new(self.array.block_on(self.lock.write())); + let lock: GlobalRwDarc<()> = self.lock.clone(); + let lock = Arc::new(self.array.block_on(async move { lock.write().await })); GlobalLockLocalIterMut { data: self.clone(), lock: lock, diff --git a/src/array/iterator/distributed_iterator/consumer/count.rs b/src/array/iterator/distributed_iterator/consumer/count.rs index 2d58ceca..76229c8f 100644 --- a/src/array/iterator/distributed_iterator/consumer/count.rs +++ b/src/array/iterator/distributed_iterator/consumer/count.rs @@ -4,7 +4,6 @@ use crate::array::iterator::distributed_iterator::DistributedIterator; use crate::array::iterator::IterRequest; use crate::lamellar_request::LamellarRequest; use crate::lamellar_team::LamellarTeamRT; -use crate::scheduler::SchedulerQueue; use crate::Darc; use async_trait::async_trait; @@ -72,7 +71,7 @@ impl LamellarAm for UpdateCntAm { } impl RemoteIterCountHandle { - async fn reduce_remote_counts(&self, local_cnt: usize, cnt: Darc) -> usize { + async fn reduce_remote_counts(self, local_cnt: usize, cnt: Darc) -> usize { self.team .exec_am_all(UpdateCntAm { remote_cnt: local_cnt, @@ -111,6 +110,7 @@ impl IterRequest for RemoteIterCountHandle { .sum::(); self.team .scheduler + .clone() .block_on(self.reduce_remote_counts(count, cnt)) } } diff --git a/src/array/iterator/distributed_iterator/consumer/reduce.rs b/src/array/iterator/distributed_iterator/consumer/reduce.rs index de7dede8..ea2ce0b8 100644 --- a/src/array/iterator/distributed_iterator/consumer/reduce.rs +++ b/src/array/iterator/distributed_iterator/consumer/reduce.rs @@ -72,7 +72,7 @@ where F: Fn(T, T) -> T + SyncSend + Clone + 'static, { fn reduce_remote_vals(&self, local_val: Option) -> Option { - self.team.barrier(); + self.team.tasking_barrier(); let local_vals = UnsafeArray::>::new(&self.team, self.team.num_pes, Distribution::Block); unsafe { diff --git a/src/array/iterator/one_sided_iterator/buffered.rs b/src/array/iterator/one_sided_iterator/buffered.rs index 8a42178a..09650d96 100644 --- a/src/array/iterator/one_sided_iterator/buffered.rs +++ b/src/array/iterator/one_sided_iterator/buffered.rs @@ -1,7 +1,5 @@ use crate::array::iterator::one_sided_iterator::*; use crate::array::LamellarArrayRequest; -// use crate::LamellarArray; -// use crate::scheduler::SchedulerQueue; use crate::memregion::OneSidedMemoryRegion; use std::collections::VecDeque; use std::ops::Deref; @@ -91,8 +89,6 @@ impl Deref for BufferedItem { } } - - impl OneSidedIterator for Buffered where I: OneSidedIterator + Send, diff --git a/src/array/local_lock_atomic.rs b/src/array/local_lock_atomic.rs index 9467692d..d897e922 100644 --- a/src/array/local_lock_atomic.rs +++ b/src/array/local_lock_atomic.rs @@ -73,27 +73,26 @@ impl LocalLockByteArrayWeak { /// /// When the instance is dropped the lock is released. #[derive(Debug)] -pub struct LocalLockMutLocalData<'a, T: Dist> { - data: &'a mut [T], - _index: usize, - _lock_guard: RwLockWriteGuardArc>, +pub struct LocalLockMutLocalData { + array: LocalLockArray, + _lock_guard: RwLockWriteGuardArc<()>, } -// impl Drop for LocalLockMutLocalData<'_, T> { +// impl Drop for LocalLockMutLocalData { // fn drop(&mut self) { // // println!("release lock! 
{:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); // } // } -impl Deref for LocalLockMutLocalData<'_, T> { +impl Deref for LocalLockMutLocalData { type Target = [T]; fn deref(&self) -> &Self::Target { - self.data + unsafe { self.array.array.local_as_mut_slice() } } } -impl DerefMut for LocalLockMutLocalData<'_, T> { +impl DerefMut for LocalLockMutLocalData { fn deref_mut(&mut self) -> &mut Self::Target { - self.data + unsafe { self.array.array.local_as_mut_slice() } } } @@ -106,28 +105,24 @@ impl DerefMut for LocalLockMutLocalData<'_, T> { /// /// When the instance is dropped the lock is released. #[derive(Debug)] -pub struct LocalLockLocalData<'a, T: Dist> { +pub struct LocalLockLocalData { pub(crate) array: LocalLockArray, - pub(crate) data: &'a [T], - index: usize, lock: LocalRwDarc<()>, - lock_guard: Arc>>, + lock_guard: Arc>, } -impl<'a, T: Dist> Clone for LocalLockLocalData<'a, T> { +impl<'a, T: Dist> Clone for LocalLockLocalData { fn clone(&self) -> Self { // println!("getting read lock in LocalLockLocalData clone"); LocalLockLocalData { array: self.array.clone(), - data: self.data, - index: self.index, lock: self.lock.clone(), lock_guard: self.lock_guard.clone(), } } } -// impl<'a, T: Dist> Drop for LocalLockLocalData<'a, T> { +// impl<'a, T: Dist> Drop for LocalLockLocalData { // fn drop(&mut self) { // println!( // "dropping read lock {:?}", @@ -136,13 +131,13 @@ impl<'a, T: Dist> Clone for LocalLockLocalData<'a, T> { // } // } -// impl<'a, T: Dist> Drop for LocalLockMutLocalData<'a, T> { +// impl<'a, T: Dist> Drop for LocalLockMutLocalData { // fn drop(&mut self) { // println!("dropping write lock"); // } // } -impl<'a, T: Dist> LocalLockLocalData<'a, T> { +impl<'a, T: Dist> LocalLockLocalData { /// Convert into a smaller sub range of the local data, the original read lock is transfered to the new sub data to mainitain safety guarantees /// /// # Examples @@ -158,27 +153,30 @@ impl<'a, T: Dist> LocalLockLocalData<'a, T> { /// let sub_data = local_data.clone().into_sub_data(10,20); // clone() essentially increases the references to the read lock by 1. 
/// assert_eq!(local_data[10],sub_data[0]); ///``` - pub fn into_sub_data(self, start: usize, end: usize) -> LocalLockLocalData<'a, T> { + pub fn into_sub_data(self, start: usize, end: usize) -> LocalLockLocalData { LocalLockLocalData { - array: self.array.clone(), - data: &self.data[start..end], - index: 0, + array: self.array.sub_array(start..end), lock: self.lock.clone(), lock_guard: self.lock_guard.clone(), } } } -impl<'a, T: Dist + serde::Serialize> serde::Serialize for LocalLockLocalData<'a, T> { +impl<'a, T: Dist + serde::Serialize> serde::Serialize for LocalLockLocalData { fn serialize(&self, serializer: S) -> Result where S: serde::Serializer, { - self.data.serialize(serializer) + unsafe { self.array.array.local_as_mut_slice() }.serialize(serializer) } } -impl<'a, T: Dist> Iterator for LocalLockLocalData<'a, T> { +pub struct LocalLockLocalDataIter<'a, T: Dist> { + data: &'a [T], + index: usize, +} + +impl<'a, T: Dist> Iterator for LocalLockLocalDataIter<'a, T> { type Item = &'a T; fn next(&mut self) -> Option { if self.index < self.data.len() { @@ -190,11 +188,22 @@ impl<'a, T: Dist> Iterator for LocalLockLocalData<'a, T> { } } -impl Deref for LocalLockLocalData<'_, T> { +impl<'a, T: Dist> IntoIterator for &'a LocalLockLocalData { + type Item = &'a T; + type IntoIter = LocalLockLocalDataIter<'a, T>; + fn into_iter(self) -> Self::IntoIter { + LocalLockLocalDataIter { + data: unsafe { self.array.array.local_as_mut_slice() }, + index: 0, + } + } +} + +impl Deref for LocalLockLocalData { type Target = [T]; fn deref(&self) -> &Self::Target { - self.data + unsafe { self.array.array.local_as_mut_slice() } } } @@ -250,34 +259,35 @@ impl LocalLockArray { } } - // #[doc(alias("One-sided", "onesided"))] - // /// Return the calling PE's local data as a [LocalLockLocalData], which allows safe immutable access to local elements. - // /// - // /// Calling this function will result in a local read lock being captured on the array - // /// - // /// # One-sided Operation - // /// Only returns local data on the calling PE - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); - // /// - // /// let local_data = array.read_local_data(); - // /// println!("PE{my_pe} data: {local_data:?}"); - // ///``` - // pub fn read_local_data(&self) -> LocalLockLocalData<'_, T> { - // // println!("getting read lock in read_local_local"); - // LocalLockLocalData { - // array: self.clone(), - // data: unsafe { self.array.local_as_mut_slice() }, - // index: 0, - // lock: self.lock.clone(), - // lock_guard: Arc::new(self.lock.read()), - // } - // } + #[doc(alias("One-sided", "onesided"))] + /// Return the calling PE's local data as a [LocalLockLocalData], which allows safe immutable access to local elements. 
+ /// + /// Calling this function will result in a local read lock being captured on the array + /// + /// # One-sided Operation + /// Only returns local data on the calling PE + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// + /// let local_data = array.blocking_read_local_data(); + /// println!("PE{my_pe} data: {local_data:?}"); + ///``` + pub fn blocking_read_local_data(&self) -> LocalLockLocalData { + // println!("getting read lock in read_local_local"); + let self_clone: LocalLockArray = self.clone(); + self.block_on(async move { + LocalLockLocalData { + array: self_clone.clone(), + lock: self_clone.lock.clone(), + lock_guard: Arc::new(self_clone.lock.read().await), + } + }) + } /// TODO: UPDATE /// Return the calling PE's local data as a [LocalLockLocalData], which allows safe immutable access to local elements. @@ -292,51 +302,53 @@ impl LocalLockArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// world.clone().block_on(async move { + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); /// - /// let local_data = array.block_on(array.read_local_data()); - /// println!("PE{my_pe} data: {local_data:?}"); + /// let local_data = array.read_local_data().await; + /// println!("PE{my_pe} data: {local_data:?}"); + /// }); ///``` - pub async fn read_local_data(&self) -> LocalLockLocalData<'_, T> { + pub async fn read_local_data(&self) -> LocalLockLocalData { // println!("getting read lock in read_local_local"); LocalLockLocalData { array: self.clone(), - data: unsafe { self.array.local_as_mut_slice() }, - index: 0, lock: self.lock.clone(), lock_guard: Arc::new(self.lock.read().await), } } - // #[doc(alias("One-sided", "onesided"))] - // /// Return the calling PE's local data as a [LocalLockMutLocalData], which allows safe mutable access to local elements. - // /// - // /// Calling this function will result in the local write lock being captured on the array - // /// - // /// # One-sided Operation - // /// Only returns (mutable) local data on the calling PE - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); - // /// - // /// let local_data = array.write_local_data(); - // /// println!("PE{my_pe} data: {local_data:?}"); - // ///``` - // pub fn write_local_data(&self) -> LocalLockMutLocalData<'_, T> { - // // println!("getting write lock in write_local_data"); - // let lock = self.lock.write(); - // let data = LocalLockMutLocalData { - // data: unsafe { self.array.local_as_mut_slice() }, - // _index: 0, - // _lock_guard: lock, - // }; - // // println!("got lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); - // data - // } + #[doc(alias("One-sided", "onesided"))] + /// Return the calling PE's local data as a [LocalLockMutLocalData], which allows safe mutable access to local elements. 
+ /// + /// Calling this function will result in the local write lock being captured on the array + /// + /// # One-sided Operation + /// Only returns (mutable) local data on the calling PE + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// + /// let local_data = array.blocking_write_local_data(); + /// println!("PE{my_pe} data: {local_data:?}"); + ///``` + pub fn blocking_write_local_data(&self) -> LocalLockMutLocalData { + // println!("getting write lock in write_local_data"); + let self_clone: LocalLockArray = self.clone(); + self.block_on(async move { + let lock = self_clone.lock.write().await; + let data = LocalLockMutLocalData { + array: self_clone, + _lock_guard: lock, + }; + // println!("got lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); + data + }) + } #[doc(alias("One-sided", "onesided"))] /// TODO: UPDATE @@ -352,150 +364,24 @@ impl LocalLockArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// world.clone().block_on(async move { + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); /// - /// let local_data = array.block_on(array.write_local_data()); - /// println!("PE{my_pe} data: {local_data:?}"); + /// let local_data = array.write_local_data().await; + /// println!("PE{my_pe} data: {local_data:?}"); + /// }); ///``` - pub async fn write_local_data(&self) -> LocalLockMutLocalData<'_, T> { + pub async fn write_local_data(&self) -> LocalLockMutLocalData { // println!("getting write lock in write_local_data"); let lock = self.lock.write().await; let data = LocalLockMutLocalData { - data: unsafe { self.array.local_as_mut_slice() }, - _index: 0, + array: self.clone(), _lock_guard: lock, }; // println!("got lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); data } - // #[doc(hidden)] //todo create a custom macro to emit a warning saying use read_local_slice/write_local_slice intead - // pub(crate) async fn local_as_slice(&self) -> LocalLockLocalData<'_, T> { - // // println!("getting read lock in local_as_slice"); - // let lock = LocalLockLocalData { - // array: self.clone(), - // data: unsafe { self.array.local_as_mut_slice() }, - // index: 0, - // lock: self.lock.clone(), - // lock_guard: Arc::new(self.lock.read().await), - // }; - // // println!("got read lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); - // lock - // } - // #[doc(hidden)] - // pub unsafe fn local_as_mut_slice(&self) -> &mut [T] { - // self.array.local_as_mut_slice() - // } - - // #[doc(hidden)] - // pub(crate) async fn local_as_mut_slice(&self) -> LocalLockMutLocalData<'_, T> { - // // println!("getting write lock in local_as_mut_slice"); - // let the_lock = self.lock.write().await; - // let lock = LocalLockMutLocalData { - // data: unsafe { self.array.local_as_mut_slice() }, - // _index: 0, - // _lock_guard: the_lock, - // }; - // // println!("have lla write lock"); - // // println!("got write lock! 
{:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); - // lock - // } - - // #[doc(alias("One-sided", "onesided"))] - // /// Return the calling PE's local data as a [LocalLockLocalData], which allows safe immutable access to local elements. - // /// - // /// Calling this function will result in a local read lock being captured on the array - // /// - // /// While this call is safe, it may be more clear to use the [read_local_data()][LocalLockArray::read_local_data] function. - // /// - // /// # One-sided Operation - // /// Only returns local data on the calling PE - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); - // /// - // /// let local_data = array.local_data(); - // /// println!("PE{my_pe} data: {local_data:?}"); - // ///``` - // pub fn local_data(&self) -> LocalLockLocalData<'_, T> { - // self.local_as_slice() - // } - - // /// Return the calling PE's local data as a [LocalLockLocalData], which allows safe immutable access to local elements. - // /// - // /// Calling this function will result in a local read lock being captured on the array - // /// - // /// While this call is safe, it may be more clear to use the [read_local_data()][LocalLockArray::read_local_data] function. - // /// - // /// # One-sided Operation - // /// Only returns local data on the calling PE - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); - // /// - // /// let local_data = array.block_on(array.local_data()); - // /// println!("PE{my_pe} data: {local_data:?}"); - // ///``` - // pub async fn local_data(&self) -> LocalLockLocalData<'_, T> { - // self.read_local_data().await - // } - - // #[doc(alias("One-sided", "onesided"))] - // /// Return the calling PE's local data as a [LocalLockMutLocalData], which allows safe immutable access to local elements. - // /// - // /// Calling this function will result in a local read lock being captured on the array - // /// - // /// While this call is safe, it may be more clear to use the [write_local_data()][LocalLockArray::write_local_data] function. - // /// - // /// # One-sided Operation - // /// Only returns local data on the calling PE - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); - // /// - // /// let local_data = array.mut_local_data(); - // /// println!("PE{my_pe} data: {local_data:?}"); - // ///``` - // pub fn mut_local_data(&self) -> LocalLockMutLocalData<'_, T> { - // self.local_as_mut_slice() - // } - - // /// TODO: UPDATE - // /// Return the calling PE's local data as a [LocalLockMutLocalData], which allows safe immutable access to local elements. - // /// - // /// Calling this function will result in a local read lock being captured on the array - // /// - // /// While this call is safe, it may be more clear to use the [write_local_data()][LocalLockArray::write_local_data] function. 
- // /// - // /// # One-sided Operation - // /// Only returns local data on the calling PE - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); - // /// - // /// let local_data = array.block_on(array.mut_local_data()); - // /// println!("PE{my_pe} data: {local_data:?}"); - // ///``` - // pub async fn mut_local_data(&self) -> LocalLockMutLocalData<'_, T> { - // self.write_local_data().await - // } - #[doc(hidden)] pub unsafe fn __local_as_slice(&self) -> &[T] { self.array.local_as_mut_slice() @@ -830,10 +716,7 @@ impl LamellarArray for LocalLockArray { self.array.wait_all() // println!("done in wait all {:?}",std::time::SystemTime::now()); } - fn block_on(&self, f: F) -> F::Output - where - F: Future, - { + fn block_on(&self, f: F) -> F::Output { self.array.block_on(f) } fn pe_and_offset_for_global_index(&self, index: usize) -> Option<(usize, usize)> { @@ -916,7 +799,7 @@ impl ArrayPrint for LocalLockArray { #[doc(hidden)] pub struct LocalLockArrayReduceHandle { req: Box>, - _lock_guard: RwLockReadGuardArc>, + _lock_guard: RwLockReadGuardArc<()>, } #[async_trait] @@ -931,8 +814,9 @@ impl LamellarRequest for LocalLockArrayReduceHandle { } impl LamellarArrayReduce for LocalLockArray { - fn reduce(&self, op: &str) -> Pin>> { - let lock = self.array.block_on(self.lock.read()); + fn reduce(&self, op: &str) -> Pin + Send>> { + let lock: LocalRwDarc<()> = self.lock.clone(); + let lock = self.array.block_on(async move { lock.read().await }); Box::new(LocalLockArrayReduceHandle { req: self.array.reduce_data(op, self.clone().into()), _lock_guard: lock, @@ -943,20 +827,20 @@ impl LamellarArrayReduce for LocalLockArray { impl LamellarArrayArithmeticReduce for LocalLockArray { - fn sum(&self) -> Pin>> { + fn sum(&self) -> Pin + Send>> { self.reduce("sum") } - fn prod(&self) -> Pin>> { + fn prod(&self) -> Pin + Send>> { self.reduce("prod") } } impl LamellarArrayCompareReduce for LocalLockArray { - fn max(&self) -> Pin>> { + fn max(&self) -> Pin + Send>> { self.reduce("max") } - fn min(&self) -> Pin>> { + fn min(&self) -> Pin + Send>> { self.reduce("min") } } diff --git a/src/array/local_lock_atomic/iteration.rs b/src/array/local_lock_atomic/iteration.rs index 6de62980..a1d4479c 100644 --- a/src/array/local_lock_atomic/iteration.rs +++ b/src/array/local_lock_atomic/iteration.rs @@ -20,7 +20,7 @@ use async_lock::{RwLockReadGuardArc, RwLockWriteGuardArc}; #[derive(Clone)] pub struct LocalLockDistIter<'a, T: Dist> { data: LocalLockArray, - lock: Arc>>, + lock: Arc>, cur_i: usize, end_i: usize, _marker: PhantomData<&'a T>, @@ -42,7 +42,7 @@ impl<'a, T: Dist> std::fmt::Debug for LocalLockDistIter<'a, T> { #[derive(Clone)] pub struct LocalLockLocalIter<'a, T: Dist> { data: LocalLockArray, - lock: Arc>>, + lock: Arc>, cur_i: usize, end_i: usize, _marker: PhantomData<&'a T>, @@ -158,7 +158,7 @@ impl IndexedLocalIterator for LocalLockLocalIter<'static, T> #[derive(Clone)] pub struct LocalLockDistIterMut<'a, T: Dist> { data: LocalLockArray, - lock: Arc>>, + lock: Arc>, cur_i: usize, end_i: usize, _marker: PhantomData<&'a T>, @@ -179,7 +179,7 @@ impl<'a, T: Dist> std::fmt::Debug for LocalLockDistIterMut<'a, T> { #[derive(Clone)] pub struct LocalLockLocalIterMut<'a, T: Dist> { data: LocalLockArray, - lock: Arc>>, + lock: Arc>, cur_i: usize, end_i: usize, _marker: PhantomData<&'a T>, @@ -305,7 +305,9 @@ 
impl LamellarArrayIterators for LocalLockArray { type OnesidedIter = OneSidedIter<'static, T, Self>; fn dist_iter(&self) -> Self::DistIter { - let lock = Arc::new(self.array.block_on(self.lock.read())); + // let the_array: LocalLockArray = self.clone(); + let lock: LocalRwDarc<()> = self.lock.clone(); + let lock = Arc::new(self.array.block_on(async move { lock.read().await })); self.barrier(); LocalLockDistIter { data: self.clone(), @@ -317,7 +319,8 @@ impl LamellarArrayIterators for LocalLockArray { } fn local_iter(&self) -> Self::LocalIter { - let lock = Arc::new(self.array.block_on(self.lock.read())); + let lock: LocalRwDarc<()> = self.lock.clone(); + let lock = Arc::new(self.array.block_on(async move { lock.read().await })); LocalLockLocalIter { data: self.clone(), lock: lock, @@ -345,7 +348,8 @@ impl LamellarArrayMutIterators for LocalLockArray { type LocalIter = LocalLockLocalIterMut<'static, T>; fn dist_iter_mut(&self) -> Self::DistIter { - let lock = Arc::new(self.array.block_on(self.lock.write())); + let lock: LocalRwDarc<()> = self.lock.clone(); + let lock = Arc::new(self.array.block_on(async move { lock.write().await })); self.barrier(); // println!("dist_iter thread {:?} got lock",std::thread::current().id()); LocalLockDistIterMut { @@ -359,7 +363,8 @@ impl LamellarArrayMutIterators for LocalLockArray { fn local_iter_mut(&self) -> Self::LocalIter { // println!("trying to get write lock for iter"); - let lock = Arc::new(self.array.block_on(self.lock.write())); + let lock: LocalRwDarc<()> = self.lock.clone(); + let lock = Arc::new(self.array.block_on(async move { lock.write().await })); // println!("got write lock for iter"); LocalLockLocalIterMut { data: self.clone(), diff --git a/src/array/native_atomic.rs b/src/array/native_atomic.rs index 7e0e046b..590f9b48 100644 --- a/src/array/native_atomic.rs +++ b/src/array/native_atomic.rs @@ -1134,10 +1134,7 @@ impl LamellarArray for NativeAtomicArray { self.array.wait_all() // println!("done in wait all {:?}",std::time::SystemTime::now()); } - fn block_on(&self, f: F) -> F::Output - where - F: Future, - { + fn block_on(&self, f: F) -> F::Output { self.array.block_on(f) } fn pe_and_offset_for_global_index(&self, index: usize) -> Option<(usize, usize)> { @@ -1207,7 +1204,7 @@ impl ArrayPrint for NativeAtomicArray { } impl LamellarArrayReduce for NativeAtomicArray { - fn reduce(&self, op: &str) -> Pin>> { + fn reduce(&self, op: &str) -> Pin + Send>> { self.array .reduce_data(op, self.clone().into()) .into_future() @@ -1216,20 +1213,20 @@ impl LamellarArrayReduce for NativeAtomicArray impl LamellarArrayArithmeticReduce for NativeAtomicArray { - fn sum(&self) -> Pin>> { + fn sum(&self) -> Pin + Send>> { self.reduce("sum") } - fn prod(&self) -> Pin>> { + fn prod(&self) -> Pin + Send>> { self.reduce("prod") } } impl LamellarArrayCompareReduce for NativeAtomicArray { - fn max(&self) -> Pin>> { + fn max(&self) -> Pin + Send>> { self.reduce("max") } - fn min(&self) -> Pin>> { + fn min(&self) -> Pin + Send>> { self.reduce("min") } } diff --git a/src/array/operations.rs b/src/array/operations.rs index 0632c629..e064bd73 100644 --- a/src/array/operations.rs +++ b/src/array/operations.rs @@ -6,7 +6,7 @@ use crate::array::local_lock_atomic::*; use crate::array::native_atomic::*; use crate::array::{AmDist, Dist, LamellarArrayRequest, LamellarEnv, LamellarWriteArray}; use crate::lamellar_request::LamellarRequest; -use crate::scheduler::{Scheduler, SchedulerQueue}; +use crate::scheduler::Scheduler; use crate::LamellarTeamRT; pub(crate) mod access; @@ 
-230,8 +230,8 @@ pub enum OpInputEnum<'a, T: Dist> { Vec(Vec), NativeAtomicLocalData(NativeAtomicLocalData), GenericAtomicLocalData(GenericAtomicLocalData), - LocalLockLocalData(LocalLockLocalData<'a, T>), - GlobalLockLocalData(GlobalLockLocalData<'a, T>), + LocalLockLocalData(LocalLockLocalData), + GlobalLockLocalData(GlobalLockLocalData), // Iter(Box + 'a>), // while it would be convienient to directly use the following, doing so @@ -244,7 +244,7 @@ pub enum OpInputEnum<'a, T: Dist> { // AtomicArray(AtomicArray), } -impl<'a, T: Dist> OpInputEnum<'_, T> { +impl<'a, T: Dist> OpInputEnum<'a, T> { #[tracing::instrument(skip_all)] pub(crate) fn iter(&self) -> Box + '_> { match self { @@ -305,52 +305,47 @@ impl<'a, T: Dist> OpInputEnum<'_, T> { // #[tracing::instrument(skip_all)] pub(crate) fn into_vec_chunks(self, chunk_size: usize) -> Vec> { match self { - OpInputEnum::Val(v) => vec![vec![v]], + OpInputEnum::Val(v) =>vec![vec![v]], OpInputEnum::Slice(s) => s.chunks(chunk_size).map(|chunk| chunk.to_vec()).collect(), OpInputEnum::Vec(v) => v.chunks(chunk_size).map(|chunk| chunk.to_vec()).collect(), OpInputEnum::NativeAtomicLocalData(a) => { let mut data = Vec::with_capacity(chunk_size); - a.iter() - .enumerate() - .filter_map(move |(i, elem)| { - data.push(elem.load()); - if data.len() == chunk_size || i == a.len() - 1 { - let mut new_data = Vec::with_capacity(chunk_size); - std::mem::swap(&mut data, &mut new_data); - Some(new_data) - } else { - None - } - }) - .collect() + a.iter().enumerate().filter_map(move |(i, elem)| { + data.push(elem.load()); + if data.len() == chunk_size || i == a.len() - 1 { + let mut new_data = Vec::with_capacity(chunk_size); + std::mem::swap(&mut data, &mut new_data); + Some(new_data) + } else { + None + } + }).collect() } OpInputEnum::GenericAtomicLocalData(a) => { let mut data = Vec::with_capacity(chunk_size); - a.iter() - .enumerate() - .filter_map(move |(i, elem)| { - data.push(elem.load()); - if data.len() == chunk_size || i == a.len() - 1 { - let mut new_data = Vec::with_capacity(chunk_size); - std::mem::swap(&mut data, &mut new_data); - Some(new_data) - } else { - None - } - }) - .collect() + a.iter().enumerate().filter_map(move |(i, elem)| { + data.push(elem.load()); + if data.len() == chunk_size || i == a.len() - 1 { + let mut new_data = Vec::with_capacity(chunk_size); + std::mem::swap(&mut data, &mut new_data); + Some(new_data) + } else { + None + } + }).collect() } OpInputEnum::LocalLockLocalData(a) => { a.chunks(chunk_size).map(|chunk| chunk.to_vec()).collect() } OpInputEnum::GlobalLockLocalData(a) => { a.chunks(chunk_size).map(|chunk| chunk.to_vec()).collect() - } // OpInputEnum::MemoryRegion(mr) => *unsafe { mr.as_slice() } - // .expect("memregion not local") - // .first() - // .expect("memregion is empty"), + } + // OpInputEnum::MemoryRegion(mr) => *unsafe { mr.as_slice() } + // .expect("memregion not local") + // .first() + // .expect("memregion is empty"), } } @@ -687,7 +682,7 @@ impl<'a, T: Dist> OpInput<'a, T> for Vec { // } // } -impl<'a, T: Dist> OpInput<'a, T> for &'a LocalLockLocalData<'_, T> { +impl<'a, T: Dist> OpInput<'a, T> for &'a LocalLockLocalData { #[tracing::instrument(skip_all)] fn as_op_input(self) -> (Vec>, usize) { let len = self.len(); @@ -727,7 +722,7 @@ impl<'a, T: Dist> OpInput<'a, T> for &'a LocalLockLocalData<'_, T> { } } -impl<'a, T: Dist> OpInput<'a, T> for &'a GlobalLockLocalData<'_, T> { +impl<'a, T: Dist> OpInput<'a, T> for &'a GlobalLockLocalData { #[tracing::instrument(skip_all)] fn as_op_input(self) -> (Vec>, usize) 
{ let len = self.len(); diff --git a/src/array/read_only.rs b/src/array/read_only.rs index 430e8882..942c2fad 100644 --- a/src/array/read_only.rs +++ b/src/array/read_only.rs @@ -467,7 +467,7 @@ impl From for ReadOnlyArray { } impl LamellarArrayReduce for ReadOnlyArray { - fn reduce(&self, op: &str) -> Pin>> { + fn reduce(&self, op: &str) -> Pin + Send>> { self.array .reduce_data(op, self.clone().into()) .into_future() @@ -476,20 +476,20 @@ impl LamellarArrayReduce for ReadOnlyArray { impl LamellarArrayArithmeticReduce for ReadOnlyArray { - fn sum(&self) -> Pin>> { + fn sum(&self) -> Pin + Send>> { self.reduce("sum") } - fn prod(&self) -> Pin>> { + fn prod(&self) -> Pin + Send>> { self.reduce("prod") } } impl LamellarArrayCompareReduce for ReadOnlyArray { - fn max(&self) -> Pin>> { + fn max(&self) -> Pin + Send>> { self.reduce("max") } - fn min(&self) -> Pin>> { + fn min(&self) -> Pin + Send>> { self.reduce("min") } } @@ -550,10 +550,7 @@ impl LamellarArray for ReadOnlyArray { self.array.wait_all() // println!("done in wait all {:?}",std::time::SystemTime::now()); } - fn block_on(&self, f: F) -> F::Output - where - F: Future, - { + fn block_on(&self, f: F) -> F::Output { self.array.block_on(f) } fn pe_and_offset_for_global_index(&self, index: usize) -> Option<(usize, usize)> { diff --git a/src/array/unsafe.rs b/src/array/unsafe.rs index 715f0f3b..bb455826 100644 --- a/src/array/unsafe.rs +++ b/src/array/unsafe.rs @@ -11,7 +11,6 @@ use crate::darc::{Darc, DarcMode, WeakDarc}; use crate::lamellae::AllocationType; use crate::lamellar_team::{IntoLamellarTeam, LamellarTeamRT}; use crate::memregion::{Dist, MemoryRegion}; -use crate::scheduler::SchedulerQueue; use crate::LamellarTaskGroup; use core::marker::PhantomData; use std::ops::Bound; @@ -368,9 +367,10 @@ impl UnsafeArray { self.wait_all(); // println!("block on outstanding"); // self.inner.data.print(); + // let the_array: UnsafeArray = self.clone(); + let array_darc = self.inner.data.clone(); self.team_rt() - .block_on(self.inner.data.block_on_outstanding(mode, 0)); - // self.inner.data.print(); + .block_on(array_darc.block_on_outstanding(mode, 1)); //one for this instance of the array } #[doc(alias = "Collective")] @@ -810,10 +810,7 @@ impl LamellarArray for UnsafeArray { // println!("done in wait all {:?}",std::time::SystemTime::now()); } - fn block_on(&self, f: F) -> F::Output - where - F: Future, - { + fn block_on(&self, f: F) -> F::Output { self.inner.data.team.scheduler.block_on(f) } @@ -998,7 +995,7 @@ impl UnsafeArray { /// let sum = array.block_on(array.reduce("sum")); // equivalent to calling array.sum() /// //assert_eq!(array.len()*num_pes,sum); // may or may not fail ///``` - pub unsafe fn reduce(&self, op: &str) -> Pin>> { + pub unsafe fn reduce(&self, op: &str) -> Pin + Send>> { self.reduce_data(op, self.clone().into()).into_future() } @@ -1034,7 +1031,7 @@ impl UnsafeArray { /// let sum = array.block_on(unsafe{array.sum()}); //Safe in this instance as we have ensured no updates are currently happening /// // assert_eq!(array.len()*num_pes,sum);//this may or may not fail ///``` - pub unsafe fn sum(&self) -> Pin>> { + pub unsafe fn sum(&self) -> Pin + Send>> { self.reduce("sum") } @@ -1071,7 +1068,7 @@ impl UnsafeArray { /// let prod = array.block_on(array.prod()); /// assert_eq!((1..=array.len()).product::(),prod); ///``` - pub unsafe fn prod(&self) -> Pin>> { + pub unsafe fn prod(&self) -> Pin + Send>> { self.reduce("prod") } @@ -1102,7 +1099,7 @@ impl UnsafeArray { /// let max = array.block_on(max_req); /// 
assert_eq!((array.len()-1)*2,max); ///``` - pub unsafe fn max(&self) -> Pin>> { + pub unsafe fn max(&self) -> Pin + Send>> { self.reduce("max") } @@ -1133,7 +1130,7 @@ impl UnsafeArray { /// let min = array.block_on(min_req); /// assert_eq!(0,min); ///``` - pub unsafe fn min(&self) -> Pin>> { + pub unsafe fn min(&self) -> Pin + Send>> { self.reduce("min") } } diff --git a/src/array/unsafe/operations.rs b/src/array/unsafe/operations.rs index 35e1844f..80d27ed4 100644 --- a/src/array/unsafe/operations.rs +++ b/src/array/unsafe/operations.rs @@ -2,7 +2,6 @@ use crate::active_messaging::LamellarArcAm; use crate::array::operations::*; use crate::array::r#unsafe::UnsafeArray; use crate::array::{AmDist, Dist, LamellarArray, LamellarByteArray, LamellarEnv}; -use crate::scheduler::SchedulerQueue; use futures::Future; use parking_lot::Mutex; use std::any::TypeId; @@ -394,13 +393,14 @@ impl UnsafeArray { self.inner.data.array_counters.add_send_req(1); self.inner.data.team.inc_counters(1); let index_vec = index.to_vec(); + let the_array: UnsafeArray = self.clone(); // println!("num_reqs {:?}",num_reqs); let the_array: UnsafeArray = self.clone(); self.inner .data .team .scheduler - .submit_immediate_task2(async move { + .submit_immediate_task(async move { let mut buffs = vec![Vec::with_capacity(num_per_batch * index_size.len()); num_pes]; let mut res_buffs = vec![Vec::with_capacity(num_per_batch); num_pes]; @@ -486,12 +486,12 @@ impl UnsafeArray { start_i += len; } + // We need this loop so that we ensure all the internal AMs have launched so calls like wait_all work properly + while cnt.load(Ordering::SeqCst) < num_reqs { + self.inner.data.team.scheduler.exec_task(); + } // println!("futures len {:?}",futures.lock().len()); Box::pin(async move { - while cnt.load(Ordering::SeqCst) < num_reqs { - // self.inner.data.team.scheduler.exec_task(); - async_std::task::yield_now().await; - } // println!("futures len {:?}",futures.lock().len()); futures::future::join_all(futures.lock().drain(..)).await }) @@ -526,6 +526,7 @@ impl UnsafeArray { let num_reqs = vals.len(); // println!("num_reqs {:?}",num_reqs); let mut start_i = 0; + let scheduler = self.inner.data.team.scheduler.clone(); for val in vals.drain(..) 
{ let cnt2 = cnt.clone(); let futures2 = futures.clone(); @@ -533,60 +534,54 @@ impl UnsafeArray { let len = val.len(); self.inner.data.array_counters.add_send_req(1); self.inner.data.team.inc_counters(1); - let val_chunks = val.into_vec_chunks(num_per_batch); let the_array: UnsafeArray = self.clone(); - self.inner - .data - .team - .scheduler - .submit_immediate_task2(async move { - // let mut buffs = vec![Vec::with_capacity(num_per_batch); num_pes]; - // let val_slice = val.as_slice(); - let mut inner_start_i = start_i; - let mut reqs: Vec)> + Send>>> = - Vec::new(); - // val.as_vec_chunks(num_per_batch) - val_chunks.into_iter().for_each(|val| { - let val_len = val.len(); - let am = MultiValSingleIndex::new_with_vec( - byte_array2.clone(), - op, - local_index, - val, - ) - .into_am::(ret); - let req = the_array - .inner - .data - .team - .exec_arc_am_pe::( - pe, - am, - Some(the_array.inner.data.array_counters.clone()), - ) - .into_future(); - // println!("start_i: {:?} inner_start_i {:?} val_len: {:?}",start_i,inner_start_i,val_len); - let res_buffer = - (inner_start_i..inner_start_i + val_len).collect::>(); - reqs.push(Box::pin(async move { (req.await, res_buffer) })); - inner_start_i += val_len; - }); - // println!("reqs len {:?}",reqs.len()); - futures2.lock().extend(reqs); - cnt2.fetch_add(1, Ordering::SeqCst); - the_array + let val_chunks = val.into_vec_chunks(num_per_batch); + scheduler.submit_immediate_task(async move { + let mut inner_start_i = start_i; + let mut reqs: Vec)> + Send>>> = + Vec::new(); + val_chunks.into_iter().for_each(|val| { + let val_len = val.len(); + let am = MultiValSingleIndex::new_with_vec( + byte_array2.clone(), + op, + local_index, + val, + ) + .into_am::(ret); + let req = the_array .inner .data - .array_counters - .outstanding_reqs - .fetch_sub(1, Ordering::SeqCst); - the_array.inner.data.team.dec_counters(1); + .team + .exec_arc_am_pe::( + pe, + am, + Some(the_array.inner.data.array_counters.clone()), + ) + .into_future(); + // println!("start_i: {:?} inner_start_i {:?} val_len: {:?}",start_i,inner_start_i,val_len); + let res_buffer = + (inner_start_i..inner_start_i + val_len).collect::>(); + reqs.push(Box::pin(async move { (req.await, res_buffer) })); + inner_start_i += val_len; }); + // println!("reqs len {:?}",reqs.len()); + futures2.lock().extend(reqs); + cnt2.fetch_add(1, Ordering::SeqCst); + the_array + .inner + .data + .array_counters + .outstanding_reqs + .fetch_sub(1, Ordering::SeqCst); + the_array.inner.data.team.dec_counters(1); + }); start_i += len; } + + // We need this loop so that we ensure all the internal AMs have launched so calls like wait_all work properly while cnt.load(Ordering::SeqCst) < num_reqs { self.inner.data.team.scheduler.exec_task(); - // async_std::task::yield_now().await; } // println!("futures len {:?}",futures.lock().len()); Box::pin(async move { @@ -639,7 +634,7 @@ impl UnsafeArray { .data .team .scheduler - .submit_immediate_task2(async move { + .submit_immediate_task(async move { // println!("in immediate task"); let mut buffs = vec![Vec::with_capacity(bytes_per_batch); num_pes]; let mut res_buffs = vec![Vec::with_capacity(num_per_batch); num_pes]; @@ -760,9 +755,9 @@ impl UnsafeArray { }); start_i += len; } + // We need this loop so that we ensure all the internal AMs have launched so calls like wait_all work properly while cnt.load(Ordering::SeqCst) < num_reqs { self.inner.data.team.scheduler.exec_task(); - // async_std::task::yield_now().await; } // println!("futures len {:?}", futures.lock().len()); 
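The hunks above replace `submit_immediate_task2` with `submit_immediate_task` and move the "wait until every internal AM has launched" spin out of the returned future: the caller now drives `exec_task()` until a shared counter reaches `num_reqs`, so that a later `wait_all` can observe every outstanding request. Below is a minimal, self-contained sketch of that launch-tracking pattern; plain threads and `launch_batch` are hypothetical stand-ins for the scheduler's `submit_immediate_task` / `exec_task` and the per-chunk AM submission, not the actual runtime code.

```rust
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;

// Hypothetical stand-in for submitting one chunk's worth of active messages.
fn launch_batch(batch_id: usize) {
    println!("batch {batch_id} launched");
}

fn launch_all(num_reqs: usize) {
    let cnt = Arc::new(AtomicUsize::new(0));

    // Each "immediate task" launches its batch and then checks in on the counter,
    // mirroring the `cnt2.fetch_add(1, Ordering::SeqCst)` in the patch.
    let handles: Vec<_> = (0..num_reqs)
        .map(|i| {
            let cnt = cnt.clone();
            std::thread::spawn(move || {
                launch_batch(i);
                cnt.fetch_add(1, Ordering::SeqCst);
            })
        })
        .collect();

    // The caller spins (in the real scheduler it does useful work via exec_task)
    // until every batch reports it has launched, so wait_all() sees all requests.
    while cnt.load(Ordering::SeqCst) < num_reqs {
        std::thread::yield_now();
    }

    for h in handles {
        let _ = h.join();
    }
}

fn main() {
    launch_all(4);
}
```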
Box::pin(async move { diff --git a/src/barrier.rs b/src/barrier.rs index 185ad304..1ee005fc 100644 --- a/src/barrier.rs +++ b/src/barrier.rs @@ -1,10 +1,7 @@ use crate::lamellae::{AllocationType, Lamellae, LamellaeRDMA}; use crate::lamellar_arch::LamellarArchRT; -use crate::scheduler::SchedulerQueue; -// use crate::lamellar_memregion::{SharedMemoryRegion,RegisteredMemoryRegion}; -use crate::memregion::MemoryRegion; //, RTMemoryRegionRDMA, RegisteredMemoryRegion}; +use crate::memregion::MemoryRegion; use crate::scheduler::Scheduler; -// use rand::prelude::SliceRandom; use std::sync::atomic::{AtomicU8, AtomicUsize, Ordering}; use std::sync::Arc; use std::time::Instant; @@ -17,7 +14,7 @@ pub(crate) struct Barrier { n: usize, // dissemination factor num_rounds: usize, pub(crate) arch: Arc, - pub(crate) _scheduler: Arc, + pub(crate) scheduler: Arc, lamellae: Arc, barrier_cnt: AtomicUsize, barrier_buf: Vec>, @@ -85,17 +82,17 @@ impl Barrier { }; let bar = Barrier { - my_pe: my_pe, - num_pes: num_pes, - n: n, - num_rounds: num_rounds, - arch: arch, - _scheduler: scheduler, - lamellae: lamellae, + my_pe, + num_pes, + n, + num_rounds, + arch, + scheduler, + lamellae, barrier_cnt: AtomicUsize::new(1), barrier_buf: buffs, - send_buf: send_buf, - panic: panic, + send_buf, + panic, }; // bar.print_bar(); bar @@ -274,7 +271,7 @@ impl Barrier { if std::thread::current().id() == *crate::MAIN_THREAD { self.barrier_internal(|| { // std::thread::yield_now(); - self._scheduler.exec_task(); + self.scheduler.exec_task(); }); } else { if let Ok(val) = std::env::var("LAMELLAR_BARRIER_WARNING") { @@ -293,7 +290,7 @@ impl Barrier { // we actually want to be able to process other tasks while the barrier is active pub(crate) fn tasking_barrier(&self) { self.barrier_internal(|| { - self._scheduler.exec_task(); + self.scheduler.exec_task(); }); } diff --git a/src/darc.rs b/src/darc.rs index e8dc9e71..9f715137 100644 --- a/src/darc.rs +++ b/src/darc.rs @@ -64,7 +64,6 @@ use crate::barrier::Barrier; use crate::lamellae::{AllocationType, Backend, LamellaeComm, LamellaeRDMA}; use crate::lamellar_team::{IntoLamellarTeam, LamellarTeamRT}; use crate::lamellar_world::LAMELLAES; -// use crate::scheduler::SchedulerQueue; use crate::{IdError, LamellarEnv, LamellarTeam}; #[doc(hidden)] @@ -137,8 +136,8 @@ pub struct DarcInner { drop: Option, valid: AtomicBool, } -unsafe impl Send for DarcInner {} -unsafe impl Sync for DarcInner {} +unsafe impl Send for DarcInner {} //we cant create DarcInners without going through the Darc interface which enforces Sync+Send +unsafe impl Sync for DarcInner {} //we cant create DarcInners without going through the Darc interface which enforces Sync+Send /// Distributed atomic reference counter /// @@ -192,8 +191,8 @@ pub struct Darc { inner: *mut DarcInner, src_pe: usize, } -unsafe impl Send for Darc {} -unsafe impl Sync for Darc {} +unsafe impl Send for Darc {} +unsafe impl Sync for Darc {} impl LamellarEnv for Darc { fn my_pe(&self) -> usize { @@ -956,15 +955,11 @@ impl Darc { Ok(d) } - pub(crate) async fn block_on_outstanding(&self, state: DarcMode, extra_cnt: usize) { - DarcInner::block_on_outstanding( - WrappedInner { - inner: NonNull::new(self.inner as *mut DarcInner).expect("invalid darc pointer"), - }, - state, - extra_cnt, - ) - .await; + pub(crate) async fn block_on_outstanding(self, state: DarcMode, extra_cnt: usize) { + let wrapped = WrappedInner { + inner: NonNull::new(self.inner as *mut DarcInner).expect("invalid darc pointer"), + }; + DarcInner::block_on_outstanding(wrapped, state, 
extra_cnt).await; } #[doc(alias = "Collective")] @@ -1000,9 +995,10 @@ impl Darc { inner.local_cnt.fetch_add(1, Ordering::SeqCst); //we add this here because to account for moving inner into d inner.total_local_cnt.fetch_add(1, Ordering::SeqCst); // println! {"[{:?}] darc[{:?}] into_localrw {:?} {:?} {:?}",std::thread::current().id(),self.inner().id,self.inner,self.inner().local_cnt.load(Ordering::SeqCst),self.inner().total_local_cnt.load(Ordering::SeqCst)}; - let item = unsafe { Box::from_raw(inner.item as *mut T) }; + let item = unsafe { *Box::from_raw(inner.item as *mut T) }; + let d = Darc { - inner: self.inner as *mut DarcInner>>>, + inner: self.inner as *mut DarcInner>>, src_pe: self.src_pe, }; d.inner_mut() diff --git a/src/darc/global_rw_darc.rs b/src/darc/global_rw_darc.rs index 2bda5a9b..cbb5cbaa 100644 --- a/src/darc/global_rw_darc.rs +++ b/src/darc/global_rw_darc.rs @@ -423,8 +423,8 @@ pub struct GlobalRwDarc { pub(crate) darc: Darc>, } -unsafe impl Send for GlobalRwDarc {} -unsafe impl Sync for GlobalRwDarc {} +unsafe impl Send for GlobalRwDarc {} //protected internally by rwlock +unsafe impl Sync for GlobalRwDarc {} //protected internally by rwlock impl LamellarEnv for GlobalRwDarc { fn my_pe(&self) -> usize { @@ -547,13 +547,15 @@ impl GlobalRwDarc { /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// - /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); - /// world.exec_am_all(DarcAm {counter: counter.clone()}); - /// let guard = world.block_on(counter.read()); - /// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); - /// drop(guard); //release the - /// world.wait_all(); // wait for my active message to return - /// world.barrier(); //at this point all updates will have been performed + /// world.clone().block_on(async move { + /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); + /// world.exec_am_all(DarcAm {counter: counter.clone()}); + /// let guard = counter.read().await; + /// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); + /// drop(guard); //release the + /// world.wait_all(); // wait for my active message to return + /// world.barrier(); //at this point all updates will have been performed + /// }); ///``` pub async fn read(&self) -> GlobalRwDarcReadGuard { // println!("async read"); @@ -619,13 +621,15 @@ impl GlobalRwDarc { /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// - /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); - /// world.exec_am_all(DarcAm {counter: counter.clone()}); - /// let mut guard = world.block_on(counter.write()); - /// *guard += my_pe; - /// drop(guard); //release the - /// world.wait_all(); // wait for my active message to return - /// world.barrier(); //at this point all updates will have been performed + /// world.clone().block_on(async move { + /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); + /// world.exec_am_all(DarcAm {counter: counter.clone()}); + /// let mut guard = counter.write().await; + /// *guard += my_pe; + /// drop(guard); //release the + /// world.wait_all(); // wait for my active message to return + /// world.barrier(); //at this point all updates will have been performed + /// }); ///``` pub async fn write(&self) -> GlobalRwDarcWriteGuard { // println!("async write"); @@ -688,13 +692,15 @@ impl GlobalRwDarc { /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// - /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); - /// 
world.exec_am_all(DarcAm {counter: counter.clone()}); - /// let mut guard = world.block_on(counter.collective_write()); - /// *guard += my_pe; - /// drop(guard); //release the lock - /// world.wait_all(); // wait for my active message to return - /// world.barrier(); //at this point all updates will have been performed + /// world.clone().block_on(async move { + /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); + /// world.exec_am_all(DarcAm {counter: counter.clone()}); + /// let mut guard = counter.collective_write().await; + /// *guard += my_pe; + /// drop(guard); //release the lock + /// world.wait_all(); // wait for my active message to return + /// world.barrier(); //at this point all updates will have been performed + /// }); ///``` pub async fn collective_write(&self) -> GlobalRwDarcCollectiveWriteGuard { // println!("async write"); @@ -723,182 +729,183 @@ impl GlobalRwDarc { } } - // #[doc(alias("One-sided", "onesided"))] - // /// Launches an active message to gather a global read lock associated with this GlobalRwDarc. - // /// - // /// The current THREAD will be blocked until the lock has been acquired. - // /// - // /// This function will not return while any writer currently has access to the lock, but there may be other readers - // /// - // /// Returns ared this specific instance of the read lock will only be held by the calling PE (until it is dropped) - // /// Other PEs may have separately aquired read locks as well. - // /// - // /// - // /// # Noten RAII guard which will drop the read access of the wrlock when dropped - // /// - // /// # One-sided Operation - // /// The calling PE is responsible for creating and transfering the active message which aquires the lock. - // /// Once aqui - // /// Do not use this function in an asynchronous context (i.e. a Lamellar Active message), instead use [GlobalRwDarc::async_read] - // /// - // /// # Examples - // ///``` - // /// use lamellar::darc::prelude::*; - // /// - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// - // /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); - // /// // do interesting work - // /// let guard = counter.read(); //blocks current thread until aquired - // /// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); - // ///``` - // pub fn read(&self) -> GlobalRwDarcReadGuard { - // // println!("read"); - // let inner = self.inner(); - // let team = inner.team(); - // let remote_rwlock_addr = team.lamellae.remote_addr( - // 0, - // inner as *const DarcInner> as *const () as usize, - // ); - // team.exec_am_pe_tg( - // 0, - // LockAm { - // rwlock_addr: remote_rwlock_addr, - // orig_pe: team.team_pe.expect("darcs cant exist on non team members"), - // lock_type: LockType::Read, - // }, - // Some(inner.am_counters()), - // ) - // .get(); - // GlobalRwDarcReadGuard { - // rwlock: self.darc.clone(), - // marker: PhantomData, - // local_cnt: Arc::new(AtomicUsize::new(1)), - // } - // } + #[doc(alias("One-sided", "onesided"))] + /// Launches an active message to gather a global read lock associated with this GlobalRwDarc. + /// + /// The current THREAD will be blocked until the lock has been acquired. + /// + /// This function will not return while any writer currently has access to the lock, but there may be other readers + /// + /// Returns ared this specific instance of the read lock will only be held by the calling PE (until it is dropped) + /// Other PEs may have separately aquired read locks as well. 
+ /// + /// + /// # Noten RAII guard which will drop the read access of the wrlock when dropped + /// + /// # One-sided Operation + /// The calling PE is responsible for creating and transfering the active message which aquires the lock. + /// Once aqui + /// Do not use this function in an asynchronous context (i.e. a Lamellar Active message), instead use [GlobalRwDarc::async_read] + /// + /// # Examples + ///``` + /// use lamellar::darc::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// + /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); + /// // do interesting work + /// let guard = counter.blocking_read(); //blocks current thread until aquired + /// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); + ///``` + pub fn blocking_read(&self) -> GlobalRwDarcReadGuard { + // println!("read"); - // #[doc(alias("One-sided", "onesided"))] - // /// Launches an active message to gather a global write lock associated with this GlobalRwDarc. - // /// - // /// The current THREAD will be blocked until the lock has been acquired. - // /// - // /// This function will not return while another writer or any readers currently have access to the lock - // /// - // /// Returns an RAII guard which will drop the write access of the wrlock when dropped - // /// - // /// # One-sided Operation - // /// The calling PE is responsible for creating and transfering the active message which aquires the lock. - // /// Once aquired the lock will only be held by the calling PE (until it is dropped) - // /// - // /// # Note - // /// Do not use this function in an asynchronous context (i.e. a Lamellar Active message), instead use [GlobalRwDarc::async_write] - // /// - // /// # Examples - // ///``` - // /// use lamellar::darc::prelude::*; - // /// - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// - // /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); - // /// // do interesting work - // /// let mut guard = counter.write(); //blocks current thread until aquired - // /// *guard += my_pe; - // ///``` - // pub fn write(&self) -> GlobalRwDarcWriteGuard { - // // println!("write"); - // let inner = self.inner(); - // let team = inner.team(); - // let remote_rwlock_addr = team.lamellae.remote_addr( - // 0, - // inner as *const DarcInner> as *const () as usize, - // ); - // team.exec_am_pe_tg( - // 0, - // LockAm { - // rwlock_addr: remote_rwlock_addr, - // orig_pe: team.team_pe.expect("darcs cant exist on non team members"), - // lock_type: LockType::Write, - // }, - // Some(inner.am_counters()), - // ) - // .get(); - // GlobalRwDarcWriteGuard { - // rwlock: self.darc.clone(), - // marker: PhantomData, - // } - // // inner.item().write(remote_rwlock_addr) - // } + let inner = self.inner(); + let team = inner.team(); + let remote_rwlock_addr = team.lamellae.remote_addr( + 0, + inner as *const DarcInner> as *const () as usize, + ); + team.exec_am_pe_tg( + 0, + LockAm { + rwlock_addr: remote_rwlock_addr, + orig_pe: team.team_pe.expect("darcs cant exist on non team members"), + lock_type: LockType::Read, + }, + Some(inner.am_counters()), + ) + .get(); + GlobalRwDarcReadGuard { + rwlock: self.darc.clone(), + marker: PhantomData, + local_cnt: Arc::new(AtomicUsize::new(1)), + } + } - // #[doc(alias("Collective"))] - // /// Launches an active message to gather the global collective write lock associated with this GlobalRwDarc. 
- // /// - // /// The current task will be blocked until the lock has been acquired. - // /// - // /// This function will not return while another writer or any readers currently have access to the lock - // /// - // /// Returns an RAII guard which will drop the write access of the wrlock when dropped - // /// - // /// # Collective Operation - // /// All PEs associated with this GlobalRwDarc must enter the lock call otherwise deadlock may occur. - // /// - // /// # Examples - // /// - // ///``` - // /// use lamellar::darc::prelude::*; - // /// use lamellar::active_messaging::*; - // /// - // /// #[lamellar::AmData(Clone)] - // /// struct DarcAm { - // /// counter: GlobalRwDarc, //each pe has a local atomicusize - // /// } - // /// - // /// #[lamellar::am] - // /// impl LamellarAm for DarcAm { - // /// async fn exec(self) { - // /// let mut counter = self.counter.async_write().await; // await until we get the write lock - // /// *counter += 1; // although we have the global lock, we are still only modifying the data local to this PE - // /// } - // /// } - // /// //------------- - // /// - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// - // /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); - // /// world.exec_am_all(DarcAm {counter: counter.clone()}); - // /// let mut guard = world.block_on(counter.collective_write()); - // /// *guard += my_pe; - // /// drop(guard); //release the lock - // /// world.wait_all(); // wait for my active message to return - // /// world.barrier(); //at this point all updates will have been performed - // ///``` - // pub fn collective_write(&self) -> GlobalRwDarcCollectiveWriteGuard { - // // println!("async write"); - // let inner = self.inner(); - // let team = inner.team(); - // let remote_rwlock_addr = team.lamellae.remote_addr( - // 0, - // inner as *const DarcInner> as *const () as usize, - // ); - // let collective_cnt = inner.item().collective_cnt.fetch_add(1, Ordering::SeqCst); - // team.exec_am_pe_tg( - // 0, - // LockAm { - // rwlock_addr: remote_rwlock_addr, - // orig_pe: team.team_pe.expect("darcs cant exist on non team members"), - // lock_type: LockType::CollectiveWrite(collective_cnt), - // }, - // Some(inner.am_counters()), - // ) - // .get(); - // GlobalRwDarcCollectiveWriteGuard { - // rwlock: self.darc.clone(), - // collective_cnt: collective_cnt, - // marker: PhantomData, - // } - // } + #[doc(alias("One-sided", "onesided"))] + /// Launches an active message to gather a global write lock associated with this GlobalRwDarc. + /// + /// The current THREAD will be blocked until the lock has been acquired. + /// + /// This function will not return while another writer or any readers currently have access to the lock + /// + /// Returns an RAII guard which will drop the write access of the wrlock when dropped + /// + /// # One-sided Operation + /// The calling PE is responsible for creating and transfering the active message which aquires the lock. + /// Once aquired the lock will only be held by the calling PE (until it is dropped) + /// + /// # Note + /// Do not use this function in an asynchronous context (i.e. 
a Lamellar Active message), instead use [GlobalRwDarc::async_write] + /// + /// # Examples + ///``` + /// use lamellar::darc::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// + /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); + /// // do interesting work + /// let mut guard = counter.blocking_write(); //blocks current thread until aquired + /// *guard += my_pe; + ///``` + pub fn blocking_write(&self) -> GlobalRwDarcWriteGuard { + // println!("write"); + let inner = self.inner(); + let team = inner.team(); + let remote_rwlock_addr = team.lamellae.remote_addr( + 0, + inner as *const DarcInner> as *const () as usize, + ); + team.exec_am_pe_tg( + 0, + LockAm { + rwlock_addr: remote_rwlock_addr, + orig_pe: team.team_pe.expect("darcs cant exist on non team members"), + lock_type: LockType::Write, + }, + Some(inner.am_counters()), + ) + .get(); + GlobalRwDarcWriteGuard { + rwlock: self.darc.clone(), + marker: PhantomData, + } + // inner.item().write(remote_rwlock_addr) + } + + #[doc(alias("Collective"))] + /// Launches an active message to gather the global collective write lock associated with this GlobalRwDarc. + /// + /// The current task will be blocked until the lock has been acquired. + /// + /// This function will not return while another writer or any readers currently have access to the lock + /// + /// Returns an RAII guard which will drop the write access of the wrlock when dropped + /// + /// # Collective Operation + /// All PEs associated with this GlobalRwDarc must enter the lock call otherwise deadlock may occur. + /// + /// # Examples + /// + ///``` + /// use lamellar::darc::prelude::*; + /// use lamellar::active_messaging::*; + /// + /// #[lamellar::AmData(Clone)] + /// struct DarcAm { + /// counter: GlobalRwDarc, //each pe has a local atomicusize + /// } + /// + /// #[lamellar::am] + /// impl LamellarAm for DarcAm { + /// async fn exec(self) { + /// let mut counter = self.counter.async_write().await; // await until we get the write lock + /// *counter += 1; // although we have the global lock, we are still only modifying the data local to this PE + /// } + /// } + /// //------------- + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// + /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); + /// world.exec_am_all(DarcAm {counter: counter.clone()}); + /// let mut guard = counter.blocking_collective_write(); + /// *guard += my_pe; + /// drop(guard); //release the lock + /// world.wait_all(); // wait for my active message to return + /// world.barrier(); //at this point all updates will have been performed + ///``` + pub fn blocking_collective_write(&self) -> GlobalRwDarcCollectiveWriteGuard { + // println!("async write"); + let inner = self.inner(); + let team = inner.team(); + let remote_rwlock_addr = team.lamellae.remote_addr( + 0, + inner as *const DarcInner> as *const () as usize, + ); + let collective_cnt = inner.item().collective_cnt.fetch_add(1, Ordering::SeqCst); + team.exec_am_pe_tg( + 0, + LockAm { + rwlock_addr: remote_rwlock_addr, + orig_pe: team.team_pe.expect("darcs cant exist on non team members"), + lock_type: LockType::CollectiveWrite(collective_cnt), + }, + Some(inner.am_counters()), + ) + .get(); + GlobalRwDarcCollectiveWriteGuard { + rwlock: self.darc.clone(), + collective_cnt: collective_cnt, + marker: PhantomData, + } + } } impl GlobalRwDarc { @@ -1025,14 +1032,12 @@ impl GlobalRwDarc { inner.local_cnt.fetch_add(1, Ordering::SeqCst); //we 
add this here because to account for moving inner into d let item = unsafe { Box::from_raw(inner.item as *mut DistRwLock).into_inner() }; let d = Darc { - inner: self.darc.inner as *mut DarcInner>>>, + inner: self.darc.inner as *mut DarcInner>>, src_pe: self.darc.src_pe, // phantom: PhantomData, }; d.inner_mut() - .update_item(Box::into_raw(Box::new(Arc::new(RwLock::new(Box::new( - item, - )))))); + .update_item(Box::into_raw(Box::new(Arc::new(RwLock::new(item))))); LocalRwDarc { darc: d } } } diff --git a/src/darc/local_rw_darc.rs b/src/darc/local_rw_darc.rs index 26557efb..f6b4c9e3 100644 --- a/src/darc/local_rw_darc.rs +++ b/src/darc/local_rw_darc.rs @@ -14,7 +14,6 @@ use crate::darc::global_rw_darc::{DistRwLock, GlobalRwDarc}; use crate::darc::{Darc, DarcInner, DarcMode, WrappedInner, __NetworkDarc}; use crate::lamellae::LamellaeRDMA; use crate::lamellar_team::IntoLamellarTeam; -use crate::scheduler::SchedulerQueue; use crate::{IdError, LamellarEnv, LamellarTeam}; /// A local read-write `Darc` @@ -34,11 +33,11 @@ pub struct LocalRwDarc { serialize_with = "localrw_serialize2", deserialize_with = "localrw_from_ndarc2" )] - pub(crate) darc: Darc>>>, //we need to wrap WrLock in an Arc so we get access to ArcReadGuard and ArcWriteGuard + pub(crate) darc: Darc>>, //we need to wrap WrLock in an Arc so we get access to ArcReadGuard and ArcWriteGuard } -unsafe impl Send for LocalRwDarc {} -unsafe impl Sync for LocalRwDarc {} +unsafe impl Send for LocalRwDarc {} //we are protecting internally with an WrLock +unsafe impl Sync for LocalRwDarc {} //we are protecting internally with an WrLock impl LamellarEnv for LocalRwDarc { fn my_pe(&self) -> usize { @@ -84,7 +83,7 @@ impl crate::active_messaging::DarcSerde for LocalRwDarc { } impl LocalRwDarc { - fn inner(&self) -> &DarcInner>>> { + fn inner(&self) -> &DarcInner>> { self.darc.inner() } @@ -123,67 +122,10 @@ impl LocalRwDarc { self.inner() ); } +} - // #[doc(alias("One-sided", "onesided"))] - // /// Aquires a reader lock of this LocalRwDarc local to this PE. - // /// - // /// The current THREAD will be blocked until the lock has been acquired. 
- // /// - // /// This function will not return while any writer currentl has access to the lock - // /// - // /// Returns an RAII guard which will drop the read access of the wrlock when dropped - // /// - // /// # One-sided Operation - // /// The calling PE is only aware of its own local lock and does not require coordination with other PEs - // /// - // /// # Note - // /// the aquired lock is only with respect to this PE, the locks on the other PEs will be in their own states - // /// - // /// # Examples - // /// - // ///``` - // /// use lamellar::darc::prelude::*; - // /// use lamellar::active_messaging::prelude::*; - // /// #[lamellar::AmData(Clone)] - // /// struct DarcAm { - // /// counter: LocalRwDarc, //each pe has a local atomicusize - // /// } - // /// - // /// #[lamellar::am] - // /// impl LamellarAm for DarcAm { - // /// async fn exec(self) { - // /// let counter = self.counter.read(); //block until we get the write lock - // /// println!("the current counter value on pe {} = {}",lamellar::current_pe,counter); - // /// } - // /// } - // /// //------------- - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let counter = LocalRwDarc::new(&world, 0).unwrap(); - // /// world.exec_am_all(DarcAm {counter: counter.clone()}); - // /// let guard = counter.read(); - // /// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); - // ///``` - // pub fn read(&self) -> RwLockReadGuardArc> { - // // println!("trying to get read lock"); - // match self.darc.try_read_arc() { - // Some(guard) => { - // // println!("got read lock"); - // guard - // } - // None => { - // // println!("did not get read lock"); - // let _lock_fut = self.darc.read_arc(); - // self.darc.team().scheduler.block_on(async move { - // // println!("async trying to get read lock"); - // _lock_fut.await - // }) - // } - // } - // } - +impl LocalRwDarc { #[doc(alias("One-sided", "onesided"))] - /// TODO: UPDATE /// Aquires a reader lock of this LocalRwDarc local to this PE. /// /// The current THREAD will be blocked until the lock has been acquired. @@ -211,7 +153,7 @@ impl LocalRwDarc { /// #[lamellar::am] /// impl LamellarAm for DarcAm { /// async fn exec(self) { - /// let counter = self.counter.read().await; //block until we get the write lock + /// let counter = self.counter.read(); //block until we get the write lock /// println!("the current counter value on pe {} = {}",lamellar::current_pe,counter); /// } /// } @@ -220,76 +162,67 @@ impl LocalRwDarc { /// let my_pe = world.my_pe(); /// let counter = LocalRwDarc::new(&world, 0).unwrap(); /// world.exec_am_all(DarcAm {counter: counter.clone()}); - /// let guard = world.block_on(counter.read()); + /// let guard = counter.blocking_read(); /// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); ///``` - pub async fn read(&self) -> RwLockReadGuardArc> { + pub fn blocking_read(&self) -> RwLockReadGuardArc { + let self_clone: LocalRwDarc = self.clone(); + self.darc + .team() + .block_on(async move { self_clone.darc.read_arc().await }) + } + + #[doc(alias("One-sided", "onesided"))] + /// TODO: UPDATE + /// Aquires a reader lock of this LocalRwDarc local to this PE. + /// + /// The current THREAD will be blocked until the lock has been acquired. 
+ /// + /// This function will not return while any writer currentl has access to the lock + /// + /// Returns an RAII guard which will drop the read access of the wrlock when dropped + /// + /// # One-sided Operation + /// The calling PE is only aware of its own local lock and does not require coordination with other PEs + /// + /// # Note + /// the aquired lock is only with respect to this PE, the locks on the other PEs will be in their own states + /// + /// # Examples + /// + ///``` + /// use lamellar::darc::prelude::*; + /// use lamellar::active_messaging::prelude::*; + /// #[lamellar::AmData(Clone)] + /// struct DarcAm { + /// counter: LocalRwDarc, //each pe has a local atomicusize + /// } + /// + /// #[lamellar::am] + /// impl LamellarAm for DarcAm { + /// async fn exec(self) { + /// let counter = self.counter.read().await; //block until we get the write lock + /// println!("the current counter value on pe {} = {}",lamellar::current_pe,counter); + /// } + /// } + /// //------------- + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// world.clone().block_on(async move { + /// let counter = LocalRwDarc::new(&world, 0).unwrap(); + /// world.exec_am_all(DarcAm {counter: counter.clone()}); + /// let guard = counter.read().await; + /// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); + /// }); + ///``` + pub async fn read(&self) -> RwLockReadGuardArc { // println!("async trying to get read lock"); let lock = self.darc.read_arc().await; // println!("got async read lock"); lock } - // #[doc(alias("One-sided", "onesided"))] - // /// Aquires the writer lock of this LocalRwDarc local to this PE. - // /// - // /// The current THREAD will be blocked until the lock has been acquired. - // /// - // /// This function will not return while another writer or any readers currently have access to the lock - // /// - // /// Returns an RAII guard which will drop the write access of the wrlock when dropped - // /// - // /// # One-sided Operation - // /// The calling PE is only aware of its own local lock and does not require coordination with other PEs - // /// - // /// # Note - // /// the aquired lock is only with respect to this PE, the locks on the other PEs will be in their own states - // /// - // /// # Examples - // /// - // ///``` - // /// use lamellar::darc::prelude::*; - // /// use lamellar::active_messaging::prelude::*; - // /// #[lamellar::AmData(Clone)] - // /// struct DarcAm { - // /// counter: LocalRwDarc, //each pe has a local atomicusize - // /// } - // /// - // /// #[lamellar::am] - // /// impl LamellarAm for DarcAm { - // /// async fn exec(self) { - // /// let mut counter = self.counter.write(); //block until we get the write lock - // /// **counter += 1; - // /// } - // /// } - // /// //------------- - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let counter = LocalRwDarc::new(&world, 0).unwrap(); - // /// world.exec_am_all(DarcAm {counter: counter.clone()}); - // /// let mut guard = counter.write(); - // /// **guard += my_pe; - // ///``` - // pub fn write(&self) -> RwLockWriteGuardArc> { - // // println!("trying to get write lock"); - // match self.darc.try_write_arc() { - // Some(guard) => { - // // println!("got write lock"); - // guard - // } - // None => { - // // println!("did not get write lock"); - // let lock_fut = self.darc.write_arc(); - // self.darc.team().scheduler.block_on(async move { - // // println!("async trying to get write lock"); - // 
lock_fut.await - // }) - // } - // } - // } - #[doc(alias("One-sided", "onesided"))] - /// TODO: UPDATE /// Aquires the writer lock of this LocalRwDarc local to this PE. /// /// The current THREAD will be blocked until the lock has been acquired. @@ -317,7 +250,7 @@ impl LocalRwDarc { /// #[lamellar::am] /// impl LamellarAm for DarcAm { /// async fn exec(self) { - /// let mut counter = self.counter.write().await; //block until we get the write lock + /// let mut counter = self.counter.write(); //block until we get the write lock /// **counter += 1; /// } /// } @@ -326,10 +259,61 @@ impl LocalRwDarc { /// let my_pe = world.my_pe(); /// let counter = LocalRwDarc::new(&world, 0).unwrap(); /// world.exec_am_all(DarcAm {counter: counter.clone()}); - /// let mut guard = world.block_on(counter.write()); + /// let mut guard = counter.blocking_write(); /// **guard += my_pe; ///``` - pub async fn write(&self) -> RwLockWriteGuardArc> { + pub fn blocking_write(&self) -> RwLockWriteGuardArc { + // println!("trying to get write lock"); + let self_clone: LocalRwDarc = self.clone(); + self.darc + .team() + .block_on(async move { self_clone.darc.write_arc().await }) + } + + #[doc(alias("One-sided", "onesided"))] + /// + /// Aquires the writer lock of this LocalRwDarc local to this PE. + /// + /// The current THREAD will be blocked until the lock has been acquired. + /// + /// This function will not return while another writer or any readers currently have access to the lock + /// + /// Returns an RAII guard which will drop the write access of the wrlock when dropped + /// + /// # One-sided Operation + /// The calling PE is only aware of its own local lock and does not require coordination with other PEs + /// + /// # Note + /// the aquired lock is only with respect to this PE, the locks on the other PEs will be in their own states + /// + /// # Examples + /// + ///``` + /// use lamellar::darc::prelude::*; + /// use lamellar::active_messaging::prelude::*; + /// #[lamellar::AmData(Clone)] + /// struct DarcAm { + /// counter: LocalRwDarc, //each pe has a local atomicusize + /// } + /// + /// #[lamellar::am] + /// impl LamellarAm for DarcAm { + /// async fn exec(self) { + /// let mut counter = self.counter.write().await; //block until we get the write lock + /// **counter += 1; + /// } + /// } + /// //------------- + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// world.clone()block_on(async move{ + /// let counter = LocalRwDarc::new(&world, 0).unwrap(); + /// world.exec_am_all(DarcAm {counter: counter.clone()}); + /// let mut guard = counter.write(); + /// **guard += my_pe; + /// }) + ///``` + pub async fn write(&self) -> RwLockWriteGuardArc { // println!("async trying to get write lock"); let lock = self.darc.write_arc().await; // println!("got async write lock"); @@ -359,11 +343,7 @@ impl LocalRwDarc { /// ``` pub fn new>(team: U, item: T) -> Result, IdError> { Ok(LocalRwDarc { - darc: Darc::try_new( - team, - Arc::new(RwLock::new(Box::new(item))), - DarcMode::LocalRw, - )?, + darc: Darc::try_new(team, Arc::new(RwLock::new(item)), DarcMode::LocalRw)?, }) } @@ -378,12 +358,12 @@ impl LocalRwDarc { // } #[doc(alias = "Collective")] - /// Converts this LocalRwDarc into a regular [Darc] + /// Converts this LocalRwDarc into a [GlobalRwDarc] /// /// This is a blocking collective call amongst all PEs in the LocalRwDarc's team, only returning once every PE in the team has completed the call. 
/// /// Furthermore, this call will block while any additional references outside of the one making this call exist on each PE. It is not possible for the - /// pointed to object to wrapped by both a Darc and a LocalRwDarc simultaneously (on any PE). + /// pointed to object to wrapped by both a GlobalRwDarc and a LocalRwDarc simultaneously (on any PE). /// /// # Collective Operation /// Requires all PEs associated with the `darc` to enter the call otherwise deadlock will occur (i.e. team barriers are being called internally) @@ -395,9 +375,9 @@ impl LocalRwDarc { /// let world = LamellarWorldBuilder::new().build(); /// /// let five = LocalRwDarc::new(&world,5).expect("PE in world team"); - /// let five_as_darc = five.into_darc(); + /// let five_as_globaldarc = five.into_globalrw(); /// ``` - pub fn into_darc(self) -> Darc { + pub fn into_globalrw(self) -> GlobalRwDarc { let inner = self.inner(); // println!("into_darc"); // self.print(); @@ -406,37 +386,40 @@ impl LocalRwDarc { inner: NonNull::new(self.darc.inner as *mut DarcInner) .expect("invalid darc pointer"), }, - DarcMode::Darc, + DarcMode::GlobalRw, 0, )); // println!("after block on outstanding"); inner.local_cnt.fetch_add(1, Ordering::SeqCst); //we add this here because to account for moving inner into d - // let item = unsafe { Box::from_raw(inner.item as *mut Arc>).into_inner() }; - let mut arc_item = - unsafe { (*Box::from_raw(inner.item as *mut Arc>>)).clone() }; - - let item: Box = loop { + let mut arc_item = unsafe { (*Box::from_raw(inner.item as *mut Arc>)).clone() }; + let item: T = loop { arc_item = match Arc::try_unwrap(arc_item) { Ok(item) => break item.into_inner(), Err(arc_item) => arc_item, }; }; let d = Darc { - inner: self.darc.inner as *mut DarcInner, + inner: self.darc.inner as *mut DarcInner>, src_pe: self.darc.src_pe, // phantom: PhantomData, }; - d.inner_mut().update_item(Box::into_raw(item)); - d + d.inner_mut() + .update_item(Box::into_raw(Box::new(DistRwLock::new( + item, + self.inner().team(), + )))); + GlobalRwDarc { darc: d } } +} +impl LocalRwDarc { #[doc(alias = "Collective")] - /// Converts this LocalRwDarc into a [GlobalRwDarc] + /// Converts this LocalRwDarc into a regular [Darc] /// /// This is a blocking collective call amongst all PEs in the LocalRwDarc's team, only returning once every PE in the team has completed the call. /// /// Furthermore, this call will block while any additional references outside of the one making this call exist on each PE. It is not possible for the - /// pointed to object to wrapped by both a GlobalRwDarc and a LocalRwDarc simultaneously (on any PE). + /// pointed to object to wrapped by both a Darc and a LocalRwDarc simultaneously (on any PE). /// /// # Collective Operation /// Requires all PEs associated with the `darc` to enter the call otherwise deadlock will occur (i.e. 
team barriers are being called internally) @@ -448,9 +431,9 @@ impl LocalRwDarc { /// let world = LamellarWorldBuilder::new().build(); /// /// let five = LocalRwDarc::new(&world,5).expect("PE in world team"); - /// let five_as_globaldarc = five.into_globalrw(); + /// let five_as_darc = five.into_darc(); /// ``` - pub fn into_globalrw(self) -> GlobalRwDarc { + pub fn into_darc(self) -> Darc { let inner = self.inner(); // println!("into_darc"); // self.print(); @@ -459,30 +442,27 @@ impl LocalRwDarc { inner: NonNull::new(self.darc.inner as *mut DarcInner) .expect("invalid darc pointer"), }, - DarcMode::GlobalRw, + DarcMode::Darc, 0, )); // println!("after block on outstanding"); inner.local_cnt.fetch_add(1, Ordering::SeqCst); //we add this here because to account for moving inner into d - let mut arc_item = - unsafe { (*Box::from_raw(inner.item as *mut Arc>>)).clone() }; - let item: Box = loop { + // let item = unsafe { Box::from_raw(inner.item as *mut Arc>).into_inner() }; + let mut arc_item = unsafe { (*Box::from_raw(inner.item as *mut Arc>)).clone() }; + + let item: T = loop { arc_item = match Arc::try_unwrap(arc_item) { Ok(item) => break item.into_inner(), Err(arc_item) => arc_item, }; }; let d = Darc { - inner: self.darc.inner as *mut DarcInner>, + inner: self.darc.inner as *mut DarcInner, src_pe: self.darc.src_pe, // phantom: PhantomData, }; - d.inner_mut() - .update_item(Box::into_raw(Box::new(DistRwLock::new( - *item, - self.inner().team(), - )))); - GlobalRwDarc { darc: d } + d.inner_mut().update_item(Box::into_raw(Box::new(item))); //the darc will free this approriately + d } } @@ -495,9 +475,17 @@ impl Clone for LocalRwDarc { } } -impl fmt::Display for LocalRwDarc { +impl fmt::Display for LocalRwDarc { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - fmt::Display::fmt(&**self.darc.team().scheduler.block_on(self.read()), f) + let lock: LocalRwDarc = self.clone(); + fmt::Display::fmt( + &self + .darc + .team() + .scheduler + .block_on(async move { lock.read().await }), + f, + ) } } @@ -525,10 +513,7 @@ impl fmt::Display for LocalRwDarc { // } #[doc(hidden)] -pub fn localrw_serialize2( - localrw: &Darc>>>, - s: S, -) -> Result +pub fn localrw_serialize2(localrw: &Darc>>, s: S) -> Result where S: Serializer, { @@ -539,9 +524,7 @@ where } #[doc(hidden)] -pub fn localrw_from_ndarc2<'de, D, T>( - deserializer: D, -) -> Result>>>, D::Error> +pub fn localrw_from_ndarc2<'de, D, T>(deserializer: D) -> Result>>, D::Error> where D: Deserializer<'de>, { @@ -555,8 +538,8 @@ where Ok(Darc::from(ndarc)) } -// impl From>>>> for __NetworkDarc { -// fn from(darc: Darc>>>) -> Self { +// impl From>>> for __NetworkDarc { +// fn from(darc: Darc>>) -> Self { // // println!("rwdarc to net darc"); // // darc.print(); // let team = &darc.inner().team(); @@ -570,8 +553,8 @@ where // } // } -// impl From<&Darc>>>> for __NetworkDarc { -// fn from(darc: &Darc>>>) -> Self { +// impl From<&Darc>>> for __NetworkDarc { +// fn from(darc: &Darc>>) -> Self { // // println!("rwdarc to net darc"); // // darc.print(); // let team = &darc.inner().team(); @@ -585,14 +568,14 @@ where // } // } -// impl From<__NetworkDarc> for Darc>>> { +// impl From<__NetworkDarc> for Darc>> { // fn from(ndarc: __NetworkDarc) -> Self { // // println!("rwdarc from net darc"); // if let Some(lamellae) = LAMELLAES.read().get(&ndarc.backend) { // let darc = Darc { // inner: lamellae.local_addr(ndarc.orig_world_pe, ndarc.inner_addr) -// as *mut DarcInner>>>, +// as *mut DarcInner>>, // src_pe: ndarc.orig_team_pe, // // phantom: 
PhantomData, // }; diff --git a/src/lamellae/command_queues.rs b/src/lamellae/command_queues.rs index 76b02bcd..dbdff509 100644 --- a/src/lamellae/command_queues.rs +++ b/src/lamellae/command_queues.rs @@ -2,7 +2,7 @@ use crate::lamellae::comm::*; use crate::lamellae::{ Des, Lamellae, LamellaeComm, LamellaeRDMA, SerializedData, SerializedDataOps, }; -use crate::scheduler::{Scheduler, SchedulerQueue}; +use crate::scheduler::Scheduler; use parking_lot::Mutex; @@ -1434,7 +1434,8 @@ impl CommandQueue { // "[{:?}] recv_data submitting work", // std::thread::current().id(), // ); - scheduler2.submit_work(work_data, lamellae.clone()); + scheduler2 + .submit_remote_am(work_data, lamellae.clone()); if cmd_cnt_clone.fetch_sub(1, Ordering::SeqCst) == 1 { cq.send_free(src, cmd_buf_cmd); diff --git a/src/lamellae/rofi_lamellae.rs b/src/lamellae/rofi_lamellae.rs index ca76dc34..37bbcb2f 100644 --- a/src/lamellae/rofi_lamellae.rs +++ b/src/lamellae/rofi_lamellae.rs @@ -6,7 +6,7 @@ use crate::lamellae::{ LamellaeRDMA, Ser, SerializeHeader, SerializedData, SerializedDataOps, SERIALIZE_HEADER_LEN, }; use crate::lamellar_arch::LamellarArchRT; -use crate::scheduler::{Scheduler, SchedulerQueue}; +use crate::scheduler::Scheduler; use std::sync::atomic::{AtomicU8, Ordering}; use std::sync::Arc; diff --git a/src/lamellae/shmem_lamellae.rs b/src/lamellae/shmem_lamellae.rs index 49e50716..b4008bcf 100644 --- a/src/lamellae/shmem_lamellae.rs +++ b/src/lamellae/shmem_lamellae.rs @@ -7,7 +7,7 @@ use crate::lamellae::{ LamellaeRDMA, Ser, SerializeHeader, SerializedData, SerializedDataOps, SERIALIZE_HEADER_LEN, }; use crate::lamellar_arch::LamellarArchRT; -use crate::scheduler::{Scheduler, SchedulerQueue}; +use crate::scheduler::Scheduler; use std::sync::atomic::{AtomicU8, Ordering}; use std::sync::Arc; diff --git a/src/lamellar_request.rs b/src/lamellar_request.rs index 58cee47a..81a6b317 100755 --- a/src/lamellar_request.rs +++ b/src/lamellar_request.rs @@ -3,7 +3,7 @@ use crate::darc::Darc; use crate::lamellae::{Des, SerializedData}; use crate::lamellar_arch::LamellarArchRT; use crate::memregion::one_sided::MemRegionHandleInner; -use crate::scheduler::{Scheduler, SchedulerQueue}; +use crate::scheduler::Scheduler; use async_trait::async_trait; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use std::sync::Arc; diff --git a/src/lamellar_task_group.rs b/src/lamellar_task_group.rs index 37c64972..26e45ec0 100644 --- a/src/lamellar_task_group.rs +++ b/src/lamellar_task_group.rs @@ -4,7 +4,7 @@ use crate::lamellar_arch::LamellarArchRT; use crate::lamellar_request::*; use crate::lamellar_team::{IntoLamellarTeam, LamellarTeam, LamellarTeamRT}; use crate::memregion::one_sided::MemRegionHandleInner; -use crate::scheduler::{ReqId, Scheduler, SchedulerQueue}; +use crate::scheduler::{ReqId, Scheduler}; use crate::Darc; use crate::active_messaging::registered_active_message::{AmId, AMS_EXECS, AMS_IDS, AM_ID_START}; @@ -463,10 +463,7 @@ impl ActiveMessaging for LamellarTaskGroup { self.exec_am_local_inner(am).into_future() } - fn block_on(&self, f: F) -> F::Output - where - F: Future, - { + fn block_on(&self, f: F) -> F::Output { tracing::trace_span!("block_on").in_scope(|| self.team.scheduler.block_on(f)) } } diff --git a/src/lamellar_team.rs b/src/lamellar_team.rs index b819ac02..d238e5de 100644 --- a/src/lamellar_team.rs +++ b/src/lamellar_team.rs @@ -9,7 +9,7 @@ use crate::memregion::{ one_sided::OneSidedMemoryRegion, shared::SharedMemoryRegion, Dist, LamellarMemoryRegion, MemoryRegion, RemoteMemoryRegion, }; -use 
crate::scheduler::{ReqId, Scheduler, SchedulerQueue}; +use crate::scheduler::{ReqId, Scheduler}; #[cfg(feature = "nightly")] use crate::utils::ser_closure; @@ -485,10 +485,7 @@ impl ActiveMessaging for Arc { self.team.barrier(); } - fn block_on(&self, f: F) -> F::Output - where - F: Future, - { + fn block_on(&self, f: F) -> F::Output { assert!(self.panic.load(Ordering::SeqCst) == 0); trace_span!("block_on").in_scope(|| self.team.scheduler.block_on(f)) @@ -923,7 +920,7 @@ impl LamellarTeamRT { // what does it mean if we drop a parent team while a sub_team is valid? if let None = &self.parent { // println!("shutdown lamellae, going to shutdown scheduler"); - self.scheduler.shutdown_threads(); + self.scheduler.begin_shutdown(); self.put_dropped(); self.drop_barrier(); self.lamellae.shutdown(); @@ -1324,7 +1321,8 @@ impl LamellarTeamRT { pub(crate) fn block_on(&self, f: F) -> F::Output where - F: Future, + F: Future + Send + 'static, + F::Output: Send, { assert!(self.panic.load(Ordering::SeqCst) == 0); diff --git a/src/lamellar_world.rs b/src/lamellar_world.rs index f8116bf0..f3d7726d 100644 --- a/src/lamellar_world.rs +++ b/src/lamellar_world.rs @@ -6,7 +6,7 @@ use crate::lamellar_team::{LamellarTeam, LamellarTeamRT}; use crate::memregion::{ one_sided::OneSidedMemoryRegion, shared::SharedMemoryRegion, Dist, RemoteMemoryRegion, }; -use crate::scheduler::{create_scheduler, SchedulerQueue, SchedulerType}; +use crate::scheduler::{create_scheduler, ExecutorType}; // use log::trace; use tracing::*; @@ -75,10 +75,7 @@ impl ActiveMessaging for LamellarWorld { self.team.barrier(); } - fn block_on(&self, f: F) -> F::Output - where - F: Future, - { + fn block_on(&self, f: F) -> F::Output { trace_span!("block_on").in_scope(|| self.team_rt.scheduler.block_on(f)) } } @@ -325,7 +322,7 @@ impl Drop for LamellarWorld { /// # Examples /// ///``` -/// use lamellar::{LamellarWorldBuilder,Backend,SchedulerType}; +/// use lamellar::{LamellarWorldBuilder,Backend,ExecutorType}; /// // can also use and of the module preludes /// // use lamellar::active_messaging::prelude::*; /// // use lamellar::array::prelude::*; @@ -334,14 +331,14 @@ impl Drop for LamellarWorld { /// /// let world = LamellarWorldBuilder::new() /// .with_lamellae(Backend::Local) -/// .with_scheduler(SchedulerType::WorkStealing) +/// .with_executor(ExecutorType::LamellarWorkStealing) /// .build(); ///``` #[derive(Debug)] pub struct LamellarWorldBuilder { primary_lamellae: Backend, // secondary_lamellae: HashSet, - scheduler: SchedulerType, + executor: ExecutorType, num_threads: usize, } @@ -357,7 +354,7 @@ impl LamellarWorldBuilder { /// # Examples /// ///``` - /// use lamellar::{LamellarWorldBuilder,Backend,SchedulerType}; + /// use lamellar::{LamellarWorldBuilder,Backend,ExecutorType}; /// // can also use and of the module preludes /// // use lamellar::active_messaging::prelude::*; /// // use lamellar::array::prelude::*; @@ -366,30 +363,35 @@ impl LamellarWorldBuilder { /// /// let world = LamellarWorldBuilder::new() /// .with_lamellae(Backend::Local) - /// .with_scheduler(SchedulerType::WorkStealing) + /// .with_executor(ExecutorType::LamellarWorkStealing) /// .build(); ///``` #[tracing::instrument(skip_all)] pub fn new() -> LamellarWorldBuilder { // simple_logger::init().unwrap(); // trace!("New world builder"); - let scheduler = match std::env::var("LAMELLAR_SCHEDULER") { + let mut executor = match std::env::var("LAMELLAR_EXECUTOR") { Ok(val) => { - let scheduler = val.parse::().unwrap(); - if scheduler == 0 { - SchedulerType::WorkStealing + let 
executor = val.parse::().unwrap(); + if executor == 0 { + ExecutorType::LamellarWorkStealing } // else if scheduler == 1 { - // SchedulerType::NumaWorkStealing + // ExecutorType::NumaWorkStealing // } else if scheduler == 2 { - // SchedulerType::NumaWorkStealing2 + // ExecutorType::NumaWorkStealing2 // } else { - SchedulerType::WorkStealing + ExecutorType::LamellarWorkStealing } } - Err(_) => SchedulerType::WorkStealing, + Err(_) => ExecutorType::LamellarWorkStealing, }; + #[cfg(feature = "tokio-executor")] + { + executor = ExecutorType::Tokio; + } + let num_threads = match std::env::var("LAMELLAR_THREADS") { Ok(n) => { if let Ok(num_threads) = n.parse::() { @@ -409,7 +411,7 @@ impl LamellarWorldBuilder { LamellarWorldBuilder { primary_lamellae: Default::default(), // secondary_lamellae: HashSet::new(), - scheduler: scheduler, + executor: executor, num_threads: num_threads, } } @@ -442,24 +444,24 @@ impl LamellarWorldBuilder { // } #[doc(alias = "Collective")] - /// Specify the scheduler to use for this execution + /// Specify the executor to use for this execution /// /// # Collective Operation - /// While simply calling `with_scheduler` is not collective by itself (i.e. there is no internal barrier that would deadlock, + /// While simply calling `with_executor` is not collective by itself (i.e. there is no internal barrier that would deadlock, /// as the remote fabric is not initiated until after a call to `build`), it is necessary that the same /// parameters are used by all PEs that will exist in the world. /// /// # Examples /// ///``` - /// use lamellar::{LamellarWorldBuilder,SchedulerType}; + /// use lamellar::{LamellarWorldBuilder,ExecutorType}; /// /// let builder = LamellarWorldBuilder::new() - /// .with_scheduler(SchedulerType::WorkStealing); + /// .with_executor(ExecutorType::LamellarWorkStealing); ///``` #[tracing::instrument(skip_all)] - pub fn with_scheduler(mut self, sched: SchedulerType) -> LamellarWorldBuilder { - self.scheduler = sched; + pub fn with_executor(mut self, sched: ExecutorType) -> LamellarWorldBuilder { + self.executor = sched; self } @@ -473,7 +475,7 @@ impl LamellarWorldBuilder { /// # Examples /// ///``` - /// use lamellar::{LamellarWorldBuilder,SchedulerType}; + /// use lamellar::{LamellarWorldBuilder,ExecutorType}; /// /// let builder = LamellarWorldBuilder::new() /// .set_num_workers(10); @@ -493,11 +495,11 @@ impl LamellarWorldBuilder { /// # Examples /// ///``` - /// use lamellar::{LamellarWorldBuilder,Backend,SchedulerType}; + /// use lamellar::{LamellarWorldBuilder,Backend,ExecutorType}; /// /// let world = LamellarWorldBuilder::new() /// .with_lamellae(Backend::Local) - /// .with_scheduler(SchedulerType::WorkStealing) + /// .with_executor(ExecutorType::LamellarWorkStealing) /// .build(); ///``` #[tracing::instrument(skip_all)] @@ -518,14 +520,15 @@ impl LamellarWorldBuilder { // println!("{:?}: init_fabric", timer.elapsed()); // timer = std::time::Instant::now(); + + // we delay building the scheduler until we know the number of PEs (which is used for message aggregation) + // this could be lazyily provided but this is easy enough to do here let panic = Arc::new(AtomicU8::new(0)); let sched_new = Arc::new(create_scheduler( - self.scheduler, + self.executor, num_pes, self.num_threads, panic.clone(), - my_pe, - // teams.clone(), )); // println!("{:?}: create_scheduler", timer.elapsed()); diff --git a/src/lib.rs b/src/lib.rs index 36f74447..de0b420c 100755 --- a/src/lib.rs +++ b/src/lib.rs @@ -294,7 +294,7 @@ pub use 
crate::lamellar_team::LamellarTeam; #[doc(hidden)] pub use crate::lamellar_team::{ArcLamellarTeam, LamellarTeamRT}; pub use crate::lamellar_world::*; -pub use crate::scheduler::SchedulerType; +pub use crate::scheduler::ExecutorType; extern crate lamellar_impl; #[doc(hidden)] diff --git a/src/scheduler.rs b/src/scheduler.rs index c07ab9d1..97d85179 100755 --- a/src/scheduler.rs +++ b/src/scheduler.rs @@ -1,13 +1,33 @@ +use crate::active_messaging::batching::simple_batcher::SimpleBatcher; +use crate::active_messaging::batching::team_am_batcher::TeamAmBatcher; +use crate::active_messaging::batching::BatcherType; +use crate::active_messaging::registered_active_message::RegisteredActiveMessages; use crate::active_messaging::*; -use crate::lamellae::{Lamellae, SerializedData}; +use crate::lamellae::{Des, Lamellae, SerializedData}; use enum_dispatch::enum_dispatch; use futures::Future; -use std::sync::atomic::AtomicU8; +use std::sync::atomic::{AtomicU8, AtomicUsize, Ordering}; use std::sync::Arc; pub(crate) mod work_stealing; -use work_stealing::{WorkStealing, WorkStealingInner}; +use work_stealing::WorkStealing; + +#[cfg(feature = "tokio-executor")] +pub(crate) mod tokio; +#[cfg(feature = "tokio-executor")] +use tokio::TokioRt; + +// ACTIVE ENUM +// since atomic enums would be another dependecy + +#[repr(u8)] +#[derive(Copy, Clone, Debug, serde::Serialize, serde::Deserialize)] +pub(crate) enum SchedulerStatus { + Active, + Finished, + Panic, +} // pub(crate) mod numa_work_stealing; // use numa_work_stealing::{NumaWorkStealing, NumaWorkStealingInner}; @@ -31,115 +51,254 @@ pub(crate) struct ReqId { pub(crate) sub_id: usize, } -/// The available worker thread scheduling algorithms #[derive(Debug)] -pub enum SchedulerType { - /// The default (and currently only) scheduler, performs workstealing across all worker threads - WorkStealing, - // NumaWorkStealing, - // NumaWorkStealing2, +pub enum ExecutorType { + LamellarWorkStealing, + #[cfg(feature = "tokio-executor")] + Tokio, + // Dyn(impl LamellarExecutor), } -#[enum_dispatch(AmeSchedulerQueue)] -#[derive(Debug)] -pub(crate) enum AmeScheduler { - WorkStealingInner, - // NumaWorkStealingInner, - // NumaWorkStealing2Inner, -} #[enum_dispatch] -pub(crate) trait AmeSchedulerQueue { - fn submit_am( - &self, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - ame: Arc, - am: Am, - ); - fn submit_am_immediate( - &self, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - ame: Arc, - am: Am, - ); - fn submit_work( - &self, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - ame: Arc, - msg: SerializedData, - lamellae: Arc, - ); //serialized active message +pub(crate) trait LamellarExecutor { fn submit_task(&self, future: F) where - F: Future + Send + 'static; + F: Future + Send + 'static, + F::Output: Send; + fn submit_immediate_task(&self, future: F) where - F: Future + Send + 'static; - fn submit_immediate_task2(&self, future: F) - where - F: Future + Send + 'static; - fn exec_task(&self); + F: Future + Send + 'static, + F::Output: Send, + { + Self::submit_task(self, future) + } - fn block_on(&self, future: F) -> F::Output - where - F: Future; + fn exec_task(&self) { + std::thread::yield_now(); + } + + fn block_on(&self, future: F) -> F::Output; + fn set_max_workers(&mut self, num_workers: usize); + fn num_workers(&self) -> usize; fn shutdown(&self); - fn shutdown_threads(&self); fn force_shutdown(&self); - fn active(&self) -> bool; } 
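Taken together, the new `LamellarExecutor` trait and the `Executor` enum that follows let a backend plug in by providing `submit_task`, `block_on`, worker management, and shutdown, with `submit_immediate_task` and `exec_task` falling back to sensible defaults. The sketch below shows, in rough outline, how a Tokio-backed backend could satisfy that shape; it is not the `TokioRt` added in src/scheduler/tokio.rs (whose body is not shown in this excerpt), and `MyTokioExecutor` is a made-up name used only for illustration.

```rust
use std::future::Future;
use tokio::runtime::Runtime;

struct MyTokioExecutor {
    rt: Runtime,
    num_workers: usize,
}

impl MyTokioExecutor {
    fn new(num_workers: usize) -> Self {
        // Build a multi-threaded Tokio runtime with the requested worker count.
        let rt = tokio::runtime::Builder::new_multi_thread()
            .worker_threads(num_workers)
            .enable_all()
            .build()
            .expect("failed to build tokio runtime");
        Self { rt, num_workers }
    }

    // Mirrors LamellarExecutor::submit_task: fire-and-forget a future on the runtime.
    fn submit_task<F>(&self, future: F)
    where
        F: Future + Send + 'static,
        F::Output: Send,
    {
        self.rt.spawn(async move {
            future.await;
        });
    }

    // Mirrors LamellarExecutor::block_on: drive a future to completion on this thread.
    fn block_on<F: Future>(&self, future: F) -> F::Output {
        self.rt.block_on(future)
    }

    fn num_workers(&self) -> usize {
        self.num_workers
    }
}

fn main() {
    let exec = MyTokioExecutor::new(4);
    exec.submit_task(async { println!("hello from a spawned task") });
    let sum = exec.block_on(async { 1 + 2 });
    assert_eq!(sum, 3);
}
```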
-#[enum_dispatch(SchedulerQueue)] +#[enum_dispatch(LamellarExecutor)] #[derive(Debug)] -pub(crate) enum Scheduler { - WorkStealing(Arc), - // NumaWorkStealing, - // NumaWorkStealing2, +pub(crate) enum Executor { + WorkStealing(WorkStealing), + #[cfg(feature = "tokio-executor")] + Tokio(TokioRt), } -#[enum_dispatch] -pub(crate) trait SchedulerQueue { - fn submit_am(&self, am: Am); //serialized active message - fn submit_am_immediate(&self, am: Am); //serialized active message - fn submit_work(&self, msg: SerializedData, lamellae: Arc); //serialized active message - fn submit_task(&self, future: F) - where - F: Future + Send + 'static; - fn submit_immediate_task(&self, future: F) - where - F: Future + Send + 'static; - fn submit_immediate_task2(&self, future: F) - where - F: Future + Send + 'static; - fn submit_task_node(&self, future: F, node: usize) + +#[derive(Debug)] +pub(crate) struct Scheduler { + executor: Arc, + active_message_engine: RegisteredActiveMessages, //we can eventually abstract this around the ActiveMessageEngine trait but no need currently + num_ams: Arc, + max_ams: Arc, + num_tasks: Arc, + max_tasks: Arc, + am_stall_mark: Arc, + status: Arc, + panic: Arc, +} + +impl Scheduler { + pub(crate) fn new( + executor: Executor, + active_message_engine: RegisteredActiveMessages, + am_stall_mark: Arc, + status: Arc, + panic: Arc, + ) -> Self { + Self { + executor: Arc::new(executor), + active_message_engine, + num_ams: Arc::new(AtomicUsize::new(0)), + max_ams: Arc::new(AtomicUsize::new(0)), + num_tasks: Arc::new(AtomicUsize::new(0)), + max_tasks: Arc::new(AtomicUsize::new(0)), + am_stall_mark, + status, + panic, + } + } + pub(crate) fn submit_am(&self, am: Am) { + let num_ams = self.num_ams.clone(); + let max_ams = self.max_ams.clone(); + let am_stall_mark = self.am_stall_mark.fetch_add(1, Ordering::Relaxed); + let ame = self.active_message_engine.clone(); + let executor = self.executor.clone(); + let am_future = async move { + num_ams.fetch_add(1, Ordering::Relaxed); + max_ams.fetch_add(1, Ordering::Relaxed); + // println!("[{:?}] submit work exec req {:?} {:?} TaskId: {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task); + ame.process_msg(am, executor, am_stall_mark, false).await; + num_ams.fetch_sub(1, Ordering::Relaxed); + // println!("[{:?}] submit work done req {:?} {:?} TaskId: {:?} {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task,reqs); + }; + self.executor.submit_task(am_future); + } + + #[allow(dead_code)] + pub(crate) fn submit_am_immediate(&self, am: Am) { + let num_ams = self.num_ams.clone(); + let max_ams = self.max_ams.clone(); + let am_stall_mark = self.am_stall_mark.fetch_add(1, Ordering::Relaxed); + let ame = self.active_message_engine.clone(); + let executor = self.executor.clone(); + let am_future = async move { + num_ams.fetch_add(1, Ordering::Relaxed); + max_ams.fetch_add(1, Ordering::Relaxed); + // println!("[{:?}] submit work exec req {:?} {:?} TaskId: {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task); + ame.process_msg(am, executor, am_stall_mark, false).await; + num_ams.fetch_sub(1, Ordering::Relaxed); + // println!("[{:?}] submit work done req {:?} {:?} TaskId: {:?} {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task,reqs); + }; + self.executor.submit_immediate_task(am_future); + } + + pub(crate) fn 
submit_remote_am(&self, data: SerializedData, lamellae: Arc) { + let num_ams = self.num_ams.clone(); + let max_ams = self.max_ams.clone(); + let ame = self.active_message_engine.clone(); + let executor = self.executor.clone(); + let am_future = async move { + num_ams.fetch_add(1, Ordering::Relaxed); + max_ams.fetch_add(1, Ordering::Relaxed); + // println!("[{:?}] submit work exec req {:?} {:?} TaskId: {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task); + if let Some(header) = data.deserialize_header() { + let msg = header.msg; + ame.exec_msg(msg, data, lamellae, executor).await; + } else { + data.print(); + panic!("should i be here?"); + } + num_ams.fetch_sub(1, Ordering::Relaxed); + // println!("[{:?}] submit work done req {:?} {:?} TaskId: {:?} {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task,reqs); + }; + self.executor.submit_task(am_future); + } + + pub(crate) fn submit_task(&self, task: F) where - F: Future + Send + 'static; - fn exec_task(&self); - fn block_on(&self, future: F) -> F::Output + F: Future + Send + 'static, + { + let num_tasks = self.num_tasks.clone(); + let max_tasks = self.max_tasks.clone(); + let future = async move { + num_tasks.fetch_add(1, Ordering::Relaxed); + max_tasks.fetch_add(1, Ordering::Relaxed); + task.await; + num_tasks.fetch_sub(1, Ordering::Relaxed); + }; + self.executor.submit_task(future); + } + + pub(crate) fn submit_immediate_task(&self, task: F) where - F: Future; - fn shutdown(&self); - fn shutdown_threads(&self); - fn force_shutdown(&self); - fn active(&self) -> bool; - fn num_workers(&self) -> usize; + F: Future + Send + 'static, + { + let num_tasks = self.num_tasks.clone(); + let max_tasks = self.max_tasks.clone(); + let future = async move { + num_tasks.fetch_add(1, Ordering::Relaxed); + max_tasks.fetch_add(1, Ordering::Relaxed); + task.await; + num_tasks.fetch_sub(1, Ordering::Relaxed); + }; + self.executor.submit_immediate_task(future); + } + + pub(crate) fn exec_task(&self) { + if std::thread::current().id() == *crate::MAIN_THREAD { + self.executor.exec_task(); + } else { + std::thread::yield_now(); + } + } + + pub(crate) fn block_on(&self, task: F) -> F::Output { + self.executor.block_on(task) + } + + #[allow(dead_code)] + pub(crate) fn get_executor(&self) -> Arc { + self.executor.clone() + } + + pub(crate) fn active(&self) -> bool { + self.status.load(Ordering::SeqCst) == SchedulerStatus::Active as u8 + || self.num_tasks.load(Ordering::SeqCst) > 3 // the Lamellae Comm Task, Lamellae Alloc Task, Lamellar Error Task + } + pub(crate) fn num_workers(&self) -> usize { + self.executor.num_workers() + } + pub(crate) fn begin_shutdown(&self) { + self.status + .store(SchedulerStatus::Finished as u8, Ordering::SeqCst); + } + pub(crate) fn shutdown(&self) { + let mut timer = std::time::Instant::now(); + while self.panic.load(Ordering::SeqCst) == 0 && self.num_tasks.load(Ordering::Relaxed) > 3 + //TODO maybe this should be > 2 + { + //the Lamellae Comm Task, Lamellae Alloc Task, Lamellar Error Task + if timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { + println!( + "shurtdown timeout, tasks remaining: {:?} panic: {:?}", + self.num_tasks.load(Ordering::Relaxed), + self.panic.load(Ordering::SeqCst), + ); + timer = std::time::Instant::now(); + } + std::thread::yield_now() + } + self.executor.shutdown(); + } + pub(crate) fn force_shutdown(&self) { + self.status + .store(SchedulerStatus::Panic as u8, Ordering::SeqCst); + 
self.executor.force_shutdown(); + } } pub(crate) fn create_scheduler( - sched: SchedulerType, + executor: ExecutorType, num_pes: usize, num_workers: usize, panic: Arc, - my_pe: usize, - // teams: Arc>>>, ) -> Scheduler { - match sched { - SchedulerType::WorkStealing => Scheduler::WorkStealing(Arc::new( - work_stealing::WorkStealing::new(num_pes, num_workers, panic, my_pe), - )), // SchedulerType::NumaWorkStealing => { - // Scheduler::NumaWorkStealing(numa_work_stealing::NumaWorkStealing::new(num_pes)) - // } - // SchedulerType::NumaWorkStealing2 => { - // Scheduler::NumaWorkStealing2(numa_work_stealing2::NumaWorkStealing2::new(num_pes)) - // } - } + let am_stall_mark = Arc::new(AtomicUsize::new(0)); + let status = Arc::new(AtomicU8::new(SchedulerStatus::Active as u8)); + let executor = match executor { + ExecutorType::LamellarWorkStealing => { + WorkStealing::new(num_workers, status.clone(), panic.clone()).into() + } + #[cfg(feature = "tokio-executor")] + ExecutorType::Tokio => TokioRt::new(num_workers).into(), + }; + + let batcher = match std::env::var("LAMELLAR_BATCHER") { + Ok(n) => { + let n = n.parse::().unwrap(); + if n == 1 { + BatcherType::Simple(SimpleBatcher::new(num_pes, am_stall_mark.clone())) + } else { + BatcherType::TeamAm(TeamAmBatcher::new(num_pes, am_stall_mark.clone())) + } + } + Err(_) => BatcherType::TeamAm(TeamAmBatcher::new(num_pes, am_stall_mark.clone())), + }; + Scheduler::new( + executor, + RegisteredActiveMessages::new(batcher), + am_stall_mark, + status, + panic, + ) } diff --git a/src/scheduler/numa_work_stealing.rs b/src/scheduler/numa_work_stealing.rs index 7e94e6ce..c2f5a043 100644 --- a/src/scheduler/numa_work_stealing.rs +++ b/src/scheduler/numa_work_stealing.rs @@ -235,10 +235,7 @@ impl AmeSchedulerQueue for NumaWorkStealingInner { task.detach(); } - fn block_on(&self, future: F) -> F::Output - where - F: Future, - { + fn block_on(&self, future: F) -> F::Output { let work_inj = self.work_inj[self .local_work_inj .get_or(|| AtomicUsize::new(0)) @@ -503,7 +500,7 @@ impl NumaWorkStealingInner { #[derive(Debug)] pub(crate) struct NumaWorkStealing { - inner: Arc, + inner: &(impl SchedulerQueue + Sync + std::fmt::Debug), ame: Arc, } impl NumaWorkStealing { diff --git a/src/scheduler/numa_work_stealing2.rs b/src/scheduler/numa_work_stealing2.rs index 8f25b182..ec82c3ef 100644 --- a/src/scheduler/numa_work_stealing2.rs +++ b/src/scheduler/numa_work_stealing2.rs @@ -431,7 +431,7 @@ thread_local! 
{ #[derive(Debug)] pub(crate) struct NumaWorkStealing2 { - inners: Vec>, + inners: Vec<&(impl SchedulerQueue + Sync + std::fmt::Debug)>, ames: Vec>, node_mask: usize, } diff --git a/src/scheduler/tokio.rs b/src/scheduler/tokio.rs new file mode 100644 index 00000000..f9e14ac1 --- /dev/null +++ b/src/scheduler/tokio.rs @@ -0,0 +1,88 @@ +use crate::scheduler::{LamellarExecutor, SchedulerStatus}; + +use tokio::runtime::Runtime; + +use tracing::*; + +use async_task::{Builder, Runnable}; +use core_affinity::CoreId; +use crossbeam::deque::Worker; +use futures::Future; +use futures_lite::FutureExt; +use rand::prelude::*; +use std::panic; +use std::process; +use std::sync::atomic::{AtomicU8, AtomicUsize, Ordering}; +use std::sync::Arc; //, Weak}; +use std::thread; + +static TASK_ID: AtomicUsize = AtomicUsize::new(0); + +#[derive(Debug)] +pub(crate) struct TokioRt { + max_num_threads: usize, + rt: Runtime, +} + +impl LamellarExecutor for TokioRt { + fn submit_task(&self, task: F) + where + F: Future + Send + 'static, + F::Output: Send, + { + trace_span!("submit_task").in_scope(|| { + self.rt.spawn(async move { task.await }); + }); + } + + fn submit_immediate_task(&self, task: F) + where + F: Future + Send + 'static, + F::Output: Send, + { + trace_span!("submit_task").in_scope(|| { + self.rt.spawn(async move { task.await }); + }); + } + + fn block_on(&self, task: F) -> F::Output { + trace_span!("block_on").in_scope(|| self.rt.block_on(task)) + } + + #[tracing::instrument(skip_all)] + fn shutdown(&self) { + // i think we just let tokio do this on drop + } + + #[tracing::instrument(skip_all)] + fn force_shutdown(&self) { + // i think we just let tokio do this on drop + } + + #[tracing::instrument(skip_all)] + fn exec_task(&self) { + // I dont think tokio has a way to do this + } + + fn set_max_workers(&mut self, num_workers: usize) { + self.max_num_threads = num_workers; + } + + fn num_workers(&self) -> usize { + self.max_num_threads + } +} + +impl TokioRt { + pub(crate) fn new(num_workers: usize) -> TokioRt { + // println!("New TokioRT with {} workers", num_workers); + TokioRt { + max_num_threads: num_workers + 1, //LAMELLAR_THREADS = num_workers + 1, so for tokio runtime, we actually want num_workers + 1 worker threads as block_on will not do anywork on the main thread (i think)... 
+ rt: tokio::runtime::Builder::new_multi_thread() + .worker_threads(num_workers + 1) + .enable_all() + .build() + .unwrap(), + } + } +} diff --git a/src/scheduler/work_stealing.rs b/src/scheduler/work_stealing.rs index 6fbfa166..e7a06fe4 100644 --- a/src/scheduler/work_stealing.rs +++ b/src/scheduler/work_stealing.rs @@ -1,10 +1,4 @@ -use crate::active_messaging::{ActiveMessageEngine, ActiveMessageEngineType, Am}; -use crate::lamellae::{Des, Lamellae, SerializedData}; -use crate::scheduler::batching::simple_batcher::SimpleBatcher; -use crate::scheduler::batching::team_am_batcher::TeamAmBatcher; -use crate::scheduler::batching::BatcherType; -use crate::scheduler::registered_active_message::RegisteredActiveMessages; -use crate::scheduler::{AmeScheduler, AmeSchedulerQueue, SchedulerQueue}; +use crate::scheduler::{LamellarExecutor, SchedulerStatus}; use tracing::*; @@ -13,29 +7,14 @@ use core_affinity::CoreId; use crossbeam::deque::Worker; use futures::Future; use futures_lite::FutureExt; -// use parking_lot::Mutex; use rand::prelude::*; -// use std::collections::BTreeMap; use std::panic; use std::process; use std::sync::atomic::{AtomicU8, AtomicUsize, Ordering}; use std::sync::Arc; //, Weak}; use std::thread; -// use std::time::Instant; -// use std::time::Instant; - -const ACTIVE: u8 = 0; -const FINISHED: u8 = 1; -const PANIC: u8 = 2; static TASK_ID: AtomicUsize = AtomicUsize::new(0); - -// static LAST_PRINTED_TASKS: AtomicUsize = AtomicUsize::new(0); - -// static OUTSTANDING_REQS: Mutex> = parking_lot::const_mutex(HashMap::new()); -// lazy_static!{ static ref OUTSTANDING_REQS: Mutex> = Mutex::new(BTreeMap::new()); } - - #[derive(Debug)] pub(crate) struct WorkStealingThread { imm_inj: Arc>>, @@ -43,52 +22,37 @@ pub(crate) struct WorkStealingThread { work_stealers: Vec>>, work_q: Worker>, work_flag: Arc, - active: Arc, + status: Arc, panic: Arc, } - - impl WorkStealingThread { #[tracing::instrument(skip_all)] fn run( worker: WorkStealingThread, active_cnt: Arc, - num_tasks: Arc, - _max_tasks: Arc, + // num_tasks: Arc, id: CoreId, - _my_pe: usize, ) -> thread::JoinHandle<()> { let builder = thread::Builder::new().name("worker_thread".into()); builder .spawn(move || { // println!("TestSchdulerWorker thread running {:?} core: {:?}", std::thread::current().id(), id); - // let mut num_task_executed = 0; let _span = trace_span!("WorkStealingThread::run"); core_affinity::set_for_current(id); active_cnt.fetch_add(1, Ordering::SeqCst); let mut rng = rand::thread_rng(); let t = rand::distributions::Uniform::from(0..worker.work_stealers.len()); let mut timer = std::time::Instant::now(); - // let mut cur_tasks = num_tasks.load(Ordering::SeqCst); while worker.panic.load(Ordering::SeqCst) == 0 - && (worker.active.load(Ordering::SeqCst) == ACTIVE - || !(worker.work_q.is_empty() - && worker.work_inj.is_empty() - && worker.imm_inj.is_empty()) - || num_tasks.load(Ordering::SeqCst) > 1) + && ( + worker.status.load(Ordering::SeqCst) == SchedulerStatus::Active as u8 + || !(worker.work_q.is_empty() + && worker.work_inj.is_empty() + && worker.imm_inj.is_empty()) + // || num_tasks.load(Ordering::SeqCst) > 1 + ) { - // let ot = Instant::now(); - // if cur_tasks != num_tasks.load(Ordering::SeqCst){ - // println!( - // "work_q size {:?} work inj size {:?} num_tasks {:?}", - // worker.work_q.len(), - // worker.work_inj.len(), - // num_tasks.load(Ordering::SeqCst) - // ); - // cur_tasks = num_tasks.load(Ordering::SeqCst); - - // } let omsg = if !worker.imm_inj.is_empty() { worker.imm_inj.steal().success() } else { @@ 
-113,50 +77,32 @@ impl WorkStealingThread { }; if let Some(runnable) = omsg { - if worker.active.load(Ordering::SeqCst) == FINISHED - && timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT + if worker.status.load(Ordering::SeqCst) == SchedulerStatus::Finished as u8 + && timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { println!("runnable {:?}", runnable); println!( - "work_q size {:?} work inj size {:?} num_tasks {:?}", + "work_q size {:?} work inj size {:?}", // num_tasks {:?}", worker.work_q.len(), worker.work_inj.len(), - num_tasks.load(Ordering::SeqCst) + // num_tasks.load(Ordering::SeqCst) ); timer = std::time::Instant::now(); } - // *OUTSTANDING_REQS.lock().entry(*runnable.metadata()).or_insert(0) += 1; - // if LAST_PRINTED_TASKS.load(Ordering::Relaxed) != num_tasks.load(Ordering::Relaxed) { - // LAST_PRINTED_TASKS.store(num_tasks.load(Ordering::Relaxed), Ordering::Relaxed); - // let work_stealers_lens = worker.work_stealers.iter().map(|x| x.len()).collect::>(); - // println!("[{:?}] (worker thread) Executing task {:?}, num_tasks: {:?} {:?} {:?} {work_stealers_lens:?} {:?}", std::thread::current().id(), runnable.metadata(),num_tasks.load(Ordering::Relaxed), worker.imm_inj.len(),worker.work_inj.len(), OUTSTANDING_REQS.lock()); - // } runnable.run(); } - if worker.active.load(Ordering::SeqCst) == FINISHED - && timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT + if worker.status.load(Ordering::SeqCst) == SchedulerStatus::Finished as u8 + && timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT && (worker.work_q.len() > 0 || worker.work_inj.len() > 0) { println!( - "work_q size {:?} work inj size {:?} num_tasks {:?}", + "work_q size {:?} work inj size {:?} ", // num_tasks {:?}", worker.work_q.len(), worker.work_inj.len(), - num_tasks.load(Ordering::SeqCst) + // num_tasks.load(Ordering::SeqCst) ); timer = std::time::Instant::now(); } - // if timer.elapsed().as_secs_f64() > 10.0 { - // println!( - // "[{:?}] work_q size {:?} work inj size {:?} num_tasks {:?} {:?} {:?}", - // std::thread::current().id(), - // worker.work_q.len(), - // worker.work_inj.len(), - // num_tasks.load(Ordering::SeqCst), - // worker.active.load(Ordering::SeqCst) == FINISHED, - // OUTSTANDING_REQS.lock() - // ); - // timer = std::time::Instant::now() - // } std::thread::yield_now(); } active_cnt.fetch_sub(1, Ordering::SeqCst); @@ -167,334 +113,98 @@ impl WorkStealingThread { } #[derive(Debug)] -pub(crate) struct WorkStealingInner { +pub(crate) struct WorkStealing { + max_num_threads: usize, threads: Vec>, imm_inj: Arc>>, work_inj: Arc>>, work_stealers: Vec>>, work_flag: Arc, - active: Arc, + status: Arc, active_cnt: Arc, - num_tasks: Arc, - max_tasks: Arc, - stall_mark: Arc, panic: Arc, } -impl AmeSchedulerQueue for WorkStealingInner { - #[tracing::instrument(skip_all)] - fn submit_am( - //unserialized request - &self, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - ame: Arc, - am: Am, - ) { - // println!("[{:?}] submitting_req", std::thread::current().id()); - // println!("submit req {:?}",self.num_tasks.load(Ordering::Relaxed)+1); - let num_tasks = self.num_tasks.clone(); - let max_tasks = self.max_tasks.clone(); - let stall_mark = self.stall_mark.fetch_add(1, Ordering::Relaxed); - let future = move |_cur_task| async move { - num_tasks.fetch_add(1, Ordering::Relaxed); - max_tasks.fetch_add(1, Ordering::Relaxed); - // println!("[{:?}] submit am exec req {:?} {:?} TaskId: {:?}", 
std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),_cur_task); - ame.process_msg(am, scheduler, stall_mark, false).await; - num_tasks.fetch_sub(1, Ordering::Relaxed); - // let mut reqs = OUTSTANDING_REQS.lock(); - // reqs.remove(cur_task); - // println!("[{:?}] submit am done {:?} {:?} TaskId: {:?} ", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),_cur_task); - }; - let work_inj = self.work_inj.clone(); - // let schedule = move |runnable| work_inj.push(runnable); - let schedule = move |runnable| work_inj.push(runnable); - // let (runnable, task) = unsafe { async_task::spawn(future, schedule) }; //safe as contents are sync+send, and no borrowed variables - let (runnable, task) = unsafe { Builder::new().metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)).spawn_unchecked( future, schedule) }; - // println!("[{:?}] submit am schedule task {:?} {:?} {:?}", std::thread::current().id(),runnable.metadata(),self.num_tasks.load(Ordering::Relaxed),self.max_tasks.load(Ordering::Relaxed)); - runnable.schedule(); - task.detach(); - } - - #[tracing::instrument(skip_all)] - fn submit_am_immediate( - //unserialized request - &self, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - ame: Arc, - am: Am, - ) { - // println!("submitting_req"); - // println!("submit req {:?}",self.num_tasks.load(Ordering::Relaxed)+1); - let num_tasks = self.num_tasks.clone(); - let max_tasks = self.max_tasks.clone(); - let stall_mark = self.stall_mark.fetch_add(1, Ordering::Relaxed); - let future = move |_cur_task| async move { - num_tasks.fetch_add(1, Ordering::Relaxed); - max_tasks.fetch_add(1, Ordering::Relaxed); - // println!("[{:?}] submit am imm exec req {:?} {:?} TaskId: {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task); - ame.process_msg(am, scheduler, stall_mark, true).await; - num_tasks.fetch_sub(1, Ordering::Relaxed); - - // let mut reqs = OUTSTANDING_REQS.lock(); - // reqs.remove(cur_task); - // println!("[{:?}] submit am imm done {:?} {:?} TaskId: {:?} {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task, reqs); - }; - let work_inj = self.work_inj.clone(); - // let schedule = move |runnable| work_inj.push(runnable); - let schedule = move |runnable| work_inj.push(runnable); - // let (runnable, task) = unsafe { async_task::spawn(future, schedule) }; //safe as contents are sync+send, and no borrowed variables - let (runnable, task) = unsafe { Builder::new().metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)).spawn_unchecked(future, schedule) }; - // println!("[{:?}] submit am imm running task {:?} {:?} {:?}", std::thread::current().id(),runnable.metadata(),self.num_tasks.load(Ordering::Relaxed),self.max_tasks.load(Ordering::Relaxed)); - // *OUTSTANDING_REQS.lock().entry(*runnable.metadata()).or_insert(0) += 1; - runnable.run(); - task.detach(); - } - - //this is a serialized request - #[tracing::instrument(skip_all)] - fn submit_work( - &self, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - ame: Arc, - data: SerializedData, - lamellae: Arc, - ) { - // let work_inj = self.work_inj.clone(); - // println!("submit work {:?}", self.num_tasks.load(Ordering::Relaxed)); - let num_tasks = self.num_tasks.clone(); - let max_tasks = self.max_tasks.clone(); - let future = move|_cur_task|async move { - num_tasks.fetch_add(1, Ordering::Relaxed); - 
max_tasks.fetch_add(1, Ordering::Relaxed); - // println!("[{:?}] submit work exec req {:?} {:?} TaskId: {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task); - if let Some(header) = data.deserialize_header() { - let msg = header.msg; - ame.exec_msg(msg, data, lamellae, scheduler).await; - } else { - data.print(); - panic!("should i be here?"); - } - num_tasks.fetch_sub(1, Ordering::Relaxed); - // let mut reqs = OUTSTANDING_REQS.lock(); - // reqs.remove(cur_task); - // println!("[{:?}] submit work done req {:?} {:?} TaskId: {:?} {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task,reqs); - }; - let work_inj = self.work_inj.clone(); - // let schedule = move |runnable| work_inj.push(runnable); - let schedule = move |runnable| work_inj.push(runnable); - // let (runnable, task) = unsafe { async_task::spawn(future, schedule) }; //safe as contents are sync+send, and no borrowed variables - let (runnable, task) = unsafe { Builder::new().metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)).spawn_unchecked(future, schedule) }; - // println!("[{:?}] submit work schedule task {:?} {:?} {:?}", std::thread::current().id(),runnable.metadata(),self.num_tasks.load(Ordering::Relaxed),self.max_tasks.load(Ordering::Relaxed)); - - runnable.schedule(); - task.detach(); - } - - fn submit_task(&self, future: F) +impl LamellarExecutor for WorkStealing { + fn submit_task(&self, task: F) where - F: Future + Send + 'static, + F: Future + Send + 'static, + F::Output: Send, { trace_span!("submit_task").in_scope(|| { - let num_tasks = self.num_tasks.clone(); - let max_tasks = self.max_tasks.clone(); - let future2 = move|_cur_task: &_| async move { - num_tasks.fetch_add(1, Ordering::Relaxed); - max_tasks.fetch_add(1, Ordering::Relaxed); - // println!("[{:?}] submit task exec req {:?} {:?} TaskId: {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task); - future.await; - num_tasks.fetch_sub(1, Ordering::Relaxed); - // let mut reqs = OUTSTANDING_REQS.lock(); - // reqs.remove(cur_task); - // println!("[{:?}] submit task done {:?} {:?} TaskId: {:?} {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task,reqs); - }; let work_inj = self.work_inj.clone(); - // let schedule = move |runnable| work_inj.push(runnable); let schedule = move |runnable| work_inj.push(runnable); - // let (runnable, task) = unsafe { async_task::spawn(future2, schedule) }; //safe //safe as contents are sync+send... 
may need to do something to enforce lifetime bounds - let (runnable, task) = Builder::new().metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)).spawn(future2, schedule) ; - // println!("[{:?}] submit task schedule task {:?} {:?} {:?}", std::thread::current().id(),runnable.metadata(),self.num_tasks.load(Ordering::Relaxed),self.max_tasks.load(Ordering::Relaxed)); + let (runnable, task) = Builder::new() + .metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)) + .spawn(move |_task_id| async move { task.await }, schedule); runnable.schedule(); task.detach(); }); } - fn submit_immediate_task(&self, future: F) + fn submit_immediate_task(&self, task: F) where - F: Future + Send + 'static, + F: Future + Send + 'static, + F::Output: Send, { trace_span!("submit_task").in_scope(|| { - let num_tasks = self.num_tasks.clone(); - let max_tasks = self.max_tasks.clone(); - let future2 = move |_cur_task: &_| async move { - // println!("exec task {:?}",num_tasks.load(Ordering::Relaxed)+1); - num_tasks.fetch_add(1, Ordering::Relaxed); - max_tasks.fetch_add(1, Ordering::Relaxed); - // println!("[{:?}] submit imm task exec req {:?} {:?} TaskId: {:?} ", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task); - - future.await; - num_tasks.fetch_sub(1, Ordering::Relaxed); - // let mut reqs = OUTSTANDING_REQS.lock(); - // reqs.remove(cur_task); - // println!("[{:?}] submit imm task exec done {:?} {:?} TaskId: {:?} {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task,reqs); - }; - let work_inj = self.work_inj.clone(); - // let schedule = move |runnable| work_inj.push(runnable); - let schedule = move |runnable| work_inj.push(runnable); - // let (runnable, task) = unsafe { async_task::spawn(future2, schedule) }; //safe //safe as contents are sync+send... may need to do something to enforce lifetime bounds - let (runnable, task) = Builder::new().metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)).spawn(future2, schedule) ; - // println!("[{:?}] submit imm task schedule task {:?} {:?} {:?}", std::thread::current().id(),runnable.metadata(),self.num_tasks.load(Ordering::Relaxed),self.max_tasks.load(Ordering::Relaxed)); - // *OUTSTANDING_REQS.lock().entry(*runnable.metadata()).or_insert(0) += 1; - runnable.run(); //try to run immediately - task.detach(); - }); - } - - fn submit_immediate_task2(&self, future: F) - where - F: Future + Send + 'static, - { - trace_span!("submit_task").in_scope(|| { - let num_tasks = self.num_tasks.clone(); - let max_tasks = self.max_tasks.clone(); - let future2 = move|_cur_task: &_| async move { - num_tasks.fetch_add(1, Ordering::Relaxed); - max_tasks.fetch_add(1, Ordering::Relaxed); - // println!("[{:?}] submit imm2 task exec req {:?} {:?} TaskId: {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task); - - future.await; - num_tasks.fetch_sub(1, Ordering::Relaxed); - // let mut reqs = OUTSTANDING_REQS.lock(); - // reqs.remove(cur_task); - // println!("[{:?}] submit imm2 task exec done {:?} {:?} TaskId: {:?} {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task, reqs); - }; let imm_inj = self.imm_inj.clone(); - // let schedule = move |runnable| imm_inj.push(runnable); let schedule = move |runnable| imm_inj.push(runnable); - // let (runnable, task) = unsafe { async_task::spawn(future2, schedule) }; //safe //safe as contents are sync+send... 
may need to do something to enforce lifetime bounds - let (runnable, task) = Builder::new().metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)).spawn(future2, schedule) ; - // println!("[{:?}] submit imm2 task schedule task {:?} {:?} {:?}", std::thread::current().id(),runnable.metadata(),self.num_tasks.load(Ordering::Relaxed),self.max_tasks.load(Ordering::Relaxed)); + let (runnable, task) = Builder::new() + .metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)) + .spawn(move |_task_id| async move { task.await }, schedule); runnable.schedule(); //try to run immediately task.detach(); }); } - fn block_on(&self, future: F) -> F::Output - where - F: Future, - { + fn block_on(&self, task: F) -> F::Output { trace_span!("block_on").in_scope(|| { - // println!( - // "[{:?}] work stealing block on -- num tasks {:?} max tasks {:?} tasks executed {:?}", - // std::thread::current().id(), - // self.num_tasks.load(Ordering::Relaxed), - // self.max_tasks.load(Ordering::Relaxed), - // 0 - // ); - let num_tasks = self.num_tasks.clone(); - let max_tasks = self.max_tasks.clone(); - let future2 = move|_cur_task| async move { - num_tasks.fetch_add(1, Ordering::Relaxed); - max_tasks.fetch_add(1, Ordering::Relaxed); - // println!("[{:?}] block on task exec req {:?} {:?} TaskId: {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task); - - let res = future.await; - num_tasks.fetch_sub(1, Ordering::Relaxed); - // let mut reqs = OUTSTANDING_REQS.lock(); - // reqs.remove(cur_task); - // println!("[{:?}] block on task exec done {:?} {:?} TaskId: {:?} {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task, reqs); - res - }; let work_inj = self.work_inj.clone(); - // let schedule = move |runnable| work_inj.push(runnable); let schedule = move |runnable| work_inj.push(runnable); - - // let (runnable, mut task) = unsafe { async_task::spawn(future, schedule) }; //safe //safe as contents are sync+send... 
may need to do something to enforce lifetime bounds - let (runnable, mut task) = unsafe { Builder::new().metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)).spawn_unchecked(future2, schedule) }; + let (runnable, mut task) = unsafe { + Builder::new() + .metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)) + .spawn_unchecked(move |_task_id| async move { task.await }, schedule) + }; let waker = runnable.waker(); - // *OUTSTANDING_REQS.lock().entry(*runnable.metadata()).or_insert(0) += 1; runnable.run(); //try to run immediately - // let mut s = std::time::Instant::now(); - // let mut cnt = 0; while !task.is_finished() { - self.exec_task(); - // if s.elapsed().as_secs() > 10 { - // println!( - // "[{:?}] work stealing block on timeout -- num tasks {:?} max tasks {:?} tasks executed {:?} task id{:?} {:?}", - // std::thread::current().id(), - // self.num_tasks.load(Ordering::Relaxed), - // self.max_tasks.load(Ordering::Relaxed), - // cnt, - // task.metadata(), - // OUTSTANDING_REQS.lock(), - // ); - // s = std::time::Instant::now(); - // break; - // } - // cnt += 1; - // std::thread::yield_now(); + self.exec_task(); //try to execute another task while this one is not ready } let cx = &mut async_std::task::Context::from_waker(&waker); if let async_std::task::Poll::Ready(output) = task.poll(cx) { - // println!( - // "[{:?}] work stealing block on done -- num tasks {:?} max tasks {:?} tasks executed {:?} task id{:?}", - // std::thread::current().id(), - // self.num_tasks.load(Ordering::Relaxed), - // self.max_tasks.load(Ordering::Relaxed), - // cnt, - // task.metadata() - // ); output } else { println!( - "[{:?}] work stealing block on failed -- num tasks {:?} max tasks {:?} task id{:?}", + "[{:?}] work stealing block on failed -- task id{:?}", std::thread::current().id(), - self.num_tasks.load(Ordering::Relaxed), - self.max_tasks.load(Ordering::Relaxed), task.metadata() ); panic!("task not ready"); } - }) } #[tracing::instrument(skip_all)] fn shutdown(&self) { - // println!("work stealing shuting down {:?}", self.active()); - self.active.store(FINISHED, Ordering::SeqCst); - // println!("work stealing shuting down {:?}",self.active()); - while self.panic.load(Ordering::SeqCst) == 0 - && (self.active_cnt.load(Ordering::Relaxed) > 0 //num active threads - || self.num_tasks.load(Ordering::Relaxed) > 2) + while self.panic.load(Ordering::SeqCst) == 0 && self.active_cnt.load(Ordering::Relaxed) > 0 { - //this should be the recvtask, and alloc_task + //num active threads + self.exec_task(); std::thread::yield_now() } - // println!( - // "work stealing shut down {:?} {:?} {:?}", - // self.active(), - // self.active_cnt.load(Ordering::Relaxed), - // self.active_cnt.load(Ordering::Relaxed) - // ); - } - - #[tracing::instrument(skip_all)] - fn shutdown_threads(&self) { - self.active.store(FINISHED, Ordering::SeqCst); } #[tracing::instrument(skip_all)] fn force_shutdown(&self) { - // println!("work stealing shuting down {:?}", self.active()); - self.active.store(PANIC, Ordering::SeqCst); - // println!("work stealing shuting down {:?}",self.active()); + // println!("work stealing shuting down {:?}", self.status()); + + // println!("work stealing shuting down {:?}",self.status()); let my_id = std::thread::current().id(); if self.threads.iter().any(|e| e.thread().id() == my_id) { - // while self.active_cnt.load(Ordering::Relaxed) > 1 {//num active threads -- wait for all but myself - // std::thread::yield_now() - // } self.active_cnt.fetch_sub(1, Ordering::SeqCst); // I paniced so I wont actually decrement } else { 
while self.active_cnt.load(Ordering::Relaxed) > 0 { @@ -505,7 +215,7 @@ impl AmeSchedulerQueue for WorkStealingInner { } // println!( // "work stealing shut down {:?} {:?} {:?}", - // self.active(), + // self.status(), // self.active_cnt.load(Ordering::Relaxed), // self.active_cnt.load(Ordering::Relaxed) // ); @@ -529,141 +239,46 @@ impl AmeSchedulerQueue for WorkStealingInner { } else { self.work_stealers[t.sample(&mut rng)].steal().success() } - }; if let Some(runnable) = ret { - // if LAST_PRINTED_TASKS.load(Ordering::Relaxed) != self.num_tasks.load(Ordering::Relaxed) { - // LAST_PRINTED_TASKS.store(self.num_tasks.load(Ordering::Relaxed), Ordering::Relaxed); - // let work_stealers_lens = self.work_stealers.iter().map(|x| x.len()).collect::>(); - // // println!("[{:?}] (exec_task) Executing task {:?}, num_tasks: {:?} {:?} {:?} {work_stealers_lens:?} {:?}", std::thread::current().id(), runnable.metadata(),self.num_tasks.load(Ordering::Relaxed), self.imm_inj.len(),self.work_inj.len(), OUTSTANDING_REQS.lock()); - // } - // *OUTSTANDING_REQS.lock().entry(*runnable.metadata()).or_insert(0) += 1; runnable.run(); } } - #[tracing::instrument(skip_all)] - fn active(&self) -> bool { - // println!("sched active {:?} {:?}",self.active.load(Ordering::SeqCst) , self.num_tasks.load(Ordering::SeqCst)); - self.active.load(Ordering::SeqCst) == ACTIVE || self.num_tasks.load(Ordering::SeqCst) > 3 - } -} - -impl SchedulerQueue for Arc { - fn submit_am( - //unserialized request - &self, - am: Am, - ) { - self.inner.submit_am(self.clone(), self.ame.clone(), am); - } - - fn submit_am_immediate( - //unserialized request - &self, - am: Am, - ) { - self.inner.submit_am_immediate(self.clone(), self.ame.clone(), am); + fn set_max_workers(&mut self, num_workers: usize) { + self.max_num_threads = num_workers; } - // fn submit_return(&self, src, pe) - - fn submit_work(&self, data: SerializedData, lamellae: Arc) { - self.inner - .submit_work(self.clone(), self.ame.clone(), data, lamellae); - } - - fn submit_task(&self, future: F) - where - F: Future + Send + 'static, - { - self.inner.submit_task(future); - } - - fn submit_immediate_task(&self, future: F) - where - F: Future + Send + 'static, - { - self.inner.submit_immediate_task(future); - } - - fn submit_immediate_task2(&self, future: F) - where - F: Future + Send+ 'static, - { - self.inner.submit_immediate_task2(future); - } - - fn exec_task(&self) { - self.inner.exec_task(); - std::thread::yield_now(); - } - - fn submit_task_node(&self, future: F, _node: usize) - where - F: Future + Send + 'static, - { - self.inner.submit_task(future); - } - - fn block_on(&self, future: F) -> F::Output - where - F: Future, - { - self.inner.block_on(future) - } - - fn shutdown(&self) { - self.inner.shutdown(); - } - - fn shutdown_threads(&self) { - self.inner.shutdown_threads(); - } - - fn force_shutdown(&self) { - self.inner.force_shutdown(); - } - fn active(&self) -> bool { - self.inner.active() - } fn num_workers(&self) -> usize { self.max_num_threads } } -impl WorkStealingInner { - #[tracing::instrument(skip_all)] +impl WorkStealing { pub(crate) fn new( - stall_mark: Arc, num_workers: usize, + status: Arc, panic: Arc, - my_pe: usize, - ) -> WorkStealingInner { + ) -> WorkStealing { // println!("new work stealing queue"); - - let mut sched = WorkStealingInner { + let mut ws = WorkStealing { + max_num_threads: num_workers, threads: Vec::new(), imm_inj: Arc::new(crossbeam::deque::Injector::new()), work_inj: Arc::new(crossbeam::deque::Injector::new()), work_stealers: Vec::new(), 
work_flag: Arc::new(AtomicU8::new(0)), - active: Arc::new(AtomicU8::new(ACTIVE)), + status: status, active_cnt: Arc::new(AtomicUsize::new(0)), - num_tasks: Arc::new(AtomicUsize::new(0)), - max_tasks: Arc::new(AtomicUsize::new(0)), - stall_mark: stall_mark, panic: panic, }; - sched.init(num_workers, my_pe); - sched + ws.init(); + ws } - #[tracing::instrument(skip_all)] - fn init(&mut self, num_workers: usize, my_pe: usize) { - let mut work_workers: std::vec::Vec>> = - vec![]; - for _i in 0..num_workers { + fn init(&mut self) { + let mut work_workers: std::vec::Vec>> = vec![]; + for _i in 0..self.max_num_threads { let work_worker: crossbeam::deque::Worker> = crossbeam::deque::Worker::new_fifo(); self.work_stealers.push(work_worker.stealer()); @@ -683,7 +298,7 @@ impl WorkStealingInner { } }; // println!("core_ids: {:?}",core_ids); - for i in 0..num_workers { + for i in 0..self.max_num_threads { let work_worker = work_workers.pop().unwrap(); let worker = WorkStealingThread { imm_inj: self.imm_inj.clone(), @@ -691,17 +306,14 @@ impl WorkStealingInner { work_stealers: self.work_stealers.clone(), work_q: work_worker, work_flag: self.work_flag.clone(), - active: self.active.clone(), + status: self.status.clone(), panic: self.panic.clone(), - // num_tasks: self.num_tasks.clone(), }; self.threads.push(WorkStealingThread::run( worker, self.active_cnt.clone(), - self.num_tasks.clone(), - self.max_tasks.clone(), + // self.num_tasks.clone(), core_ids[i % core_ids.len()], - my_pe, )); } while self.active_cnt.load(Ordering::SeqCst) != self.threads.len() { @@ -710,54 +322,7 @@ impl WorkStealingInner { } } -#[derive(Debug)] -pub(crate) struct WorkStealing { - inner: Arc, - ame: Arc, - max_num_threads: usize, //including the main thread -} -impl WorkStealing { - #[tracing::instrument(skip_all)] - pub(crate) fn new( - num_pes: usize, - num_workers: usize, - panic: Arc, - my_pe: usize, - // teams: Arc>>>, - ) -> WorkStealing { - // println!("new work stealing queue"); - let stall_mark = Arc::new(AtomicUsize::new(0)); - let inner = Arc::new(AmeScheduler::WorkStealingInner(WorkStealingInner::new( - stall_mark.clone(), - num_workers, - panic.clone(), - my_pe, - ))); - - let batcher = match std::env::var("LAMELLAR_BATCHER") { - Ok(n) => { - let n = n.parse::().unwrap(); - if n == 1 { - BatcherType::Simple(SimpleBatcher::new(num_pes, stall_mark.clone())) - } else { - BatcherType::TeamAm(TeamAmBatcher::new(num_pes, stall_mark.clone())) - } - } - Err(_) => BatcherType::TeamAm(TeamAmBatcher::new(num_pes, stall_mark.clone())), - }; - - let sched = WorkStealing { - inner: inner.clone(), - ame: Arc::new(ActiveMessageEngineType::RegisteredActiveMessages( - Arc::new(RegisteredActiveMessages::new(batcher)), - )), - max_num_threads: num_workers, - }; - sched - } -} - -impl Drop for WorkStealingInner { +impl Drop for WorkStealing { //when is this called with respect to world? #[tracing::instrument(skip_all)] fn drop(&mut self) { diff --git a/tests/array/arithmetic_ops/add_test.rs b/tests/array/arithmetic_ops/add_test.rs index 42f3e53a..5bf47967 100644 --- a/tests/array/arithmetic_ops/add_test.rs +++ b/tests/array/arithmetic_ops/add_test.rs @@ -438,7 +438,7 @@ macro_rules! 
input_test{ // array.add(input_array.clone(),1); // check_results!($array,array,num_pes,"LocalLockArray"); // LocalLockArray------------------------------ - array.batch_add(&world.block_on(input_array.read_local_data()),1); + array.batch_add(&input_array.blocking_read_local_data(),1); check_results!($array,array,num_pes,"&LocalLockArray"); println!("passed &LocalLockArray"); @@ -447,7 +447,7 @@ macro_rules! input_test{ // array.add(input_array.clone(),1); // check_results!($array,array,num_pes,"GlobalLockArray"); // GlobalLockArray------------------------------ - array.batch_add(&world.block_on(input_array.read_local_data()),1); + array.batch_add(&input_array.blocking_read_local_data(),1); check_results!($array,array,num_pes,"&GlobalLockArray"); println!("passed &GlobalLockArray"); } diff --git a/tests/array/arithmetic_ops/fetch_add_test.rs b/tests/array/arithmetic_ops/fetch_add_test.rs index 4ab2b23e..6c91e0fc 100644 --- a/tests/array/arithmetic_ops/fetch_add_test.rs +++ b/tests/array/arithmetic_ops/fetch_add_test.rs @@ -539,7 +539,7 @@ macro_rules! input_test{ // check_results!($array,array,num_pes,reqs,"LocalLockArray"); // LocalLockArray------------------------------ let mut reqs = vec![]; - reqs.push(array.batch_fetch_add(&world.block_on(input_array.read_local_data()),1)); + reqs.push(array.batch_fetch_add(&input_array.blocking_read_local_data(),1)); check_results!($array,array,num_pes,reqs,"&LocalLockArray"); // GlobalLockArray------------------------------ @@ -549,7 +549,7 @@ macro_rules! input_test{ // check_results!($array,array,num_pes,reqs,"GlobalLockArray"); // GlobalLockArray------------------------------ let mut reqs = vec![]; - reqs.push(array.batch_fetch_add(&world.block_on(input_array.read_local_data()),1)); + reqs.push(array.batch_fetch_add(&input_array.blocking_read_local_data(),1)); check_results!($array,array,num_pes,reqs,"&GlobalLockArray"); } } From d11c04cec80c68a0fe04de0086644407a8c98d35 Mon Sep 17 00:00:00 2001 From: "ryan.friese@pnnl.gov" Date: Thu, 8 Feb 2024 10:48:02 -0800 Subject: [PATCH 005/116] refactoring internal block_on calls to async calls --- impl/src/array_ops.rs | 18 +- impl/src/array_reduce.rs | 2 +- src/array.rs | 68 +++++++ src/array/atomic.rs | 22 ++- src/array/generic_atomic.rs | 30 ++- src/array/generic_atomic/iteration.rs | 12 +- src/array/global_lock_atomic.rs | 24 ++- src/array/global_lock_atomic/iteration.rs | 12 +- src/array/iterator/distributed_iterator.rs | 182 +++++++++--------- .../distributed_iterator/consumer/collect.rs | 35 ++-- src/array/iterator/local_iterator.rs | 10 +- .../local_iterator/consumer/collect.rs | 24 ++- src/array/local_lock_atomic.rs | 24 ++- src/array/local_lock_atomic/iteration.rs | 12 +- src/array/local_only.rs | 11 ++ src/array/native_atomic.rs | 26 ++- src/array/native_atomic/iteration.rs | 12 +- src/array/read_only.rs | 22 ++- src/array/read_only/iteration.rs | 12 +- src/array/unsafe.rs | 168 +++++++++++++++- src/array/unsafe/iteration/distributed.rs | 10 +- src/array/unsafe/iteration/local.rs | 6 +- src/lamellar_task_group.rs | 21 ++ src/scheduler.rs | 10 +- src/scheduler/tokio.rs | 88 --------- 25 files changed, 586 insertions(+), 275 deletions(-) delete mode 100644 src/scheduler/tokio.rs diff --git a/impl/src/array_ops.rs b/impl/src/array_ops.rs index 56a67524..96bd93ec 100644 --- a/impl/src/array_ops.rs +++ b/impl/src/array_ops.rs @@ -897,7 +897,7 @@ fn create_buf_ops( #[allow(non_snake_case)] fn #dist_multi_val_multi_idx_am_buf_name(array: #lamellar::array::LamellarByteArray, op: 
#lamellar::array::ArrayOpCmd>, idx_vals: Vec, index_size: u8) -> Arc{ Arc::new(#multi_val_multi_idx_am_buf_name{ - data: array.into(), + data: Into::into(array), op: op.into(), idx_vals: idx_vals, index_size: index_size, @@ -964,7 +964,7 @@ fn create_buf_ops( let val_slice = unsafe {std::slice::from_raw_parts(val.as_ptr() as *const #typeident, std::mem::size_of::<#typeident>())}; let val = val_slice[0]; Arc::new(#single_val_multi_idx_am_buf_name{ - data: array.into(), + data: Into::into(array), op: op.into(), val: val, indices: indicies, @@ -1001,7 +1001,7 @@ fn create_buf_ops( #[allow(non_snake_case)] fn #dist_multi_val_single_idx_am_buf_name(array: #lamellar::array::LamellarByteArray, op: #lamellar::array::ArrayOpCmd>, vals: Vec, index: usize) -> Arc{ Arc::new(#multi_val_single_idx_am_buf_name{ - data: array.into(), + data: Into::into(array), op: op.into(), vals: vals, index: index, @@ -1070,7 +1070,7 @@ fn create_buf_ops( #[allow(non_snake_case)] fn #dist_multi_val_multi_idx_am_buf_result_name(array: #lamellar::array::LamellarByteArray, op: #lamellar::array::ArrayOpCmd>, idx_vals: Vec, index_size: u8) -> Arc{ Arc::new(#multi_val_multi_idx_am_buf_result_name{ - data: array.into(), + data: Into::into(array), op: op.into(), idx_vals: idx_vals, index_size: index_size, @@ -1139,7 +1139,7 @@ fn create_buf_ops( let val_slice = unsafe {std::slice::from_raw_parts(val.as_ptr() as *const #typeident, std::mem::size_of::<#typeident>())}; let val = val_slice[0]; Arc::new(#single_val_multi_idx_am_buf_result_name{ - data: array.into(), + data: Into::into(array), op: op.into(), val: val, indices: indicies, @@ -1178,7 +1178,7 @@ fn create_buf_ops( #[allow(non_snake_case)] fn #dist_multi_val_single_idx_am_buf_result_name(array: #lamellar::array::LamellarByteArray, op: #lamellar::array::ArrayOpCmd>, vals: Vec, index: usize) -> Arc{ Arc::new(#multi_val_single_idx_am_buf_result_name{ - data: array.into(), + data: Into::into(array), op: op.into(), vals: vals, index: index, @@ -1251,7 +1251,7 @@ fn create_buf_ops( #[allow(non_snake_case)] fn #dist_multi_val_multi_idx_am_buf_fetch_name(array: #lamellar::array::LamellarByteArray, op: #lamellar::array::ArrayOpCmd>, idx_vals: Vec,index_usize: u8) -> Arc{ Arc::new(#multi_val_multi_idx_am_buf_fetch_name{ - data: array.into(), + data: Into::into(array), op: op.into(), idx_vals: idx_vals, index_size: index_usize, @@ -1320,7 +1320,7 @@ fn create_buf_ops( let val_slice = unsafe {std::slice::from_raw_parts(val.as_ptr() as *const #typeident, std::mem::size_of::<#typeident>())}; let val = val_slice[0]; Arc::new(#single_val_multi_idx_am_buf_fetch_name{ - data: array.into(), + data: Into::into(array), op: op.into(), val: val, indices: indicies, @@ -1363,7 +1363,7 @@ fn create_buf_ops( #[allow(non_snake_case)] fn #dist_multi_val_single_idx_am_buf_fetch_name(array: #lamellar::array::LamellarByteArray, op: #lamellar::array::ArrayOpCmd>, vals: Vec, index: usize) -> Arc{ Arc::new(#multi_val_single_idx_am_buf_fetch_name{ - data: array.into(), + data: Into::into(array), op: op.into(), vals: vals, index: index, diff --git a/impl/src/array_reduce.rs b/impl/src/array_reduce.rs index d059f96f..ee7629e6 100644 --- a/impl/src/array_reduce.rs +++ b/impl/src/array_reduce.rs @@ -49,7 +49,7 @@ fn create_reduction( gen_match_stmts.extend(quote!{ #lamellar::array::LamellarByteArray::#array_type(inner) => std::sync::Arc::new(#reduction_name{ - data: unsafe {inner.clone().into()} , start_pe: 0, end_pe: num_pes-1}), + data: unsafe {Into::into(inner.clone())} , start_pe: 0, end_pe: num_pes-1}), 
}); let iter_chain = if array_type == "AtomicArray" diff --git a/src/array.rs b/src/array.rs index 5fe5eed4..386a2aaa 100644 --- a/src/array.rs +++ b/src/array.rs @@ -503,12 +503,63 @@ impl TeamTryFrom<(&Vec, Distribution)> for Vec { } } +#[async_trait] +/// Provides the same abstraction as the `From` trait in the standard language, but with a `team` parameter so that lamellar memory regions can be allocated +/// and to be used within an async context +pub(crate) trait AsyncInto: Sized { + async fn async_into(self) -> T; +} + +#[async_trait] +/// Provides the same abstraction as the `From` trait in the standard language, but with a `team` parameter so that lamellar memory regions can be allocated +/// and to be used within an async context +pub(crate) trait AsyncFrom: Sized { + async fn async_from(val: T) -> Self; +} + +// AsyncFrom implies AsyncInto +#[async_trait] +impl AsyncInto for T +where + T: Send, + U: AsyncFrom, +{ + /// Calls `U::from(self).await`. + /// + /// That is, this conversion is whatever the implementation of + /// [AsyncFrom]<T> for U chooses to do. + #[inline] + async fn async_into(self) -> U { + U::async_from(self).await + } +} + +// AsyncFrom (and thus Into) is reflexive +// #[async_trait] +// impl AsyncFrom for T +// where +// T: Send, +// { +// /// Returns the argument unchanged. +// #[inline(always)] +// async fn async_from(t: T) -> T { +// t +// } +// } + /// Provides the same abstraction as the `From` trait in the standard language, but with a `team` parameter so that lamellar memory regions can be allocated pub trait TeamFrom { /// Converts to this type from the input type fn team_from(val: T, team: &Pin>) -> Self; } +#[async_trait] +/// Provides the same abstraction as the `From` trait in the standard language, but with a `team` parameter so that lamellar memory regions can be allocated +/// and to be used within an async context +pub trait AsyncTeamFrom: TeamFrom { + async fn team_from(val: T, team: &Pin>) -> Self; +} + /// Provides the same abstraction as the `TryFrom` trait in the standard language, but with a `team` parameter so that lamellar memory regions can be allocated pub trait TeamTryFrom { /// Trys to convert to this type from the input type @@ -522,6 +573,13 @@ pub trait TeamInto { fn team_into(self, team: &Pin>) -> T; } +/// Provides the same abstraction as the `Into` trait in the standard language, but with a `team` parameter so that lamellar memory regions can be allocated to be used within an async context +#[async_trait] +pub trait AsyncTeamInto { + /// converts this type into the (usually inferred) input type + async fn team_into(self, team: &Pin>) -> T; +} + /// Provides the same abstraction as the `TryInto` trait in the standard language, but with a `team` parameter so that lamellar memory regions can be allocated pub trait TeamTryInto: Sized { @@ -538,6 +596,16 @@ where } } +#[async_trait] +impl AsyncTeamInto for T +where + U: AsyncTeamFrom, +{ + async fn team_into(self, team: &Pin>) -> U { + >::team_from(self, team).await + } +} + impl TeamTryInto for T where U: TeamTryFrom, diff --git a/src/array/atomic.rs b/src/array/atomic.rs index 896876b0..3d1f863f 100644 --- a/src/array/atomic.rs +++ b/src/array/atomic.rs @@ -1079,11 +1079,19 @@ impl TeamFrom<(Vec, Distribution)> for AtomicArray { fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { let (vals, distribution) = input; let input = (&vals, distribution); - let array: UnsafeArray = input.team_into(team); + let array: UnsafeArray = TeamInto::team_into(input, team); 
array.into() } } +#[async_trait] +impl AsyncTeamFrom<(Vec, Distribution)> for AtomicArray { + async fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { + let array: UnsafeArray = AsyncTeamInto::team_into(input, team).await; + array.async_into().await + } +} + impl From> for AtomicArray { fn from(array: UnsafeArray) -> Self { // println!("Converting from UnsafeArray to AtomicArray"); @@ -1095,6 +1103,18 @@ impl From> for AtomicArray { } } +#[async_trait] +impl AsyncFrom> for AtomicArray { + async fn async_from(array: UnsafeArray) -> Self { + // println!("Converting from UnsafeArray to AtomicArray"); + if NATIVE_ATOMICS.contains(&TypeId::of::()) { + NativeAtomicArray::async_from(array).await.into() + } else { + GenericAtomicArray::async_from(array).await.into() + } + } +} + // impl From> for AtomicArray { // fn from(array: LocalOnlyArray) -> Self { // // println!("Converting from LocalOnlyArray to AtomicArray"); diff --git a/src/array/generic_atomic.rs b/src/array/generic_atomic.rs index 382059a4..e051719b 100644 --- a/src/array/generic_atomic.rs +++ b/src/array/generic_atomic.rs @@ -580,11 +580,19 @@ impl TeamFrom<(Vec, Distribution)> for GenericAtomicArray fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { let (vals, distribution) = input; let input = (&vals, distribution); - let array: UnsafeArray = input.team_into(team); + let array: UnsafeArray = TeamInto::team_into(input, team); array.into() } } +#[async_trait] +impl AsyncTeamFrom<(Vec, Distribution)> for GenericAtomicArray { + async fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { + let array: UnsafeArray = AsyncTeamInto::team_into(input, team).await; + array.async_into().await + } +} + impl From> for GenericAtomicArray { fn from(array: UnsafeArray) -> Self { // println!("generic from unsafe array"); @@ -602,6 +610,26 @@ impl From> for GenericAtomicArray { } } +#[async_trait] +impl AsyncFrom> for GenericAtomicArray { + async fn async_from(array: UnsafeArray) -> Self { + // println!("generic from unsafe array"); + array + .await_on_outstanding(DarcMode::GenericAtomicArray) + .await; + let mut vec = vec![]; + for _i in 0..array.num_elems_local() { + vec.push(Mutex::new(())); + } + let locks = Darc::new(array.team_rt(), vec).unwrap(); + + GenericAtomicArray { + locks: locks, + array: array, + } + } +} + impl From> for GenericAtomicByteArray { fn from(array: GenericAtomicArray) -> Self { GenericAtomicByteArray { diff --git a/src/array/generic_atomic/iteration.rs b/src/array/generic_atomic/iteration.rs index 6f5bfbe1..73980420 100644 --- a/src/array/generic_atomic/iteration.rs +++ b/src/array/generic_atomic/iteration.rs @@ -297,7 +297,7 @@ impl DistIteratorLauncher for GenericAtomicArray { where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect(&self.array, iter, d) } @@ -311,7 +311,7 @@ impl DistIteratorLauncher for GenericAtomicArray { where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) } @@ -324,7 +324,7 @@ impl DistIteratorLauncher for GenericAtomicArray { I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> 
+ SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect_async(&self.array, iter, d) } @@ -339,7 +339,7 @@ impl DistIteratorLauncher for GenericAtomicArray { I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect_async_with_schedule(&self.array, sched, iter, d) } @@ -485,7 +485,7 @@ impl LocalIteratorLauncher for GenericAtomicArray { where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { LocalIteratorLauncher::collect(&self.array, iter, d) } @@ -499,7 +499,7 @@ impl LocalIteratorLauncher for GenericAtomicArray { where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { LocalIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) } diff --git a/src/array/global_lock_atomic.rs b/src/array/global_lock_atomic.rs index 6b9ff9ef..48bf357b 100644 --- a/src/array/global_lock_atomic.rs +++ b/src/array/global_lock_atomic.rs @@ -664,11 +664,19 @@ impl TeamFrom<(Vec, Distribution)> for GlobalLockArray fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { let (vals, distribution) = input; let input = (&vals, distribution); - let array: UnsafeArray = input.team_into(team); + let array: UnsafeArray = TeamInto::team_into(input, team); array.into() } } +#[async_trait] +impl AsyncTeamFrom<(Vec, Distribution)> for GlobalLockArray { + async fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { + let array: UnsafeArray = AsyncTeamInto::team_into(input, team).await; + array.async_into().await + } +} + impl From> for GlobalLockArray { fn from(array: UnsafeArray) -> Self { // println!("GlobalLock from unsafe"); @@ -682,6 +690,20 @@ impl From> for GlobalLockArray { } } +#[async_trait] +impl AsyncFrom> for GlobalLockArray { + async fn async_from(array: UnsafeArray) -> Self { + // println!("GlobalLock from unsafe"); + array.await_on_outstanding(DarcMode::GlobalLockArray).await; + let lock = GlobalRwDarc::new(array.team_rt(), ()).unwrap(); + + GlobalLockArray { + lock: lock, + array: array, + } + } +} + // impl From> for GlobalLockArray { // fn from(array: LocalOnlyArray) -> Self { // // println!("GlobalLock from localonly"); diff --git a/src/array/global_lock_atomic/iteration.rs b/src/array/global_lock_atomic/iteration.rs index 70c4db61..37d5d25a 100644 --- a/src/array/global_lock_atomic/iteration.rs +++ b/src/array/global_lock_atomic/iteration.rs @@ -453,7 +453,7 @@ impl DistIteratorLauncher for GlobalLockArray { where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect(&self.array, iter, d) } @@ -467,7 +467,7 @@ impl DistIteratorLauncher for GlobalLockArray { where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect_with_schedule(&self.array, 
sched, iter, d) } @@ -480,7 +480,7 @@ impl DistIteratorLauncher for GlobalLockArray { I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect_async(&self.array, iter, d) } @@ -495,7 +495,7 @@ impl DistIteratorLauncher for GlobalLockArray { I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect_async_with_schedule(&self.array, sched, iter, d) } @@ -641,7 +641,7 @@ impl LocalIteratorLauncher for GlobalLockArray { where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { LocalIteratorLauncher::collect(&self.array, iter, d) } @@ -655,7 +655,7 @@ impl LocalIteratorLauncher for GlobalLockArray { where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { LocalIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) } diff --git a/src/array/iterator/distributed_iterator.rs b/src/array/iterator/distributed_iterator.rs index 4149510e..d41fdf1f 100644 --- a/src/array/iterator/distributed_iterator.rs +++ b/src/array/iterator/distributed_iterator.rs @@ -39,8 +39,8 @@ pub(crate) use consumer::*; use crate::array::iterator::one_sided_iterator::OneSidedIterator; use crate::array::iterator::{IterRequest, Schedule}; use crate::array::{ - operations::ArrayOps, AtomicArray, Distribution, GenericAtomicArray, LamellarArray, - LamellarArrayPut, NativeAtomicArray, TeamFrom, UnsafeArray, + operations::ArrayOps, AsyncTeamFrom, AtomicArray, Distribution, GenericAtomicArray, + LamellarArray, LamellarArrayPut, NativeAtomicArray, TeamFrom, UnsafeArray, }; use crate::lamellar_request::LamellarRequest; use crate::memregion::Dist; @@ -55,10 +55,10 @@ use std::marker::PhantomData; use std::pin::Pin; use std::sync::Arc; -#[doc(hidden)] -pub struct DistIterForEachHandle { - pub(crate) reqs: Vec>>, -} +// #[doc(hidden)] +// pub struct DistIterForEachHandle { +// pub(crate) reqs: Vec>>, +// } // impl Drop for DistIterForEachHandle { // fn drop(&mut self) { @@ -66,87 +66,87 @@ pub struct DistIterForEachHandle { // } // } -#[doc(hidden)] -#[async_trait] -impl IterRequest for DistIterForEachHandle { - type Output = (); - async fn into_future(mut self: Box) -> Self::Output { - for req in self.reqs.drain(..) { - req.into_future().await; - } - } - fn wait(mut self: Box) -> Self::Output { - for req in self.reqs.drain(..) { - req.get(); - } - } -} - -#[doc(hidden)] -pub struct DistIterCollectHandle> + SyncSend> { - pub(crate) reqs: Vec>>>, - pub(crate) distribution: Distribution, - pub(crate) team: Pin>, - pub(crate) _phantom: PhantomData, -} +// #[doc(hidden)] +// #[async_trait] +// impl IterRequest for DistIterForEachHandle { +// type Output = (); +// async fn into_future(mut self: Box) -> Self::Output { +// for req in self.reqs.drain(..) { +// req.into_future().await; +// } +// } +// fn wait(mut self: Box) -> Self::Output { +// for req in self.reqs.drain(..) 
{ +// req.get(); +// } +// } +// } -impl> + SyncSend> DistIterCollectHandle { - fn create_array(&self, local_vals: &Vec) -> A { - self.team.tasking_barrier(); - let local_sizes = - UnsafeArray::::new(self.team.clone(), self.team.num_pes, Distribution::Block); - unsafe { - local_sizes.local_as_mut_slice()[0] = local_vals.len(); - } - local_sizes.barrier(); - // local_sizes.print(); - let mut size = 0; - let mut my_start = 0; - let my_pe = self.team.team_pe.expect("pe not part of team"); - // local_sizes.print(); - unsafe { - local_sizes - .onesided_iter() - .into_iter() - .enumerate() - .for_each(|(i, local_size)| { - size += local_size; - if i < my_pe { - my_start += local_size; - } - }); - } - // println!("my_start {} size {}", my_start, size); - let array = UnsafeArray::::new(self.team.clone(), size, self.distribution); //implcit barrier +// #[doc(hidden)] +// pub struct DistIterCollectHandle> + SyncSend> { +// pub(crate) reqs: Vec>>>, +// pub(crate) distribution: Distribution, +// pub(crate) team: Pin>, +// pub(crate) _phantom: PhantomData, +// } - // safe because only a single reference to array on each PE - // we calculate my_start so that each pes local vals are guaranteed to not overwrite another pes values. - let _ = unsafe { array.put(my_start, local_vals) }; - array.into() - } -} -#[async_trait] -impl> + SyncSend> IterRequest - for DistIterCollectHandle -{ - type Output = A; - async fn into_future(mut self: Box) -> Self::Output { - let mut local_vals = vec![]; - for req in self.reqs.drain(0..) { - let v = req.into_future().await; - local_vals.extend(v); - } - self.create_array(&local_vals) - } - fn wait(mut self: Box) -> Self::Output { - let mut local_vals = vec![]; - for req in self.reqs.drain(0..) { - let v = req.get(); - local_vals.extend(v); - } - self.create_array(&local_vals) - } -} +// impl> + SyncSend> DistIterCollectHandle { +// fn create_array(&self, local_vals: &Vec) -> A { +// self.team.tasking_barrier(); +// let local_sizes = +// UnsafeArray::::new(self.team.clone(), self.team.num_pes, Distribution::Block); +// unsafe { +// local_sizes.local_as_mut_slice()[0] = local_vals.len(); +// } +// local_sizes.barrier(); +// // local_sizes.print(); +// let mut size = 0; +// let mut my_start = 0; +// let my_pe = self.team.team_pe.expect("pe not part of team"); +// // local_sizes.print(); +// unsafe { +// local_sizes +// .onesided_iter() +// .into_iter() +// .enumerate() +// .for_each(|(i, local_size)| { +// size += local_size; +// if i < my_pe { +// my_start += local_size; +// } +// }); +// } +// // println!("my_start {} size {}", my_start, size); +// let array = UnsafeArray::::new(self.team.clone(), size, self.distribution); //implcit barrier + +// // safe because only a single reference to array on each PE +// // we calculate my_start so that each pes local vals are guaranteed to not overwrite another pes values. +// unsafe { array.put(my_start, local_vals) }; +// array.into() +// } +// } +// #[async_trait] +// impl> + SyncSend> IterRequest +// for DistIterCollectHandle +// { +// type Output = A; +// async fn into_future(mut self: Box) -> Self::Output { +// let mut local_vals = vec![]; +// for req in self.reqs.drain(0..) { +// let v = req.into_future().await; +// local_vals.extend(v); +// } +// self.create_array(&local_vals) +// } +// fn wait(mut self: Box) -> Self::Output { +// let mut local_vals = vec![]; +// for req in self.reqs.drain(0..) 
{ +// let v = req.get(); +// local_vals.extend(v); +// } +// self.create_array(&local_vals) +// } +// } #[doc(hidden)] #[enum_dispatch] @@ -212,7 +212,7 @@ pub trait DistIteratorLauncher { where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static; + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static; fn collect_with_schedule( &self, @@ -223,7 +223,7 @@ pub trait DistIteratorLauncher { where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static; + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static; fn collect_async( &self, @@ -234,7 +234,7 @@ pub trait DistIteratorLauncher { I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static; + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static; fn collect_async_with_schedule( &self, @@ -246,7 +246,7 @@ pub trait DistIteratorLauncher { I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static; + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static; fn count(&self, iter: &I) -> Pin + Send>> where @@ -670,7 +670,7 @@ pub trait DistributedIterator: SyncSend + Clone + 'static { where // &'static Self: DistributedIterator + 'static, Self::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { self.array().collect(self, d) } @@ -716,7 +716,7 @@ pub trait DistributedIterator: SyncSend + Clone + 'static { // &'static Self: DistributedIterator + 'static, T: Dist + ArrayOps, Self::Item: Future + Send + 'static, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { self.array().collect_async(self, d) } diff --git a/src/array/iterator/distributed_iterator/consumer/collect.rs b/src/array/iterator/distributed_iterator/consumer/collect.rs index a9ec30b4..31486893 100644 --- a/src/array/iterator/distributed_iterator/consumer/collect.rs +++ b/src/array/iterator/distributed_iterator/consumer/collect.rs @@ -3,7 +3,7 @@ use crate::array::iterator::consumer::*; use crate::array::iterator::distributed_iterator::{DistributedIterator, Monotonic}; use crate::array::iterator::IterRequest; use crate::array::operations::ArrayOps; -use crate::array::{Distribution, TeamFrom, TeamInto}; +use crate::array::{AsyncTeamFrom, AsyncTeamInto, Distribution, TeamFrom, TeamInto}; use crate::lamellar_request::LamellarRequest; use crate::lamellar_team::LamellarTeamRT; use crate::memregion::Dist; @@ -25,7 +25,7 @@ impl IterConsumer for Collect where I: DistributedIterator, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { type AmOutput = Vec<(usize, I::Item)>; type Output = A; @@ -75,7 +75,7 @@ where I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { type AmOutput = Vec<(usize, B)>; type Output = A; @@ -118,7 +118,7 @@ where I: DistributedIterator, I::Item: Future + Send + 
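Note: in the reworked `DistIterCollectHandle` above, each request now yields `(original_index, value)` pairs (produced by `monotonic()`), and the handle sorts by index before building the result array. A tiny standalone illustration of that reassembly step (the function name is made up for illustration):

```rust
// Sort tagged results back into iteration order, then drop the tags,
// mirroring the `temp_vals.sort_by(...)` / `map(|v| v.1)` step above.
fn reassemble<T>(mut tagged: Vec<(usize, T)>) -> Vec<T> {
    tagged.sort_by(|a, b| a.0.cmp(&b.0));
    tagged.into_iter().map(|(_, v)| v).collect()
}

fn main() {
    let out = reassemble(vec![(2, "c"), (0, "a"), (1, "b")]);
    assert_eq!(out, vec!["a", "b", "c"]);
}
```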
'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { fn clone(&self) -> Self { CollectAsync { @@ -132,7 +132,7 @@ where #[doc(hidden)] pub struct DistIterCollectHandle< T: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend, > { pub(crate) reqs: Vec>>>, pub(crate) distribution: Distribution, @@ -140,16 +140,23 @@ pub struct DistIterCollectHandle< pub(crate) _phantom: PhantomData, } -impl TeamFrom<(&'a Vec, Distribution)> + SyncSend> +impl, Distribution)> + SyncSend> DistIterCollectHandle { - fn create_array(&self, local_vals: &Vec) -> A { + async fn async_create_array(&self, local_vals: Vec) -> A { let input = (local_vals, self.distribution); - input.team_into(&self.team) + let array: A = AsyncTeamInto::team_into(input, &self.team).await; + array + } + + fn create_array(&self, local_vals: Vec) -> A { + let input = (local_vals, self.distribution); + let array: A = TeamInto::team_into(input, &self.team); + array } } #[async_trait] -impl TeamFrom<(&'a Vec, Distribution)> + SyncSend> IterRequest +impl, Distribution)> + SyncSend> IterRequest for DistIterCollectHandle { type Output = A; @@ -161,7 +168,7 @@ impl TeamFrom<(&'a Vec, Distribution)> + SyncS } temp_vals.sort_by(|a, b| a.0.cmp(&b.0)); let local_vals = temp_vals.into_iter().map(|v| v.1).collect::>(); - self.create_array(&local_vals) + self.async_create_array(local_vals).await } fn wait(mut self: Box) -> Self::Output { // let mut num_local_vals = 0; @@ -172,7 +179,7 @@ impl TeamFrom<(&'a Vec, Distribution)> + SyncS } temp_vals.sort_by(|a, b| a.0.cmp(&b.0)); let local_vals = temp_vals.into_iter().map(|v| v.1).collect(); - self.create_array(&local_vals) + self.create_array(local_vals) } } @@ -187,7 +194,7 @@ impl LamellarAm for CollectAm where I: DistributedIterator, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { async fn exec(&self) -> Vec { let iter = self.schedule.init_iter(self.iter.clone()); @@ -201,7 +208,7 @@ where I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { pub(crate) iter: CollectAsync, pub(crate) schedule: IterSchedule, @@ -213,7 +220,7 @@ where I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { async fn exec(&self) -> Vec<(usize, B)> { let mut iter = self.schedule.init_iter(self.iter.clone()); diff --git a/src/array/iterator/local_iterator.rs b/src/array/iterator/local_iterator.rs index 7b177843..ff857846 100644 --- a/src/array/iterator/local_iterator.rs +++ b/src/array/iterator/local_iterator.rs @@ -35,7 +35,7 @@ use zip::*; pub(crate) use consumer::*; use crate::array::iterator::Schedule; -use crate::array::{operations::ArrayOps, AtomicArray, Distribution, LamellarArray, TeamFrom}; +use crate::array::{operations::ArrayOps, AsyncTeamFrom, AtomicArray, Distribution, LamellarArray}; use crate::memregion::Dist; use crate::LamellarTeamRT; @@ -125,7 +125,7 @@ pub trait LocalIteratorLauncher { where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, - A: 
for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static; + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static; fn collect_with_schedule( &self, @@ -136,7 +136,7 @@ pub trait LocalIteratorLauncher { where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static; + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static; // fn collect_async( // &self, @@ -579,7 +579,7 @@ pub trait LocalIterator: SyncSend + Clone + 'static { where // &'static Self: LocalIterator + 'static, Self::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { self.array().collect(self, d) } @@ -607,7 +607,7 @@ pub trait LocalIterator: SyncSend + Clone + 'static { where // &'static Self: LocalIterator + 'static, Self::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { self.array().collect_with_schedule(sched, self, d) } diff --git a/src/array/iterator/local_iterator/consumer/collect.rs b/src/array/iterator/local_iterator/consumer/collect.rs index 0aabcade..df16f948 100644 --- a/src/array/iterator/local_iterator/consumer/collect.rs +++ b/src/array/iterator/local_iterator/consumer/collect.rs @@ -3,7 +3,7 @@ use crate::array::iterator::consumer::*; use crate::array::iterator::local_iterator::{LocalIterator, Monotonic}; use crate::array::iterator::IterRequest; use crate::array::operations::ArrayOps; -use crate::array::{Distribution, TeamFrom, TeamInto}; +use crate::array::{AsyncTeamFrom, AsyncTeamInto, Distribution, TeamFrom, TeamInto}; use crate::lamellar_request::LamellarRequest; use crate::lamellar_team::LamellarTeamRT; use crate::memregion::Dist; @@ -24,7 +24,7 @@ impl IterConsumer for Collect where I: LocalIterator, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { type AmOutput = Vec<(usize, I::Item)>; type Output = A; @@ -114,7 +114,7 @@ where #[doc(hidden)] pub struct LocalIterCollectHandle< T: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend, > { pub(crate) reqs: Vec>>>, pub(crate) distribution: Distribution, @@ -122,16 +122,20 @@ pub struct LocalIterCollectHandle< pub(crate) _phantom: PhantomData, } -impl TeamFrom<(&'a Vec, Distribution)> + SyncSend> +impl, Distribution)> + SyncSend> LocalIterCollectHandle { - fn create_array(&self, local_vals: &Vec) -> A { + async fn async_create_array(&self, local_vals: Vec) -> A { let input = (local_vals, self.distribution); - input.team_into(&self.team) + AsyncTeamInto::team_into(input, &self.team).await + } + fn create_array(&self, local_vals: Vec) -> A { + let input = (local_vals, self.distribution); + TeamInto::team_into(input, &self.team) } } #[async_trait] -impl TeamFrom<(&'a Vec, Distribution)> + SyncSend> IterRequest +impl, Distribution)> + SyncSend> IterRequest for LocalIterCollectHandle { type Output = A; @@ -143,7 +147,7 @@ impl TeamFrom<(&'a Vec, Distribution)> + SyncS } temp_vals.sort_by(|a, b| a.0.cmp(&b.0)); let local_vals = temp_vals.into_iter().map(|v| v.1).collect(); - self.create_array(&local_vals) + self.async_create_array(local_vals).await } fn wait(mut self: Box) -> Self::Output { // let mut num_local_vals = 0; 
@@ -154,7 +158,7 @@ impl TeamFrom<(&'a Vec, Distribution)> + SyncS } temp_vals.sort_by(|a, b| a.0.cmp(&b.0)); let local_vals = temp_vals.into_iter().map(|v| v.1).collect(); - self.create_array(&local_vals) + self.create_array(local_vals) } } @@ -169,7 +173,7 @@ impl LamellarAm for CollectAm where I: LocalIterator, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { async fn exec(&self) -> Vec { let iter = self.schedule.init_iter(self.iter.clone()); diff --git a/src/array/local_lock_atomic.rs b/src/array/local_lock_atomic.rs index d897e922..f6ce6559 100644 --- a/src/array/local_lock_atomic.rs +++ b/src/array/local_lock_atomic.rs @@ -578,11 +578,19 @@ impl TeamFrom<(Vec, Distribution)> for LocalLockArray fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { let (vals, distribution) = input; let input = (&vals, distribution); - let array: UnsafeArray = input.team_into(team); + let array: UnsafeArray = TeamInto::team_into(input, team); array.into() } } +#[async_trait] +impl AsyncTeamFrom<(Vec, Distribution)> for LocalLockArray { + async fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { + let array: UnsafeArray = AsyncTeamInto::team_into(input, team).await; + array.async_into().await + } +} + impl From> for LocalLockArray { fn from(array: UnsafeArray) -> Self { // println!("locallock from unsafe"); @@ -596,6 +604,20 @@ impl From> for LocalLockArray { } } +#[async_trait] +impl AsyncFrom> for LocalLockArray { + async fn async_from(array: UnsafeArray) -> Self { + // println!("locallock from unsafe"); + array.await_on_outstanding(DarcMode::LocalLockArray).await; + let lock = LocalRwDarc::new(array.team_rt(), ()).unwrap(); + + LocalLockArray { + lock: lock, + array: array, + } + } +} + // impl From> for LocalLockArray { // fn from(array: LocalOnlyArray) -> Self { // // println!("locallock from localonly"); diff --git a/src/array/local_lock_atomic/iteration.rs b/src/array/local_lock_atomic/iteration.rs index a1d4479c..53e26ed6 100644 --- a/src/array/local_lock_atomic/iteration.rs +++ b/src/array/local_lock_atomic/iteration.rs @@ -457,7 +457,7 @@ impl DistIteratorLauncher for LocalLockArray { where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect(&self.array, iter, d) } @@ -471,7 +471,7 @@ impl DistIteratorLauncher for LocalLockArray { where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) } @@ -484,7 +484,7 @@ impl DistIteratorLauncher for LocalLockArray { I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect_async(&self.array, iter, d) } @@ -499,7 +499,7 @@ impl DistIteratorLauncher for LocalLockArray { I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { 
DistIteratorLauncher::collect_async_with_schedule(&self.array, sched, iter, d) } @@ -645,7 +645,7 @@ impl LocalIteratorLauncher for LocalLockArray { where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { LocalIteratorLauncher::collect(&self.array, iter, d) } @@ -659,7 +659,7 @@ impl LocalIteratorLauncher for LocalLockArray { where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { LocalIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) } diff --git a/src/array/local_only.rs b/src/array/local_only.rs index 5b931ad2..258d7856 100644 --- a/src/array/local_only.rs +++ b/src/array/local_only.rs @@ -99,6 +99,17 @@ impl From> for LocalOnlyArray { } } +#[async_trait] +impl AsyncFrom> for LocalOnlyArray { + async fn async_from(array: UnsafeArray) -> Self { + array.await_on_outstanding(DarcMode::LocalOnlyArray).await; + LocalOnlyArray { + array: array, + _unsync: PhantomData, + } + } +} + impl From> for LocalOnlyArray { fn from(array: ReadOnlyArray) -> Self { unsafe { array.into_inner().into() } diff --git a/src/array/native_atomic.rs b/src/array/native_atomic.rs index 590f9b48..9fc0e785 100644 --- a/src/array/native_atomic.rs +++ b/src/array/native_atomic.rs @@ -995,11 +995,19 @@ impl TeamFrom<(Vec, Distribution)> for NativeAtomicArray< fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { let (vals, distribution) = input; let input = (&vals, distribution); - let array: UnsafeArray = input.team_into(team); + let array: UnsafeArray = TeamInto::team_into(input, team); array.into() } } +#[async_trait] +impl AsyncTeamFrom<(Vec, Distribution)> for NativeAtomicArray { + async fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { + let array: UnsafeArray = AsyncTeamInto::team_into(input, team).await; + array.async_into().await + } +} + #[doc(hidden)] impl From> for NativeAtomicArray { fn from(array: UnsafeArray) -> Self { @@ -1013,6 +1021,22 @@ impl From> for NativeAtomicArray { } } +#[doc(hidden)] +#[async_trait] +impl AsyncFrom> for NativeAtomicArray { + async fn async_from(array: UnsafeArray) -> Self { + // println!("native from unsafe"); + array + .await_on_outstanding(DarcMode::NativeAtomicArray) + .await; + + NativeAtomicArray { + array: array, + orig_t: NativeAtomicType::from::(), + } + } +} + #[doc(hidden)] impl From> for NativeAtomicByteArray { fn from(array: NativeAtomicArray) -> Self { diff --git a/src/array/native_atomic/iteration.rs b/src/array/native_atomic/iteration.rs index b1775322..56caafb5 100644 --- a/src/array/native_atomic/iteration.rs +++ b/src/array/native_atomic/iteration.rs @@ -273,7 +273,7 @@ impl DistIteratorLauncher for NativeAtomicArray { where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect(&self.array, iter, d) } @@ -287,7 +287,7 @@ impl DistIteratorLauncher for NativeAtomicArray { where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect_with_schedule(&self.array, 
sched, iter, d) } @@ -300,7 +300,7 @@ impl DistIteratorLauncher for NativeAtomicArray { I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect_async(&self.array, iter, d) } @@ -315,7 +315,7 @@ impl DistIteratorLauncher for NativeAtomicArray { I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect_async_with_schedule(&self.array, sched, iter, d) } @@ -461,7 +461,7 @@ impl LocalIteratorLauncher for NativeAtomicArray { where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { LocalIteratorLauncher::collect(&self.array, iter, d) } @@ -475,7 +475,7 @@ impl LocalIteratorLauncher for NativeAtomicArray { where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { LocalIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) } diff --git a/src/array/read_only.rs b/src/array/read_only.rs index 942c2fad..b1fc8be5 100644 --- a/src/array/read_only.rs +++ b/src/array/read_only.rs @@ -385,14 +385,22 @@ impl TeamFrom<(Vec, Distribution)> for ReadOnlyArray { fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { let (vals, distribution) = input; let input = (&vals, distribution); - let array: UnsafeArray = input.team_into(team); + let array: UnsafeArray = TeamInto::team_into(input, team); array.into() } } +#[async_trait] +impl AsyncTeamFrom<(Vec, Distribution)> for ReadOnlyArray { + async fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { + let array: UnsafeArray = AsyncTeamInto::team_into(input, team).await; + array.async_into().await + } +} + impl TeamFrom<(&Vec, Distribution)> for ReadOnlyArray { fn team_from(input: (&Vec, Distribution), team: &Pin>) -> Self { - let array: UnsafeArray = input.team_into(team); + let array: UnsafeArray = TeamInto::team_into(input, team); array.into() } } @@ -406,6 +414,16 @@ impl From> for ReadOnlyArray { } } +#[async_trait] +impl AsyncFrom> for ReadOnlyArray { + async fn async_from(array: UnsafeArray) -> Self { + // println!("readonly from UnsafeArray"); + array.await_on_outstanding(DarcMode::ReadOnlyArray).await; + + ReadOnlyArray { array: array } + } +} + // impl From> for ReadOnlyArray { // fn from(array: LocalOnlyArray) -> Self { // // println!("readonly from LocalOnlyArray"); diff --git a/src/array/read_only/iteration.rs b/src/array/read_only/iteration.rs index e6f68976..af59f35a 100644 --- a/src/array/read_only/iteration.rs +++ b/src/array/read_only/iteration.rs @@ -116,7 +116,7 @@ impl DistIteratorLauncher for ReadOnlyArray { where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect(&self.array, iter, d) } @@ -130,7 +130,7 @@ impl DistIteratorLauncher for ReadOnlyArray { where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> 
TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) } @@ -143,7 +143,7 @@ impl DistIteratorLauncher for ReadOnlyArray { I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect_async(&self.array, iter, d) } @@ -158,7 +158,7 @@ impl DistIteratorLauncher for ReadOnlyArray { I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect_async_with_schedule(&self.array, sched, iter, d) } @@ -304,7 +304,7 @@ impl LocalIteratorLauncher for ReadOnlyArray { where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { LocalIteratorLauncher::collect(&self.array, iter, d) } @@ -318,7 +318,7 @@ impl LocalIteratorLauncher for ReadOnlyArray { where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { LocalIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) } diff --git a/src/array/unsafe.rs b/src/array/unsafe.rs index bb455826..d818e467 100644 --- a/src/array/unsafe.rs +++ b/src/array/unsafe.rs @@ -202,6 +202,76 @@ impl UnsafeArray { // println!("after buffered ops"); // array.inner.data.print(); } + + async fn async_new>( + team: U, + array_size: usize, + distribution: Distribution, + ) -> UnsafeArray { + let team = team.into().team.clone(); + team.async_barrier().await; + let task_group = LamellarTaskGroup::new(team.clone()); + let my_pe = team.team_pe_id().unwrap(); + let num_pes = team.num_pes(); + let full_array_size = std::cmp::max(array_size, num_pes); + + let elem_per_pe = full_array_size as f64 / num_pes as f64; + let per_pe_size = (full_array_size as f64 / num_pes as f64).ceil() as usize; //we do ceil to ensure enough space an each pe + // println!("new unsafe array {:?} {:?} {:?}", elem_per_pe, num_elems_local, per_pe_size); + let rmr = MemoryRegion::new( + per_pe_size * std::mem::size_of::(), + team.lamellae.clone(), + AllocationType::Global, + ); + unsafe { + for elem in rmr.as_mut_slice().expect("data should exist on pe") { + *elem = 0; + } + } + + let data = Darc::try_new_with_drop( + team.clone(), + UnsafeArrayData { + mem_region: rmr, + array_counters: Arc::new(AMCounters::new()), + team: team.clone(), + task_group: Arc::new(task_group), + my_pe: my_pe, + num_pes: num_pes, + req_cnt: Arc::new(AtomicUsize::new(0)), + }, + crate::darc::DarcMode::UnsafeArray, + None, + ) + .expect("trying to create array on non team member"); + let array = UnsafeArray { + inner: UnsafeArrayInner { + data: data, + distribution: distribution.clone(), + // wait: wait, + orig_elem_per_pe: elem_per_pe, + elem_size: std::mem::size_of::(), + offset: 0, //relative to size of T + size: full_array_size, //relative to size of T + }, + phantom: PhantomData, + }; + // println!("new unsafe"); + // unsafe {println!("size {:?} bytes {:?}",array.inner.size, 
array.inner.data.mem_region.as_mut_slice().unwrap().len())}; + // println!("elem per pe {:?}", elem_per_pe); + // for i in 0..num_pes{ + // println!("pe: {:?} {:?}",i,array.inner.num_elems_pe(i)); + // } + // array.inner.data.print(); + if full_array_size != array_size { + println!("WARNING: Array size {array_size} is less than number of pes {full_array_size}, each PE will not contain data"); + array.sub_array(0..array_size) + } else { + array + } + // println!("after buffered ops"); + // array.inner.data.print(); + } } impl UnsafeArray { #[doc(alias("One-sided", "onesided"))] @@ -363,6 +433,47 @@ impl UnsafeArray { self.inner.data.team.clone() } + pub(crate) async fn await_all(&self) { + let mut temp_now = Instant::now(); + // let mut first = true; + while self + .inner + .data + .array_counters + .outstanding_reqs + .load(Ordering::SeqCst) + > 0 + || self.inner.data.req_cnt.load(Ordering::SeqCst) > 0 + { + // std::thread::yield_now(); + // self.inner.data.team.flush(); + // self.inner.data.team.scheduler.exec_task(); //mmight as well do useful work while we wait + async_std::task::yield_now().await; + if temp_now.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { + //|| first{ + println!( + "in array await_all mype: {:?} cnt: {:?} {:?} {:?}", + self.inner.data.team.world_pe, + self.inner + .data + .array_counters + .send_req_cnt + .load(Ordering::SeqCst), + self.inner + .data + .array_counters + .outstanding_reqs + .load(Ordering::SeqCst), + self.inner.data.req_cnt.load(Ordering::SeqCst) + ); + temp_now = Instant::now(); + // first = false; + } + } + self.inner.data.task_group.await_all().await; + // println!("done in wait all {:?}",std::time::SystemTime::now()); + } + pub(crate) fn block_on_outstanding(&self, mode: DarcMode) { self.wait_all(); // println!("block on outstanding"); @@ -373,6 +484,15 @@ impl UnsafeArray { .block_on(array_darc.block_on_outstanding(mode, 1)); //one for this instance of the array } + pub(crate) async fn await_on_outstanding(&self, mode: DarcMode) { + self.await_all().await; + // println!("block on outstanding"); + // self.inner.data.print(); + // let the_array: UnsafeArray = self.clone(); + let array_darc = self.inner.data.clone(); + array_darc.block_on_outstanding(mode, 1).await; + } + #[doc(alias = "Collective")] /// Convert this UnsafeArray into a (safe) [ReadOnlyArray][crate::array::ReadOnlyArray] /// @@ -572,7 +692,44 @@ impl TeamFrom<(Vec, Distribution)> for UnsafeArray { fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { let (vals, distribution) = input; let input = (&vals, distribution); - input.team_into(team) + TeamInto::team_into(input, team) + } +} + +#[async_trait] +impl AsyncTeamFrom<(Vec, Distribution)> for UnsafeArray { + async fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { + let (local_vals, distribution) = input; + // println!("local_vals len: {:?}", local_vals.len()); + team.async_barrier().await; + let local_sizes = + UnsafeArray::::async_new(team.clone(), team.num_pes, Distribution::Block).await; + unsafe { + local_sizes.local_as_mut_slice()[0] = local_vals.len(); + } + team.async_barrier().await; + // local_sizes.barrier(); + let mut size = 0; + let mut my_start = 0; + let my_pe = team.team_pe.expect("pe not part of team"); + unsafe { + local_sizes + .buffered_onesided_iter(team.num_pes) + .into_iter() + .enumerate() + .for_each(|(i, local_size)| { + size += local_size; + if i < my_pe { + my_start += local_size; + } + }); + } + let array = UnsafeArray::::async_new(team.clone(), size, distribution).await; + 
if local_vals.len() > 0 { + unsafe { array.put(my_start, local_vals).await }; + } + team.async_barrier().await; + array } } @@ -613,8 +770,6 @@ impl TeamFrom<(&Vec, Distribution)> for UnsafeArray { impl From> for UnsafeArray { fn from(array: AtomicArray) -> Self { - // println!("unsafe from atomic"); - // array.into_unsafe() match array { AtomicArray::NativeAtomicArray(array) => UnsafeArray::::from(array), AtomicArray::GenericAtomicArray(array) => UnsafeArray::::from(array), @@ -624,8 +779,6 @@ impl From> for UnsafeArray { impl From> for UnsafeArray { fn from(array: NativeAtomicArray) -> Self { - // println!("unsafe from native atomic"); - // let array = array.into_data(); array.array.block_on_outstanding(DarcMode::UnsafeArray); array.array } @@ -633,8 +786,6 @@ impl From> for UnsafeArray { impl From> for UnsafeArray { fn from(array: GenericAtomicArray) -> Self { - // println!("unsafe from generic atomic"); - // let array = array.into_data(); array.array.block_on_outstanding(DarcMode::UnsafeArray); array.array } @@ -642,7 +793,6 @@ impl From> for UnsafeArray { impl From> for UnsafeArray { fn from(array: LocalLockArray) -> Self { - // println!("unsafe from local lock atomic"); array.array.block_on_outstanding(DarcMode::UnsafeArray); array.array } @@ -650,7 +800,6 @@ impl From> for UnsafeArray { impl From> for UnsafeArray { fn from(array: GlobalLockArray) -> Self { - // println!("unsafe from global lock atomic"); array.array.block_on_outstanding(DarcMode::UnsafeArray); array.array } @@ -658,7 +807,6 @@ impl From> for UnsafeArray { impl From> for UnsafeArray { fn from(array: ReadOnlyArray) -> Self { - // println!("unsafe from read only"); array.array.block_on_outstanding(DarcMode::UnsafeArray); array.array } diff --git a/src/array/unsafe/iteration/distributed.rs b/src/array/unsafe/iteration/distributed.rs index a677969a..4a9668a0 100644 --- a/src/array/unsafe/iteration/distributed.rs +++ b/src/array/unsafe/iteration/distributed.rs @@ -1,7 +1,7 @@ use crate::active_messaging::SyncSend; use crate::array::iterator::distributed_iterator::*; use crate::array::r#unsafe::UnsafeArray; -use crate::array::{ArrayOps, Distribution, LamellarArray, TeamFrom}; +use crate::array::{ArrayOps, AsyncTeamFrom, AsyncTeamInto, Distribution, LamellarArray, TeamFrom}; use crate::array::iterator::Schedule; use crate::lamellar_team::LamellarTeamRT; @@ -141,7 +141,7 @@ impl DistIteratorLauncher for UnsafeArray { where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { self.collect_with_schedule(Schedule::Static, iter, d) } @@ -155,7 +155,7 @@ impl DistIteratorLauncher for UnsafeArray { where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { let collect = Collect { iter: iter.clone().monotonic(), @@ -180,7 +180,7 @@ impl DistIteratorLauncher for UnsafeArray { I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { self.collect_async_with_schedule(Schedule::Static, iter, d) } @@ -195,7 +195,7 @@ impl DistIteratorLauncher for UnsafeArray { I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a 
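Note: the new `AsyncTeamFrom<(Vec<T>, Distribution)>` impl for `UnsafeArray` just above sizes the global array and places each PE's contribution by summing the local lengths reported by lower-ranked PEs, then `put`ting its values at that offset. A self-contained sketch of the offset arithmetic, with hypothetical names:

```rust
// `local_lens[i]` is assumed to hold the number of values contributed by PE i.
fn start_offset_and_total(local_lens: &[usize], my_pe: usize) -> (usize, usize) {
    let my_start: usize = local_lens[..my_pe].iter().sum(); // lower-ranked PEs land first
    let total: usize = local_lens.iter().sum();             // global length of the new array
    (my_start, total)
}

fn main() {
    // e.g. 4 PEs contributing 3, 5, 2, and 4 elements respectively
    let lens = [3, 5, 2, 4];
    assert_eq!(start_offset_and_total(&lens, 2), (8, 14)); // PE 2 writes at index 8 of 14
}
```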
Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { let collect = CollectAsync { iter: iter.clone().monotonic(), diff --git a/src/array/unsafe/iteration/local.rs b/src/array/unsafe/iteration/local.rs index 1ad136ee..16151573 100644 --- a/src/array/unsafe/iteration/local.rs +++ b/src/array/unsafe/iteration/local.rs @@ -1,7 +1,7 @@ use crate::active_messaging::SyncSend; use crate::array::iterator::local_iterator::*; use crate::array::r#unsafe::UnsafeArray; -use crate::array::{ArrayOps, Distribution, TeamFrom}; +use crate::array::{ArrayOps, AsyncTeamFrom, Distribution}; use crate::array::iterator::Schedule; use crate::lamellar_team::LamellarTeamRT; @@ -162,7 +162,7 @@ impl LocalIteratorLauncher for UnsafeArray { where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { self.collect_with_schedule(Schedule::Static, iter, d) } @@ -176,7 +176,7 @@ impl LocalIteratorLauncher for UnsafeArray { where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { let collect = Collect { iter: iter.clone().monotonic(), diff --git a/src/lamellar_task_group.rs b/src/lamellar_task_group.rs index 26e45ec0..d699370c 100644 --- a/src/lamellar_task_group.rs +++ b/src/lamellar_task_group.rs @@ -552,6 +552,27 @@ impl LamellarTaskGroup { } } + pub(crate) async fn await_all(&self) { + let mut temp_now = Instant::now(); + while self.counters.outstanding_reqs.load(Ordering::SeqCst) > 0 { + // self.team.flush(); + // self.team.scheduler.exec_task(); + async_std::task::yield_now().await; + if temp_now.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { + println!( + "in task group wait_all mype: {:?} cnt: {:?} {:?}", + self.team.world_pe, + self.team.team_counters.send_req_cnt.load(Ordering::SeqCst), + self.team + .team_counters + .outstanding_reqs + .load(Ordering::SeqCst), + ); + temp_now = Instant::now(); + } + } + } + pub(crate) fn exec_am_all_inner( &self, am: F, diff --git a/src/scheduler.rs b/src/scheduler.rs index 97d85179..0ba82e24 100755 --- a/src/scheduler.rs +++ b/src/scheduler.rs @@ -14,9 +14,9 @@ pub(crate) mod work_stealing; use work_stealing::WorkStealing; #[cfg(feature = "tokio-executor")] -pub(crate) mod tokio; +pub(crate) mod tokio_executor; #[cfg(feature = "tokio-executor")] -use tokio::TokioRt; +use tokio_executor::TokioRt; // ACTIVE ENUM // since atomic enums would be another dependecy @@ -223,6 +223,12 @@ impl Scheduler { } pub(crate) fn block_on(&self, task: F) -> F::Output { + if std::thread::current().id() != *crate::MAIN_THREAD { + println!( + "trying to call block on within a worker thread {:?}", + std::backtrace::Backtrace::capture() + ) + } self.executor.block_on(task) } diff --git a/src/scheduler/tokio.rs b/src/scheduler/tokio.rs deleted file mode 100644 index f9e14ac1..00000000 --- a/src/scheduler/tokio.rs +++ /dev/null @@ -1,88 +0,0 @@ -use crate::scheduler::{LamellarExecutor, SchedulerStatus}; - -use tokio::runtime::Runtime; - -use tracing::*; - -use async_task::{Builder, Runnable}; -use core_affinity::CoreId; -use crossbeam::deque::Worker; -use futures::Future; -use futures_lite::FutureExt; -use rand::prelude::*; -use std::panic; -use std::process; -use std::sync::atomic::{AtomicU8, AtomicUsize, Ordering}; -use std::sync::Arc; 
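Note: both `UnsafeArray::await_all` and the new `LamellarTaskGroup::await_all` above spin on outstanding-request counters, yielding to the executor between checks and printing a progress warning once a deadlock timeout elapses. A minimal sketch of that pattern (assuming the `async-std` crate, which the patch itself uses for `yield_now`):

```rust
use std::sync::atomic::{AtomicUsize, Ordering};
use std::time::{Duration, Instant};

// Cooperatively wait for `outstanding` to drain, warning if it takes too long.
async fn drain(outstanding: &AtomicUsize, warn_after: Duration) {
    let mut last_warn = Instant::now();
    while outstanding.load(Ordering::SeqCst) > 0 {
        async_std::task::yield_now().await; // don't monopolize the worker thread
        if last_warn.elapsed() > warn_after {
            eprintln!("still waiting on {} requests", outstanding.load(Ordering::SeqCst));
            last_warn = Instant::now();
        }
    }
}

fn main() {
    let outstanding = AtomicUsize::new(0); // nothing pending, so this returns immediately
    async_std::task::block_on(drain(&outstanding, Duration::from_secs(1)));
}
```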
//, Weak}; -use std::thread; - -static TASK_ID: AtomicUsize = AtomicUsize::new(0); - -#[derive(Debug)] -pub(crate) struct TokioRt { - max_num_threads: usize, - rt: Runtime, -} - -impl LamellarExecutor for TokioRt { - fn submit_task(&self, task: F) - where - F: Future + Send + 'static, - F::Output: Send, - { - trace_span!("submit_task").in_scope(|| { - self.rt.spawn(async move { task.await }); - }); - } - - fn submit_immediate_task(&self, task: F) - where - F: Future + Send + 'static, - F::Output: Send, - { - trace_span!("submit_task").in_scope(|| { - self.rt.spawn(async move { task.await }); - }); - } - - fn block_on(&self, task: F) -> F::Output { - trace_span!("block_on").in_scope(|| self.rt.block_on(task)) - } - - #[tracing::instrument(skip_all)] - fn shutdown(&self) { - // i think we just let tokio do this on drop - } - - #[tracing::instrument(skip_all)] - fn force_shutdown(&self) { - // i think we just let tokio do this on drop - } - - #[tracing::instrument(skip_all)] - fn exec_task(&self) { - // I dont think tokio has a way to do this - } - - fn set_max_workers(&mut self, num_workers: usize) { - self.max_num_threads = num_workers; - } - - fn num_workers(&self) -> usize { - self.max_num_threads - } -} - -impl TokioRt { - pub(crate) fn new(num_workers: usize) -> TokioRt { - // println!("New TokioRT with {} workers", num_workers); - TokioRt { - max_num_threads: num_workers + 1, //LAMELLAR_THREADS = num_workers + 1, so for tokio runtime, we actually want num_workers + 1 worker threads as block_on will not do anywork on the main thread (i think)... - rt: tokio::runtime::Builder::new_multi_thread() - .worker_threads(num_workers + 1) - .enable_all() - .build() - .unwrap(), - } - } -} From db330f33f6c075db576a29a71ed05b67a251f8b0 Mon Sep 17 00:00:00 2001 From: "ryan.friese@pnnl.gov" Date: Thu, 8 Feb 2024 10:49:15 -0800 Subject: [PATCH 006/116] renaming tokio.rs -> tokio_executor.rs --- src/scheduler/tokio_executor.rs | 88 +++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 src/scheduler/tokio_executor.rs diff --git a/src/scheduler/tokio_executor.rs b/src/scheduler/tokio_executor.rs new file mode 100644 index 00000000..f9e14ac1 --- /dev/null +++ b/src/scheduler/tokio_executor.rs @@ -0,0 +1,88 @@ +use crate::scheduler::{LamellarExecutor, SchedulerStatus}; + +use tokio::runtime::Runtime; + +use tracing::*; + +use async_task::{Builder, Runnable}; +use core_affinity::CoreId; +use crossbeam::deque::Worker; +use futures::Future; +use futures_lite::FutureExt; +use rand::prelude::*; +use std::panic; +use std::process; +use std::sync::atomic::{AtomicU8, AtomicUsize, Ordering}; +use std::sync::Arc; //, Weak}; +use std::thread; + +static TASK_ID: AtomicUsize = AtomicUsize::new(0); + +#[derive(Debug)] +pub(crate) struct TokioRt { + max_num_threads: usize, + rt: Runtime, +} + +impl LamellarExecutor for TokioRt { + fn submit_task(&self, task: F) + where + F: Future + Send + 'static, + F::Output: Send, + { + trace_span!("submit_task").in_scope(|| { + self.rt.spawn(async move { task.await }); + }); + } + + fn submit_immediate_task(&self, task: F) + where + F: Future + Send + 'static, + F::Output: Send, + { + trace_span!("submit_task").in_scope(|| { + self.rt.spawn(async move { task.await }); + }); + } + + fn block_on(&self, task: F) -> F::Output { + trace_span!("block_on").in_scope(|| self.rt.block_on(task)) + } + + #[tracing::instrument(skip_all)] + fn shutdown(&self) { + // i think we just let tokio do this on drop + } + + #[tracing::instrument(skip_all)] + fn 
force_shutdown(&self) { + // i think we just let tokio do this on drop + } + + #[tracing::instrument(skip_all)] + fn exec_task(&self) { + // I dont think tokio has a way to do this + } + + fn set_max_workers(&mut self, num_workers: usize) { + self.max_num_threads = num_workers; + } + + fn num_workers(&self) -> usize { + self.max_num_threads + } +} + +impl TokioRt { + pub(crate) fn new(num_workers: usize) -> TokioRt { + // println!("New TokioRT with {} workers", num_workers); + TokioRt { + max_num_threads: num_workers + 1, //LAMELLAR_THREADS = num_workers + 1, so for tokio runtime, we actually want num_workers + 1 worker threads as block_on will not do anywork on the main thread (i think)... + rt: tokio::runtime::Builder::new_multi_thread() + .worker_threads(num_workers + 1) + .enable_all() + .build() + .unwrap(), + } + } +} From ded0d588fc7b3acd16720efed2e5fc7548a7058a Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Thu, 15 Feb 2024 09:54:15 -0800 Subject: [PATCH 007/116] all examples compile, not all complete --- .../active_message_examples/am_batch_tests.rs | 86 ++- examples/active_message_examples/am_local.rs | 4 +- .../am_local_memregions.rs | 6 +- examples/array_examples/array_am.rs | 2 +- examples/array_examples/array_batch_add.rs | 8 +- .../array_consumer_schedules.rs | 4 +- examples/array_examples/array_ops.rs | 122 ++-- examples/array_examples/array_put_get.rs | 2 +- .../array_examples/atomic_compare_exchange.rs | 4 +- examples/array_examples/dist_array_reduce.rs | 2 +- .../array_examples/distributed_iteration.rs | 22 +- examples/array_examples/local_iteration.rs | 22 +- examples/bandwidths/am_bw.rs | 2 +- examples/bandwidths/am_bw_get.rs | 2 +- examples/bandwidths/atomic_array_get_bw.rs | 4 +- examples/bandwidths/atomic_array_put_bw.rs | 6 +- .../global_lock_atomic_array_get_bw.rs | 4 +- .../global_lock_atomic_array_put_bw.rs | 6 +- .../local_lock_atomic_array_get_bw.rs | 4 +- .../local_lock_atomic_array_put_bw.rs | 7 +- examples/bandwidths/readonly_array_get_bw.rs | 4 +- .../readonly_array_get_unchecked_bw.rs | 2 +- examples/bandwidths/task_group_am_bw.rs | 2 +- examples/bandwidths/unsafe_array_get_bw.rs | 4 +- .../unsafe_array_get_unchecked_bw.rs | 2 +- examples/bandwidths/unsafe_array_put_bw.rs | 2 +- examples/bandwidths/unsafe_array_store_bw.rs | 4 +- examples/darc_examples/darc.rs | 4 +- examples/darc_examples/stress_test.rs | 6 +- examples/hello_world/hello_world_array.rs | 2 +- examples/kernels/dft_proxy.rs | 30 +- examples/kernels/parallel_array_gemm.rs | 10 +- .../kernels/parallel_blocked_array_gemm.rs | 9 +- examples/kernels/serial_array_gemm.rs | 9 +- examples/rdma_examples/rdma_am.rs | 2 +- examples/team_examples/custom_team_arch.rs | 6 +- examples/team_examples/random_team.rs | 2 +- examples/team_examples/team_am.rs | 4 +- src/active_messaging.rs | 8 +- .../batching/simple_batcher.rs | 18 +- .../batching/team_am_batcher.rs | 89 ++- .../registered_active_message.rs | 87 +-- src/array.rs | 21 + src/array/iterator/distributed_iterator.rs | 7 +- .../distributed_iterator/consumer/collect.rs | 2 +- .../local_iterator/consumer/collect.rs | 2 +- src/array/unsafe.rs | 70 --- src/array/unsafe/iteration/distributed.rs | 2 +- src/array/unsafe/operations.rs | 1 - src/darc.rs | 40 -- src/lamellar_request.rs | 18 +- src/lamellar_world.rs | 35 +- src/scheduler.rs | 68 ++- src/scheduler/numa_work_stealing.rs | 552 ----------------- src/scheduler/numa_work_stealing2.rs | 569 ------------------ src/scheduler/tokio_executor.rs | 14 +- tests/array/arithmetic_ops/add_test.rs | 56 +- 
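Note: PATCH 006 above only moves the Tokio backend to `src/scheduler/tokio_executor.rs`; the executor itself wraps a multi-threaded Tokio runtime, spawning submitted tasks and driving `block_on` on the calling thread. A stripped-down, self-contained sketch of that shape (assuming the `tokio` crate with its multi-threaded runtime enabled; the names here are not Lamellar's):

```rust
use std::future::Future;
use tokio::runtime::Runtime;

struct Rt {
    rt: Runtime,
}

impl Rt {
    fn new(num_workers: usize) -> Self {
        Rt {
            rt: tokio::runtime::Builder::new_multi_thread()
                .worker_threads(num_workers)
                .enable_all()
                .build()
                .expect("failed to build Tokio runtime"),
        }
    }

    fn submit<F>(&self, task: F)
    where
        F: Future + Send + 'static,
        F::Output: Send + 'static,
    {
        self.rt.spawn(task); // fire-and-forget, mirroring submit_task above
    }

    fn block_on<F: Future>(&self, task: F) -> F::Output {
        self.rt.block_on(task) // drives the future to completion on the calling thread
    }
}

fn main() {
    let rt = Rt::new(4);
    rt.submit(async { /* background work */ });
    let v = rt.block_on(async { 40 + 2 });
    assert_eq!(v, 42);
}
```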
tests/array/arithmetic_ops/div_test.rs | 14 +- tests/array/arithmetic_ops/fetch_add_test.rs | 20 +- tests/array/arithmetic_ops/fetch_div_test.rs | 10 +- tests/array/arithmetic_ops/fetch_mul_test.rs | 8 +- tests/array/arithmetic_ops/fetch_sub_test.rs | 8 +- tests/array/arithmetic_ops/mul_test.rs | 14 +- tests/array/arithmetic_ops/sub_test.rs | 20 +- .../array/atomic_ops/compare_exchange_test.rs | 8 +- tests/array/atomic_ops/load_store_test.rs | 14 +- tests/array/atomic_ops/swap_test.rs | 8 +- tests/array/bitwise_ops/and_test.rs | 14 +- tests/array/bitwise_ops/fetch_and_test.rs | 8 +- tests/array/bitwise_ops/fetch_or_test.rs | 8 +- tests/array/bitwise_ops/fetch_xor_test.rs | 8 +- tests/array/bitwise_ops/or_test.rs | 14 +- tests/array/bitwise_ops/xor_test.rs | 14 +- tests/array/rdma/blocking_get_test.rs | 21 +- tests/array/rdma/get_test.rs | 27 +- tests/array/rdma/put_test.rs | 14 +- 76 files changed, 639 insertions(+), 1695 deletions(-) delete mode 100644 src/scheduler/numa_work_stealing.rs delete mode 100644 src/scheduler/numa_work_stealing2.rs diff --git a/examples/active_message_examples/am_batch_tests.rs b/examples/active_message_examples/am_batch_tests.rs index 0c15ef6f..9fb93b2a 100644 --- a/examples/active_message_examples/am_batch_tests.rs +++ b/examples/active_message_examples/am_batch_tests.rs @@ -127,100 +127,123 @@ fn main() { world.barrier(); println!("after first barrier"); // if my_pe == 0 { + let mut cnts = vec![0; 12]; let s = Instant::now(); + // for pe in 0..3 { + // for am_type in 1..2 { for i in 0..10000 { let pe = pe_rng.sample(&mut rng); let len1 = buf_rng.sample(&mut rng); let len2 = buf_rng.sample(&mut rng); + let am_type = am_rng.sample(&mut rng); + // let pe = i % (num_pes + 1); + // println!("{}", pe); if pe == num_pes { - match am_rng.sample(&mut rng) { + // let am_type = 7; + match am_type { 0 => { - world.exec_am_all(AmEmpty {}); + let _ = world.exec_am_all(AmEmpty {}); + cnts[0] += 1; } //batch msg ,batch unit return 1 => { - world.exec_am_all(AmEmptyReturnAmEmpty {}); + let _ = world.exec_am_all(AmEmptyReturnAmEmpty {}); + cnts[1] += 1; } //batch msg, batch return am 2 => { - world.exec_am_all(AmNoReturn { + let _ = world.exec_am_all(AmNoReturn { my_pe: my_pe, index: i, data: vec![i; 1], }); + cnts[2] += 1; } //batch msg ,batch unit return 3 => { - world.exec_am_all(AmNoReturn { + let _ = world.exec_am_all(AmNoReturn { my_pe: my_pe, index: i, data: vec![i; len1], }); + cnts[3] += 1; } //direct msg , batch unit return 4 => { - world.exec_am_all(AmReturnVec { + let _ = world.exec_am_all(AmReturnVec { my_pe: my_pe, vec_size: 1, data: vec![i; 1], }); + cnts[4] += 1; } //batch message, batch return 5 => { - world.exec_am_all(AmReturnVec { + let _ = world.exec_am_all(AmReturnVec { my_pe: my_pe, vec_size: 1, data: vec![i; len1], }); + cnts[5] += 1; } //direct msg, batch return 6 => { - world.exec_am_all(AmReturnVec { + let _ = world.exec_am_all(AmReturnVec { my_pe: my_pe, vec_size: 100000, data: vec![i; 1], }); + cnts[6] += 1; } //batch message, direct return 7 => { - world.exec_am_all(AmReturnVec { + let _ = world.exec_am_all(AmReturnVec { my_pe: my_pe, vec_size: 100000, data: vec![i; len1], }); + cnts[7] += 1; } //direct msg, direct return 8 => { - world.exec_am_all(InitialAMVec { + let _ = world.exec_am_all(InitialAMVec { val1: 1, val2: hostname::get().unwrap().to_string_lossy().to_string(), vec: vec![i; 1], }); + cnts[8] += 1; } //batch msg ,batch return 9 => { - world.exec_am_all(InitialAMVec { + let _ = world.exec_am_all(InitialAMVec { val1: 1, val2: 
hostname::get().unwrap().to_string_lossy().to_string(), vec: vec![i; len1], }); + cnts[9] += 1; } //direct msg , batch return 10 => { - world.exec_am_all(InitialAMVec { + let _ = world.exec_am_all(InitialAMVec { val1: 100000, val2: hostname::get().unwrap().to_string_lossy().to_string(), vec: vec![i; 1], }); + cnts[10] += 1; } //batch message, direct return _ => { - world.exec_am_all(InitialAMVec { + let _ = world.exec_am_all(InitialAMVec { val1: 100000, val2: hostname::get().unwrap().to_string_lossy().to_string(), vec: vec![i; len1], }); + cnts[11] += 1; } //direct msg, direct return } } else { - match am_rng.sample(&mut rng) { + // let am_type = am_rng.sample(&mut rng); + // let am_type = 7; + match am_type { 0 => { - world.exec_am_pe(pe, AmEmpty {}); + let _ = world.exec_am_pe(pe, AmEmpty {}); + cnts[0] += 1; } //batch msg ,batch unit return 1 => { - world.exec_am_pe(pe, AmEmptyReturnAmEmpty {}); + let _ = world.exec_am_pe(pe, AmEmptyReturnAmEmpty {}); + cnts[1] += 1; } //batch msg, batch return am 2 => { - world.exec_am_pe( + let _ = world.exec_am_pe( pe, AmNoReturn { my_pe: my_pe, @@ -228,9 +251,10 @@ fn main() { data: vec![i; 1], }, ); + cnts[2] += 1; } //batch msg ,batch unit return 3 => { - world.exec_am_pe( + let _ = world.exec_am_pe( pe, AmNoReturn { my_pe: my_pe, @@ -238,9 +262,10 @@ fn main() { data: vec![i; len1], }, ); + cnts[3] += 1; } //direct msg , batch unit return 4 => { - world.exec_am_pe( + let _ = world.exec_am_pe( pe, AmReturnVec { my_pe: my_pe, @@ -248,9 +273,10 @@ fn main() { data: vec![i; 1], }, ); + cnts[4] += 1; } //batch message, batch return 5 => { - world.exec_am_pe( + let _ = world.exec_am_pe( pe, AmReturnVec { my_pe: my_pe, @@ -258,9 +284,10 @@ fn main() { data: vec![i; len1], }, ); + cnts[5] += 1; } //direct msg, batch return 6 => { - world.exec_am_pe( + let _ = world.exec_am_pe( pe, AmReturnVec { my_pe: my_pe, @@ -268,9 +295,10 @@ fn main() { data: vec![i; 1], }, ); + cnts[6] += 1; } //batch message, direct return 7 => { - world.exec_am_pe( + let _ = world.exec_am_pe( pe, AmReturnVec { my_pe: my_pe, @@ -278,9 +306,10 @@ fn main() { data: vec![i; len1], }, ); + cnts[7] += 1; } //direct msg, direct return 8 => { - world.exec_am_pe( + let _ = world.exec_am_pe( pe, InitialAMVec { val1: 1, @@ -288,9 +317,10 @@ fn main() { vec: vec![i; 1], }, ); + cnts[8] += 1; } //batch msg ,batch return 9 => { - world.exec_am_pe( + let _ = world.exec_am_pe( pe, InitialAMVec { val1: 1, @@ -298,9 +328,10 @@ fn main() { vec: vec![i; len1], }, ); + cnts[9] += 1; } //direct msg , batch return 10 => { - world.exec_am_pe( + let _ = world.exec_am_pe( pe, InitialAMVec { val1: len2, @@ -308,9 +339,10 @@ fn main() { vec: vec![i; 1], }, ); + cnts[10] += 1; } //batch message, direct return _ => { - world.exec_am_pe( + let _ = world.exec_am_pe( pe, InitialAMVec { val1: len2, @@ -318,11 +350,13 @@ fn main() { vec: vec![i; len1], }, ); + cnts[11] += 1; } //direct msg, direct return } } } println!("issue time: {:?}", s.elapsed().as_secs_f64()); + println!("cnts: {:?}", cnts); world.wait_all(); println!("local finished time: {:?}", s.elapsed().as_secs_f64()); world.barrier(); diff --git a/examples/active_message_examples/am_local.rs b/examples/active_message_examples/am_local.rs index b13df30b..1a891dc6 100644 --- a/examples/active_message_examples/am_local.rs +++ b/examples/active_message_examples/am_local.rs @@ -88,7 +88,7 @@ fn main() { println!("---------------------------------------------------------------"); println!("Testing local am no return"); for i in 0..map.len() { - 
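Note: throughout the example updates in this patch, calls such as `world.exec_am_all(..)` and `world.exec_am_pe(..)` are now bound to `_`. A plausible reading (an assumption here, not stated in the patch) is that these methods return a request handle that warns if silently dropped, so `let _ = ...` keeps the fire-and-forget behavior while acknowledging the handle. A generic, self-contained illustration of that idiom:

```rust
// A handle type that warns when ignored, and the `let _ =` idiom that explicitly
// discards it. `Handle`/`launch` are made-up names for illustration only.
#[must_use = "await or explicitly discard this handle"]
struct Handle(u32);

fn launch() -> Handle {
    Handle(7)
}

fn main() {
    let _ = launch();    // intentional fire-and-forget; no unused-result warning
    let kept = launch(); // or keep the handle and use its result later
    println!("result id = {}", kept.0);
}
```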
world.exec_am_local(AmNoReturn { + let _ = world.exec_am_local(AmNoReturn { my_id: i, data: map.clone(), index: index.clone(), @@ -99,7 +99,7 @@ fn main() { println!("---------------------------------------------------------------"); println!("Testing local am no return"); for i in 0..map.len() { - world.exec_am_local(AmReturnUsize { + let _ = world.exec_am_local(AmReturnUsize { my_id: i, data: map.clone(), index: index.clone(), diff --git a/examples/active_message_examples/am_local_memregions.rs b/examples/active_message_examples/am_local_memregions.rs index 7f0d7711..5733c8c7 100644 --- a/examples/active_message_examples/am_local_memregions.rs +++ b/examples/active_message_examples/am_local_memregions.rs @@ -30,7 +30,7 @@ impl LamellarAM for DataAM { for _i in 0..self.width { let pe = pes.sample(&mut rng); // println!("sending {:?} to {:?}",path,pe); - lamellar::team.exec_am_pe( + let _ = lamellar::team.exec_am_pe( pe, DataAM { array: self.array.clone(), @@ -86,7 +86,7 @@ fn main() { let width = 5; for _i in 0..width { let pe = pes.sample(&mut rng) / 2; //since both teams consist of half the number of pes as the world - first_half_team.exec_am_pe( + let _ = first_half_team.exec_am_pe( pe, DataAM { array: array.clone(), @@ -95,7 +95,7 @@ fn main() { path: vec![my_pe], }, ); - odd_team.exec_am_pe( + let _ = odd_team.exec_am_pe( pe, DataAM { array: array.clone(), diff --git a/examples/array_examples/array_am.rs b/examples/array_examples/array_am.rs index a68a3ff7..7367d140 100644 --- a/examples/array_examples/array_am.rs +++ b/examples/array_examples/array_am.rs @@ -101,7 +101,7 @@ fn main() { world.barrier(); let mut index = 0; while index < ARRAY_LEN / num_pes { - world.exec_am_all(RdmaAM { + let _ = world.exec_am_all(RdmaAM { array: array.clone(), orig_pe: my_pe, index: index, diff --git a/examples/array_examples/array_batch_add.rs b/examples/array_examples/array_batch_add.rs index 74fef50b..698e28f1 100644 --- a/examples/array_examples/array_batch_add.rs +++ b/examples/array_examples/array_batch_add.rs @@ -38,7 +38,7 @@ fn main() { array.barrier(); let timer = std::time::Instant::now(); - array.batch_add(indices.clone(), 1); + let _ = array.batch_add(indices.clone(), 1); if my_pe == 0 { println!("{:?}", timer.elapsed()); } @@ -53,7 +53,7 @@ fn main() { array.barrier(); let mut timer = std::time::Instant::now(); - array.batch_add(indices.clone(), 1); + let _ = array.batch_add(indices.clone(), 1); if my_pe == 0 { println!("{:?}", timer.elapsed()); } @@ -74,7 +74,7 @@ fn main() { if bufs[pe].len() == num_per_batch { let mut buf = Vec::with_capacity(num_per_batch); std::mem::swap(&mut bufs[pe], &mut buf); - world.exec_am_pe( + let _ = world.exec_am_pe( pe, AddAm { array: array.clone(), @@ -85,7 +85,7 @@ fn main() { } for (pe, buf) in bufs.drain(..).enumerate() { if buf.len() > 0 { - world.exec_am_pe( + let _ = world.exec_am_pe( pe, AddAm { array: array.clone(), diff --git a/examples/array_examples/array_consumer_schedules.rs b/examples/array_examples/array_consumer_schedules.rs index 64288296..2ad821a7 100644 --- a/examples/array_examples/array_consumer_schedules.rs +++ b/examples/array_examples/array_consumer_schedules.rs @@ -14,7 +14,7 @@ fn for_each_with_schedule( ) { let timer = Instant::now(); let tc = thread_cnts.clone(); - array + let _ = array .local_iter() .filter(|e| e.load() % 2 == 0) .for_each_with_schedule(schedule, move |e| { @@ -109,7 +109,7 @@ fn main() { let _my_pe = world.my_pe(); let _num_pes = world.num_pes(); let block_array = AtomicArray::::new(world.team(), ARRAY_LEN, 
Distribution::Block); - block_array + let _ = block_array .dist_iter_mut() .enumerate() .for_each(move |(i, e)| e.store(i)); diff --git a/examples/array_examples/array_ops.rs b/examples/array_examples/array_ops.rs index f311da92..f5932a5a 100644 --- a/examples/array_examples/array_ops.rs +++ b/examples/array_examples/array_ops.rs @@ -84,7 +84,7 @@ fn test_add( init_val: T, add_val: T, ) { - array + let _ = array .dist_iter_mut() .for_each(move |elem| elem.store(init_val)); array.wait_all(); @@ -92,7 +92,7 @@ fn test_add( array.print(); array.barrier(); for i in 0..array.len() { - array.add(i, add_val); + let _ = array.add(i, add_val); } array.wait_all(); array.barrier(); @@ -121,7 +121,7 @@ fn test_sub( init_val: T, sub_val: T, ) { - array + let _ = array .dist_iter_mut() .for_each(move |elem| elem.store(init_val)); array.wait_all(); @@ -129,7 +129,7 @@ fn test_sub( array.print(); array.barrier(); for i in 0..array.len() { - array.sub(i, sub_val); + let _ = array.sub(i, sub_val); } array.wait_all(); array.barrier(); @@ -152,7 +152,7 @@ fn test_mul( init_val: T, mul_val: T, ) { - array + let _ = array .dist_iter_mut() .for_each(move |elem| elem.store(init_val)); array.wait_all(); @@ -160,7 +160,7 @@ fn test_mul( array.print(); array.barrier(); for i in 0..array.len() { - array.mul(i, mul_val); + let _ = array.mul(i, mul_val); } array.wait_all(); array.barrier(); @@ -183,7 +183,7 @@ fn test_div( init_val: T, div_val: T, ) { - array + let _ = array .dist_iter_mut() .for_each(move |elem| elem.store(init_val)); array.wait_all(); @@ -191,7 +191,7 @@ fn test_div( array.print(); array.barrier(); for i in 0..array.len() { - array.div(i, div_val); + let _ = array.div(i, div_val); } array.wait_all(); array.barrier(); @@ -214,7 +214,7 @@ fn test_rem( init_val: T, rem_val: T, ) { - array + let _ = array .dist_iter_mut() .for_each(move |elem| elem.store(init_val)); array.wait_all(); @@ -222,7 +222,7 @@ fn test_rem( array.print(); array.barrier(); for i in 0..array.len() { - array.rem(i, rem_val); + let _ = array.rem(i, rem_val); } array.wait_all(); array.barrier(); @@ -245,7 +245,7 @@ fn test_and( init_val: T, or_val: T, ) { - array + let _ = array .dist_iter_mut() .for_each(move |elem| elem.store(init_val)); array.wait_all(); @@ -289,13 +289,13 @@ fn test_or( array.print(); array.barrier(); for i in 0..array.len() { - array.bit_or(i, or_val); + let _ = array.bit_or(i, or_val); } array.wait_all(); array.barrier(); array.print(); array.barrier(); - array + let _ = array .dist_iter_mut() .for_each(move |elem| elem.store(init_val)); array.wait_all(); @@ -317,7 +317,7 @@ fn test_xor( init_val: T, xor_val: T, ) { - array + let _ = array .dist_iter_mut() .for_each(move |elem| elem.store(init_val)); array.wait_all(); @@ -325,13 +325,13 @@ fn test_xor( array.print(); array.barrier(); for i in 0..array.len() { - array.bit_xor(i, xor_val); + let _ = array.bit_xor(i, xor_val); } array.wait_all(); array.barrier(); array.print(); array.barrier(); - array + let _ = array .dist_iter_mut() .for_each(move |elem| elem.store(init_val)); array.wait_all(); @@ -355,7 +355,7 @@ fn test_store_load( my_pe: usize, num_pes: usize, ) { - array + let _ = array .dist_iter_mut() .for_each(move |elem| elem.store(init_val)); array.wait_all(); @@ -363,7 +363,7 @@ fn test_store_load( array.print(); array.barrier(); for i in (my_pe..array.len()).step_by(num_pes) { - array.store(i, store_val); + let _ = array.store(i, store_val); } array.wait_all(); array.barrier(); @@ -387,7 +387,7 @@ fn test_shl( init_val: T, shl_val: T, ) { - array + 
let _ = array .dist_iter_mut() .for_each(move |elem| elem.store(init_val)); array.wait_all(); @@ -395,7 +395,7 @@ fn test_shl( array.print(); array.barrier(); for i in 0..array.len() { - array.shl(i, shl_val); + let _ = array.shl(i, shl_val); } array.wait_all(); array.barrier(); @@ -418,7 +418,7 @@ fn test_shr( init_val: T, shr_val: T, ) { - array + let _ = array .dist_iter_mut() .for_each(move |elem| elem.store(init_val)); array.wait_all(); @@ -426,7 +426,7 @@ fn test_shr( array.print(); array.barrier(); for i in 0..array.len() { - array.shr(i, shr_val); + let _ = array.shr(i, shr_val); } array.wait_all(); array.barrier(); @@ -464,25 +464,25 @@ fn main() { Custom { int: 0, float: 0.0 }, Custom { int: 1, float: 1.0 }, ); - (&array_u8).add(3, 1); + let _ = (&array_u8).add(3, 1); array_u8.wait_all(); array_u8.barrier(); array_u8.print(); array_u8.barrier(); - (&array_i128).add(3, 1); + let _ = (&array_i128).add(3, 1); array_i128.wait_all(); array_i128.barrier(); array_i128.print(); array_i128.barrier(); - (&array_f64).add(3, 1.0); + let _ = (&array_f64).add(3, 1.0); array_f64.wait_all(); array_f64.barrier(); array_f64.print(); array_f64.barrier(); - (&array_custom).add(3, Custom { int: 1, float: 1.0 }); + let _ = (&array_custom).add(3, Custom { int: 1, float: 1.0 }); array_custom.wait_all(); array_custom.barrier(); array_custom.print(); @@ -500,25 +500,25 @@ fn main() { }, Custom { int: 1, float: 1.0 }, ); - (&array_u8).sub(3, 1); + let _ = (&array_u8).sub(3, 1); array_u8.wait_all(); array_u8.barrier(); array_u8.print(); array_u8.barrier(); - (&array_i128).sub(3, -1); + let _ = (&array_i128).sub(3, -1); array_i128.wait_all(); array_i128.barrier(); array_i128.print(); array_i128.barrier(); - (&array_f64).sub(3, 1.0); + let _ = (&array_f64).sub(3, 1.0); array_f64.wait_all(); array_f64.barrier(); array_f64.print(); array_f64.barrier(); - (&array_custom).sub(3, Custom { int: 1, float: 1.0 }); + let _ = (&array_custom).sub(3, Custom { int: 1, float: 1.0 }); array_custom.wait_all(); array_custom.barrier(); array_custom.print(); @@ -534,25 +534,25 @@ fn main() { Custom { int: 1, float: 1.0 }, Custom { int: 2, float: 2.5 }, ); - (&array_u8).mul(3, 2); + let _ = (&array_u8).mul(3, 2); array_u8.wait_all(); array_u8.barrier(); array_u8.print(); array_u8.barrier(); - (&array_i128).mul(3, -2); + let _ = (&array_i128).mul(3, -2); array_i128.wait_all(); array_i128.barrier(); array_i128.print(); array_i128.barrier(); - (&array_f64).mul(3, 2.5); + let _ = (&array_f64).mul(3, 2.5); array_f64.wait_all(); array_f64.barrier(); array_f64.print(); array_f64.barrier(); - (&array_custom).mul(3, Custom { int: 1, float: 2.5 }); + let _ = (&array_custom).mul(3, Custom { int: 1, float: 2.5 }); array_custom.wait_all(); array_custom.barrier(); array_custom.print(); @@ -571,25 +571,25 @@ fn main() { }, Custom { int: 2, float: 2.5 }, ); - (&array_u8).div(3, 2); + let _ = (&array_u8).div(3, 2); array_u8.wait_all(); array_u8.barrier(); array_u8.print(); array_u8.barrier(); - (&array_i128).div(3, 2); + let _ = (&array_i128).div(3, 2); array_i128.wait_all(); array_i128.barrier(); array_i128.print(); array_i128.barrier(); - (&array_f64).div(3, 2.5); + let _ = (&array_f64).div(3, 2.5); array_f64.wait_all(); array_f64.barrier(); array_f64.print(); array_f64.barrier(); - (&array_custom).div(3, Custom { int: 1, float: 2.5 }); + let _ = (&array_custom).div(3, Custom { int: 1, float: 2.5 }); array_custom.wait_all(); array_custom.barrier(); array_custom.print(); @@ -608,25 +608,25 @@ fn main() { }, Custom { int: 2, float: 2.5 }, ); - 
(&array_u8).rem(3, 2); + let _ = (&array_u8).rem(3, 2); array_u8.wait_all(); array_u8.barrier(); array_u8.print(); array_u8.barrier(); - (&array_i128).rem(3, 2); + let _ = (&array_i128).rem(3, 2); array_i128.wait_all(); array_i128.barrier(); array_i128.print(); array_i128.barrier(); - (&array_f64).rem(3, 2.5); + let _ = (&array_f64).rem(3, 2.5); array_f64.wait_all(); array_f64.barrier(); array_f64.print(); array_f64.barrier(); - (&array_custom).rem(3, Custom { int: 1, float: 2.5 }); + let _ = (&array_custom).rem(3, Custom { int: 1, float: 2.5 }); array_custom.wait_all(); array_custom.barrier(); array_custom.print(); @@ -639,13 +639,13 @@ fn main() { test_and(array_u8.clone(), 255, and_val); test_and(array_i128.clone(), 1023, and_val.into()); - (&array_u8).bit_and(3, 1 << num_pes); + let _ = (&array_u8).bit_and(3, 1 << num_pes); array_u8.wait_all(); array_u8.barrier(); array_u8.print(); array_u8.barrier(); - (&array_i128).bit_and(3, 1 << num_pes); + let _ = (&array_i128).bit_and(3, 1 << num_pes); array_i128.wait_all(); array_i128.barrier(); array_i128.print(); @@ -656,12 +656,12 @@ fn main() { let or_val = 1 << my_pe; test_or(array_u8.clone(), 0, or_val); test_or(array_i128.clone(), 0, or_val.into()); - (&array_u8).bit_or(3, 1 << num_pes); + let _ = (&array_u8).bit_or(3, 1 << num_pes); array_u8.wait_all(); array_u8.barrier(); array_u8.print(); array_u8.barrier(); - (&array_i128).bit_or(3, 1 << num_pes); + let _ = (&array_i128).bit_or(3, 1 << num_pes); array_i128.wait_all(); array_i128.barrier(); array_i128.print(); @@ -672,12 +672,12 @@ fn main() { let xor_val = 1 << my_pe; test_xor(array_u8.clone(), 0, xor_val); test_xor(array_i128.clone(), 0, xor_val.into()); - (&array_u8).bit_xor(3, 1 << num_pes); + let _ = (&array_u8).bit_xor(3, 1 << num_pes); array_u8.wait_all(); array_u8.barrier(); array_u8.print(); array_u8.barrier(); - (&array_i128).bit_xor(3, 1 << num_pes); + let _ = (&array_i128).bit_xor(3, 1 << num_pes); array_i128.wait_all(); array_i128.barrier(); array_i128.print(); @@ -699,25 +699,25 @@ fn main() { my_pe, num_pes, ); - (&array_u8).store(3, num_pes as u8); + let _ = (&array_u8).store(3, num_pes as u8); array_u8.wait_all(); array_u8.barrier(); array_u8.print(); array_u8.barrier(); - (&array_i128).store(3, num_pes as i128); + let _ = (&array_i128).store(3, num_pes as i128); array_i128.wait_all(); array_i128.barrier(); array_i128.print(); array_i128.barrier(); - (&array_f64).store(3, num_pes as f64); + let _ = (&array_f64).store(3, num_pes as f64); array_f64.wait_all(); array_f64.barrier(); array_f64.print(); array_f64.barrier(); - (&array_custom).store( + let _ = (&array_custom).store( 3, Custom { int: num_pes as usize, @@ -744,19 +744,19 @@ fn main() { float: 0.0, }, ); - (&array_u8).shl(1, 3); + let _ = (&array_u8).shl(1, 3); array_u8.wait_all(); array_u8.barrier(); array_u8.print(); array_u8.barrier(); - (&array_i128).shl(1, 63); + let _ = (&array_i128).shl(1, 63); array_i128.wait_all(); array_i128.barrier(); array_i128.print(); array_i128.barrier(); - (&array_custom).shl( + let _ = (&array_custom).shl( 1, Custom { int: 15, @@ -782,19 +782,19 @@ fn main() { float: 0.0, }, ); - (&array_u8).shr(1, 3); + let _ = (&array_u8).shr(1, 3); array_u8.wait_all(); array_u8.barrier(); array_u8.print(); array_u8.barrier(); - (&array_i128).shr(1, 63); + let _ = (&array_i128).shr(1, 63); array_i128.wait_all(); array_i128.barrier(); array_i128.print(); array_i128.barrier(); - (&array_custom).shr( + let _ = (&array_custom).shr( 1, Custom { int: 15, diff --git 
a/examples/array_examples/array_put_get.rs b/examples/array_examples/array_put_get.rs index d162d171..9d1463f1 100644 --- a/examples/array_examples/array_put_get.rs +++ b/examples/array_examples/array_put_get.rs @@ -2,7 +2,7 @@ use lamellar::array::prelude::*; use lamellar::memregion::prelude::*; fn initialize_array(array: &UnsafeArray) { - unsafe { array.dist_iter_mut().for_each(|x| *x = 0) }; + let _ = unsafe { array.dist_iter_mut().for_each(|x| *x = 0) }; array.wait_all(); array.barrier(); } diff --git a/examples/array_examples/atomic_compare_exchange.rs b/examples/array_examples/atomic_compare_exchange.rs index 4cba0ab5..94612ad1 100644 --- a/examples/array_examples/atomic_compare_exchange.rs +++ b/examples/array_examples/atomic_compare_exchange.rs @@ -24,7 +24,7 @@ fn main() { let my_pe = world.my_pe(); let array = AtomicArray::::new(world.team(), num_pes * 2, Distribution::Block); - array.dist_iter_mut().for_each(|x| x.store(0)); //initialize array -- use atomic store + let _ = array.dist_iter_mut().for_each(|x| x.store(0)); //initialize array -- use atomic store array.wait_all(); array.barrier(); @@ -46,7 +46,7 @@ fn main() { array.print(); let array_2 = AtomicArray::::new(world.team(), num_pes * 100000, Distribution::Cyclic); - array_2.dist_iter_mut().for_each(|x| x.store(0.0)); + let _ = array_2.dist_iter_mut().for_each(|x| x.store(0.0)); array_2.wait_all(); array_2.barrier(); diff --git a/examples/array_examples/dist_array_reduce.rs b/examples/array_examples/dist_array_reduce.rs index 6a193683..15312ad9 100644 --- a/examples/array_examples/dist_array_reduce.rs +++ b/examples/array_examples/dist_array_reduce.rs @@ -154,7 +154,7 @@ fn main() { .for_each(|x| println!("x: {:?}", x)) }); let block_array = block_array.into_read_only(); - block_array.sum(); + let _ = block_array.sum(); // block_array.dist_iter().for_each(|x| println!("x: {:?}", x)); // block_array.for_each(|x| println!("x: {:?}", x)); // cyclic_array.for_each_mut(|x| *x += *x); diff --git a/examples/array_examples/distributed_iteration.rs b/examples/array_examples/distributed_iteration.rs index 20745ad7..f75b1217 100644 --- a/examples/array_examples/distributed_iteration.rs +++ b/examples/array_examples/distributed_iteration.rs @@ -21,10 +21,10 @@ fn main() { // we currently provide the "for_each" driver which will execute a closure on every element in the distributed array (concurrently) //for example lets initialize our arrays, where we store the value of my_pe to each local element a pe owns - block_dist_iter + let _ = block_dist_iter .enumerate() .for_each(move |(i, elem)| elem.store(i)); - cyclic_dist_iter.for_each(move |elem| elem.store(my_pe)); + let _ = cyclic_dist_iter.for_each(move |elem| elem.store(my_pe)); //for_each is asynchronous so we must wait on the array for the operations to complete // we are working on providing a request handle which can be used to check for completion block_array.wait_all(); @@ -54,7 +54,7 @@ fn main() { println!("--------------------------------------------------------"); println!("block skip enumerate step_by"); - block_array + let _ = block_array .dist_iter() .skip(2) .enumerate() @@ -91,7 +91,7 @@ fn main() { println!("--------------------------------------------------------"); println!("cyclic skip enumerate"); - cyclic_array + let _ = cyclic_array .dist_iter() .enumerate() .skip(2) @@ -130,7 +130,7 @@ fn main() { println!("cyclic enumerate map async for each"); cyclic_array.print(); let barray = block_array.clone(); - cyclic_array + let _ = cyclic_array .dist_iter() 
.enumerate() .map(move |(i, elem)| { @@ -178,7 +178,7 @@ fn main() { println!("--------------------------------------------------------"); println!("block enumerate filter"); - block_array + let _ = block_array .dist_iter() .enumerate() .filter(|(_, elem)| elem.load() % 4 == 0) @@ -196,7 +196,7 @@ fn main() { println!("--------------------------------------------------------"); println!("block enumerate filter_map"); - block_array + let _ = block_array .dist_iter() .enumerate() .filter_map(|(i, elem)| { @@ -238,7 +238,7 @@ fn main() { println!("--------------------------------------------------------"); println!("block skip enumerate"); - block_array + let _ = block_array .dist_iter() .skip(10) .enumerate() @@ -257,7 +257,7 @@ fn main() { println!("--------------------------------------------------------"); println!("block skip step_by enumerate"); - block_array + let _ = block_array .dist_iter() .skip(10) .step_by(3) @@ -277,7 +277,7 @@ fn main() { println!("--------------------------------------------------------"); println!("block take skip enumerate"); - block_array + let _ = block_array .dist_iter() .take(60) .skip(10) @@ -297,7 +297,7 @@ fn main() { println!("--------------------------------------------------------"); println!("block take skip take enumerate"); - block_array + let _ = block_array .dist_iter() .take(60) .skip(10) diff --git a/examples/array_examples/local_iteration.rs b/examples/array_examples/local_iteration.rs index 2784fcba..a65df59a 100644 --- a/examples/array_examples/local_iteration.rs +++ b/examples/array_examples/local_iteration.rs @@ -21,10 +21,10 @@ fn main() { // we currently provide the "for_each" driver which will execute a closure on every element in the distributed array (concurrently) //for example lets initialize our arrays, where we store the value of my_pe to each local element a pe owns - block_local_iter + let _ = block_local_iter .enumerate() .for_each(move |(i, elem)| elem.store(i)); - cyclic_local_iter.for_each(move |elem| elem.store(my_pe)); + let _ = cyclic_local_iter.for_each(move |elem| elem.store(my_pe)); //for_each is asynchronous so we must wait on the array for the operations to complete // we are working on providing a request handle which can be used to check for completion block_array.wait_all(); @@ -41,7 +41,7 @@ fn main() { println!("--------------------------------------------------------"); println!("block skip enumerate step_by"); - block_array + let _ = block_array .local_iter() .skip(2) .enumerate() @@ -78,7 +78,7 @@ fn main() { println!("--------------------------------------------------------"); println!("cyclic skip enumerate"); - cyclic_array + let _ = cyclic_array .local_iter() .enumerate() .skip(2) @@ -117,7 +117,7 @@ fn main() { println!("cyclic enumerate map async for each"); cyclic_array.print(); let barray = block_array.clone(); - cyclic_array + let _ = cyclic_array .local_iter() .enumerate() .map(move |(i, elem)| { @@ -165,7 +165,7 @@ fn main() { println!("--------------------------------------------------------"); println!("block enumerate filter"); - block_array + let _ = block_array .local_iter() .enumerate() .filter(|(_, elem)| { @@ -191,7 +191,7 @@ fn main() { println!("--------------------------------------------------------"); println!("block enumerate filter_map"); - block_array + let _ = block_array .local_iter() .enumerate() .filter_map(|(i, elem)| { @@ -226,7 +226,7 @@ fn main() { println!("--------------------------------------------------------"); println!("block skip enumerate"); - block_array + let _ 
= block_array .local_iter() .skip(10) .enumerate() @@ -245,7 +245,7 @@ fn main() { println!("--------------------------------------------------------"); println!("block skip step_by enumerate"); - block_array + let _ = block_array .local_iter() .skip(10) .step_by(3) @@ -265,7 +265,7 @@ fn main() { println!("--------------------------------------------------------"); println!("block take skip enumerate"); - block_array + let _ = block_array .local_iter() .take(60) .skip(10) @@ -285,7 +285,7 @@ fn main() { println!("--------------------------------------------------------"); println!("block take skip take enumerate"); - block_array + let _ = block_array .local_iter() .take(60) .skip(10) diff --git a/examples/bandwidths/am_bw.rs b/examples/bandwidths/am_bw.rs index bcc236fc..c45296af 100644 --- a/examples/bandwidths/am_bw.rs +++ b/examples/bandwidths/am_bw.rs @@ -58,7 +58,7 @@ fn main() { let sub_timer = Instant::now(); let d = _data.clone(); sub_time += sub_timer.elapsed().as_secs_f64(); - world.exec_am_pe(num_pes - 1, DataAM { data: d }); //we explicitly captured _data and transfer it even though we do nothing with it + let _ = world.exec_am_pe(num_pes - 1, DataAM { data: d }); //we explicitly captured _data and transfer it even though we do nothing with it sum += num_bytes * 1 as u64; cnt += 1; diff --git a/examples/bandwidths/am_bw_get.rs b/examples/bandwidths/am_bw_get.rs index d495d70e..b558c414 100644 --- a/examples/bandwidths/am_bw_get.rs +++ b/examples/bandwidths/am_bw_get.rs @@ -76,7 +76,7 @@ fn main() { if my_pe == num_pes - 1 { for _j in (0..(2_u64.pow(exp))).step_by(num_bytes as usize) { let sub_timer = Instant::now(); - world.exec_am_pe( + let _ = world.exec_am_pe( 0, DataAM { array: array.clone(), diff --git a/examples/bandwidths/atomic_array_get_bw.rs b/examples/bandwidths/atomic_array_get_bw.rs index 9c7c5104..9d607594 100644 --- a/examples/bandwidths/atomic_array_get_bw.rs +++ b/examples/bandwidths/atomic_array_get_bw.rs @@ -20,7 +20,7 @@ fn main() { *i = my_pe as u8; } } - array + let _ = array .local_iter_mut() .for_each(move |elem| *elem = num_pes as u8); //this is pretty slow for atomic arrays as we perform an atomic store for 2^30 elements, so use locallock for initialization let array = array.into_atomic(); //this enforces a wait_all and barrier @@ -56,7 +56,7 @@ fn main() { let sub_timer = Instant::now(); let sub_reg = data.sub_region(j..(j + num_bytes as usize)); unsafe { - array.get(ARRAY_LEN * (num_pes - 1), &sub_reg); + let _ = array.get(ARRAY_LEN * (num_pes - 1), &sub_reg); } // println!("j: {:?}",j); // unsafe { array.put_slice(num_pes - 1, j, &data[..num_bytes as usize]) }; diff --git a/examples/bandwidths/atomic_array_put_bw.rs b/examples/bandwidths/atomic_array_put_bw.rs index 038e7cf2..f621abb1 100644 --- a/examples/bandwidths/atomic_array_put_bw.rs +++ b/examples/bandwidths/atomic_array_put_bw.rs @@ -20,7 +20,7 @@ fn main() { *i = my_pe as u8; } } - array + let _ = array .dist_iter_mut() .for_each(move |elem| *elem = 255 as u8); //this can be pretty slow for atomic arrays as we perform an atomic store for 2^30 elements, local lock tends to perform better let mut array = array.into_atomic(); //so we simply convert the LocalLockArray array to atomic after initialization @@ -53,7 +53,7 @@ fn main() { for j in (0..2_u64.pow(exp) as usize).step_by(num_bytes as usize) { let sub_timer = Instant::now(); let sub_reg = data.sub_region(..num_bytes as usize); - unsafe { array.put(ARRAY_LEN * (num_pes - 1) + j, sub_reg) }; + let _ = unsafe { array.put(ARRAY_LEN * 
(num_pes - 1) + j, sub_reg) }; // println!("j: {:?}",j); // unsafe { array.put_slice(num_pes - 1, j, &data[..num_bytes as usize]) }; sub_time += sub_timer.elapsed().as_secs_f64(); @@ -103,7 +103,7 @@ fn main() { // } // }; let temp = array.into_local_lock(); - temp.dist_iter_mut().for_each(move |elem| *elem = 255 as u8); //this is pretty slow for atomic arrays as we perform an atomic store for 2^30 elements + let _ = temp.dist_iter_mut().for_each(move |elem| *elem = 255 as u8); //this is pretty slow for atomic arrays as we perform an atomic store for 2^30 elements array = temp.into_atomic(); world.barrier(); } diff --git a/examples/bandwidths/global_lock_atomic_array_get_bw.rs b/examples/bandwidths/global_lock_atomic_array_get_bw.rs index 92a595eb..496be0ef 100644 --- a/examples/bandwidths/global_lock_atomic_array_get_bw.rs +++ b/examples/bandwidths/global_lock_atomic_array_get_bw.rs @@ -24,7 +24,7 @@ fn main() { // *i = num_pes as u8; // } } - array + let _ = array .dist_iter_mut() .for_each(move |elem| *elem = num_pes as u8); array.wait_all(); @@ -59,7 +59,7 @@ fn main() { let sub_timer = Instant::now(); let sub_reg = data.sub_region(j..(j + num_bytes as usize)); unsafe { - array.get(ARRAY_LEN * (num_pes - 1), &sub_reg); + let _ = array.get(ARRAY_LEN * (num_pes - 1), &sub_reg); } // println!("j: {:?}",j); // unsafe { array.put_slice(num_pes - 1, j, &data[..num_bytes as usize]) }; diff --git a/examples/bandwidths/global_lock_atomic_array_put_bw.rs b/examples/bandwidths/global_lock_atomic_array_put_bw.rs index fa3f257e..2c072a7e 100644 --- a/examples/bandwidths/global_lock_atomic_array_put_bw.rs +++ b/examples/bandwidths/global_lock_atomic_array_put_bw.rs @@ -21,7 +21,7 @@ fn main() { *i = my_pe as u8; } } - array + let _ = array .dist_iter_mut() .for_each(move |elem| *elem = 255 as u8); array.wait_all(); @@ -55,7 +55,7 @@ fn main() { for j in (0..2_u64.pow(exp) as usize).step_by(num_bytes as usize) { let sub_timer = Instant::now(); let sub_reg = data.sub_region(..num_bytes as usize); - unsafe { array.put(j, sub_reg) }; + let _ = unsafe { array.put(j, sub_reg) }; sub_time += sub_timer.elapsed().as_secs_f64(); sum += num_bytes * 1 as u64; cnt += 1; @@ -96,7 +96,7 @@ fn main() { ); } bws.push((sum as f64 / 1048576.0) / cur_t); - array + let _ = array .dist_iter_mut() .for_each(move |elem| *elem = 255 as u8); array.wait_all(); diff --git a/examples/bandwidths/local_lock_atomic_array_get_bw.rs b/examples/bandwidths/local_lock_atomic_array_get_bw.rs index c064af26..94686541 100644 --- a/examples/bandwidths/local_lock_atomic_array_get_bw.rs +++ b/examples/bandwidths/local_lock_atomic_array_get_bw.rs @@ -24,7 +24,7 @@ fn main() { // *i = num_pes as u8; // } } - array + let _ = array .dist_iter_mut() .for_each(move |elem| *elem = num_pes as u8); array.wait_all(); @@ -59,7 +59,7 @@ fn main() { let sub_timer = Instant::now(); let sub_reg = data.sub_region(j..(j + num_bytes as usize)); unsafe { - array.get(ARRAY_LEN * (num_pes - 1), &sub_reg); + let _ = array.get(ARRAY_LEN * (num_pes - 1), &sub_reg); } // println!("j: {:?}",j); // unsafe { array.put_slice(num_pes - 1, j, &data[..num_bytes as usize]) }; diff --git a/examples/bandwidths/local_lock_atomic_array_put_bw.rs b/examples/bandwidths/local_lock_atomic_array_put_bw.rs index 18fa1078..fe4861f9 100644 --- a/examples/bandwidths/local_lock_atomic_array_put_bw.rs +++ b/examples/bandwidths/local_lock_atomic_array_put_bw.rs @@ -21,7 +21,7 @@ fn main() { *i = my_pe as u8; } } - array + let _ = array .dist_iter_mut() .for_each(move |elem| *elem = 255 as 
u8); array.wait_all(); @@ -55,7 +55,7 @@ fn main() { for j in (0..2_u64.pow(exp) as usize).step_by(num_bytes as usize) { let sub_timer = Instant::now(); let sub_reg = data.sub_region(..num_bytes as usize); - unsafe { array.put(ARRAY_LEN * (num_pes - 1) + j, sub_reg) }; + let _ = unsafe { array.put(ARRAY_LEN * (num_pes - 1) + j, sub_reg) }; sub_time += sub_timer.elapsed().as_secs_f64(); sum += num_bytes * 1 as u64; cnt += 1; @@ -67,7 +67,6 @@ fn main() { let cur_t = timer.elapsed().as_secs_f64(); if my_pe == num_pes - 1 { for j in (0..2_u64.pow(exp) as usize).step_by(num_bytes as usize) { - let array_clone = array.clone(); let local_data = array.blocking_read_local_data(); while *(&local_data[(j + num_bytes as usize) - 1]) == 255 as u8 { println!( @@ -97,7 +96,7 @@ fn main() { ); } bws.push((sum as f64 / 1048576.0) / cur_t); - array + let _ = array .dist_iter_mut() .for_each(move |elem| *elem = 255 as u8); array.wait_all(); diff --git a/examples/bandwidths/readonly_array_get_bw.rs b/examples/bandwidths/readonly_array_get_bw.rs index 71b369e4..e3c6d53a 100644 --- a/examples/bandwidths/readonly_array_get_bw.rs +++ b/examples/bandwidths/readonly_array_get_bw.rs @@ -19,7 +19,7 @@ fn main() { for i in data.as_mut_slice().unwrap() { *i = my_pe as u8; } - array + let _ = array .dist_iter_mut() .for_each(move |elem| *elem = num_pes as u8); } @@ -57,7 +57,7 @@ fn main() { let sub_timer = Instant::now(); let sub_reg = data.sub_region(j..(j + num_bytes as usize)); unsafe { - array.get(ARRAY_LEN * (num_pes - 1), &sub_reg); + let _ = array.get(ARRAY_LEN * (num_pes - 1), &sub_reg); } sub_time += sub_timer.elapsed().as_secs_f64(); sum += num_bytes * 1 as u64; diff --git a/examples/bandwidths/readonly_array_get_unchecked_bw.rs b/examples/bandwidths/readonly_array_get_unchecked_bw.rs index ae052f08..dbc52559 100644 --- a/examples/bandwidths/readonly_array_get_unchecked_bw.rs +++ b/examples/bandwidths/readonly_array_get_unchecked_bw.rs @@ -23,7 +23,7 @@ fn main() { // } } unsafe { - array + let _ = array .local_iter_mut() .for_each(move |elem| *elem = num_pes as u8); } diff --git a/examples/bandwidths/task_group_am_bw.rs b/examples/bandwidths/task_group_am_bw.rs index 04333d8a..41158369 100644 --- a/examples/bandwidths/task_group_am_bw.rs +++ b/examples/bandwidths/task_group_am_bw.rs @@ -57,7 +57,7 @@ fn main() { let sub_timer = Instant::now(); let d = _data.clone(); sub_time += sub_timer.elapsed().as_secs_f64(); - task_group.exec_am_pe(num_pes - 1, DataAM { data: d }); //we explicity captured _data and transfer it even though we do nothing with it + let _ = task_group.exec_am_pe(num_pes - 1, DataAM { data: d }); //we explicity captured _data and transfer it even though we do nothing with it sum += num_bytes * 1 as u64; cnt += 1; diff --git a/examples/bandwidths/unsafe_array_get_bw.rs b/examples/bandwidths/unsafe_array_get_bw.rs index 62a724e3..3de3afa4 100644 --- a/examples/bandwidths/unsafe_array_get_bw.rs +++ b/examples/bandwidths/unsafe_array_get_bw.rs @@ -20,7 +20,7 @@ fn main() { *i = my_pe as u8; } - array + let _ = array .local_iter_mut() .for_each(move |elem| *elem = num_pes as u8); } @@ -57,7 +57,7 @@ fn main() { let sub_reg = data.sub_region(j..(j + num_bytes as usize)); unsafe { - array.get(ARRAY_LEN * (num_pes - 1), &sub_reg); + let _ = array.get(ARRAY_LEN * (num_pes - 1), &sub_reg); } sub_time += sub_timer.elapsed().as_secs_f64(); sum += num_bytes * 1 as u64; diff --git a/examples/bandwidths/unsafe_array_get_unchecked_bw.rs b/examples/bandwidths/unsafe_array_get_unchecked_bw.rs index 
634900ef..7985560b 100644 --- a/examples/bandwidths/unsafe_array_get_unchecked_bw.rs +++ b/examples/bandwidths/unsafe_array_get_unchecked_bw.rs @@ -18,7 +18,7 @@ fn main() { for i in data.as_mut_slice().unwrap() { *i = my_pe as u8; } - array + let _ = array .dist_iter_mut() .for_each(move |elem| *elem = num_pes as u8); } diff --git a/examples/bandwidths/unsafe_array_put_bw.rs b/examples/bandwidths/unsafe_array_put_bw.rs index 14431a4a..3463ab4f 100644 --- a/examples/bandwidths/unsafe_array_put_bw.rs +++ b/examples/bandwidths/unsafe_array_put_bw.rs @@ -52,7 +52,7 @@ fn main() { for j in (0..2_u64.pow(exp) as usize).step_by(num_bytes as usize) { let sub_timer = Instant::now(); let sub_reg = data.sub_region(..num_bytes as usize); - unsafe { array.put(ARRAY_LEN * (num_pes - 1) + j, &sub_reg) }; + let _ = unsafe { array.put(ARRAY_LEN * (num_pes - 1) + j, &sub_reg) }; sub_time += sub_timer.elapsed().as_secs_f64(); sum += num_bytes * 1 as u64; cnt += 1; diff --git a/examples/bandwidths/unsafe_array_store_bw.rs b/examples/bandwidths/unsafe_array_store_bw.rs index 3063eaf0..ca6e84c4 100644 --- a/examples/bandwidths/unsafe_array_store_bw.rs +++ b/examples/bandwidths/unsafe_array_store_bw.rs @@ -19,7 +19,7 @@ fn main() { for i in data.as_mut_slice().unwrap() { *i = my_pe as u8; } - array + let _ = array .dist_iter_mut() .for_each(move |elem| *elem = num_pes as u8); } @@ -56,7 +56,7 @@ fn main() { let sub_reg = data.sub_region(j..(j + num_bytes as usize)); // array.get(ARRAY_LEN * (num_pes - 1), &sub_reg); - unsafe { + let _ = unsafe { array.batch_store(ARRAY_LEN * (num_pes - 1), sub_reg.as_slice().unwrap()) }; sub_time += sub_timer.elapsed().as_secs_f64(); diff --git a/examples/darc_examples/darc.rs b/examples/darc_examples/darc.rs index 75bc18f3..db50e19b 100644 --- a/examples/darc_examples/darc.rs +++ b/examples/darc_examples/darc.rs @@ -104,8 +104,8 @@ fn main() { my_arc: Darc::new(team.clone(), Arc::new(0)).unwrap(), }; println!("here 7"); - team.exec_am_pe(0, darc_am.clone()); - team.exec_am_all(darc_am.clone()); + let _ = team.exec_am_pe(0, darc_am.clone()); + let _ = team.exec_am_all(darc_am.clone()); tg.add_am_pe(0, darc_am.clone()); tg.add_am_all(darc_am); team.block_on(tg.exec()); diff --git a/examples/darc_examples/stress_test.rs b/examples/darc_examples/stress_test.rs index 169918e5..1c2d153c 100644 --- a/examples/darc_examples/stress_test.rs +++ b/examples/darc_examples/stress_test.rs @@ -27,7 +27,7 @@ impl LamellarAM for DataAM { for _i in 0..self.width { let pe = pes.sample(&mut rng); // println!("sending {:?} to {:?}",path,pe); - lamellar::team.exec_am_pe( + let _ = lamellar::team.exec_am_pe( pe, DataAM { darc: self.darc.clone(), @@ -84,7 +84,7 @@ fn main() { let width = 5; for _i in 0..width { let pe = pes.sample(&mut rng) / 2; //since both teams consist of half the number of pes as the world - first_half_team.exec_am_pe( + let _ = first_half_team.exec_am_pe( pe, DataAM { darc: darc.clone(), @@ -93,7 +93,7 @@ fn main() { path: vec![my_pe], }, ); - odd_team.exec_am_pe( + let _ = odd_team.exec_am_pe( pe, DataAM { darc: darc.clone(), diff --git a/examples/hello_world/hello_world_array.rs b/examples/hello_world/hello_world_array.rs index b8257b57..ff81ea29 100644 --- a/examples/hello_world/hello_world_array.rs +++ b/examples/hello_world/hello_world_array.rs @@ -19,7 +19,7 @@ fn main() { //add 1 to each element of array for i in 0..global_length { - array.add(i, 1); + let _ = array.add(i, 1); } //wait for all the local add operations to finish array.wait_all(); diff --git 
a/examples/kernels/dft_proxy.rs b/examples/kernels/dft_proxy.rs index f0357a0a..a54e8411 100644 --- a/examples/kernels/dft_proxy.rs +++ b/examples/kernels/dft_proxy.rs @@ -148,7 +148,7 @@ fn dft_lamellar( let timer = Instant::now(); for pe in 0..num_pes { for k in 0..spectrum_slice.len() { - world.exec_am_local(LocalSumAM { + let _ = world.exec_am_local(LocalSumAM { spectrum: add_spec.clone(), signal: signal.clone(), global_sig_len: global_sig_len, @@ -159,7 +159,7 @@ fn dft_lamellar( let mut add_spec_vec = vec![0.0; spectrum_slice.len()]; world.wait_all(); add_spec_vec.copy_from_slice(unsafe { add_spec.as_slice().unwrap() }); - world.exec_am_pe( + let _ = world.exec_am_pe( pe, RemoteSumAM { spectrum: spectrum.clone(), @@ -335,7 +335,7 @@ fn dft_lamellar_array(signal: UnsafeArray, spectrum: UnsafeArray) -> f let timer = Instant::now(); let signal_clone = signal.clone(); unsafe { - spectrum + let _ = spectrum .dist_iter_mut() .enumerate() .for_each(move |(k, spec_bin)| { @@ -366,7 +366,7 @@ fn dft_lamellar_array(signal: UnsafeArray, spectrum: UnsafeArray) -> f fn dft_lamellar_array_2(signal: ReadOnlyArray, spectrum: AtomicArray) -> f64 { let timer = Instant::now(); let signal_clone = signal.clone(); - spectrum + let _ = spectrum .dist_iter_mut() .enumerate() .for_each(move |(k, spec_bin)| { @@ -396,7 +396,7 @@ fn dft_lamellar_array_swapped(signal: UnsafeArray, spectrum: UnsafeArray::new(&world, m * p, Distribution::Block); //row major //initialize matrices - a.dist_iter_mut() + let _ = a + .dist_iter_mut() .enumerate() .for_each(|(i, x)| *x = i as f32); - b.dist_iter_mut().enumerate().for_each(move |(i, x)| { + let _ = b.dist_iter_mut().enumerate().for_each(move |(i, x)| { //need global index so use dist_iter //identity matrix let row = i / dim; @@ -47,7 +48,7 @@ fn main() { *x = 0 as f32; } }); - c.dist_iter_mut().for_each(|x| x.store(0.0)); + let _ = c.dist_iter_mut().for_each(|x| x.store(0.0)); world.wait_all(); world.barrier(); @@ -67,7 +68,8 @@ fn main() { .for_each(|(j, col)| { let col = col.clone(); let c = c.clone(); - a.local_iter() //LocalIterator (each pe will iterate through only its local data -- in parallel) + let _ = a + .local_iter() //LocalIterator (each pe will iterate through only its local data -- in parallel) .chunks(n) // chunk by the row size .enumerate() .for_each(move |(i, row)| { diff --git a/examples/kernels/parallel_blocked_array_gemm.rs b/examples/kernels/parallel_blocked_array_gemm.rs index f0a30a25..53dc9aa0 100644 --- a/examples/kernels/parallel_blocked_array_gemm.rs +++ b/examples/kernels/parallel_blocked_array_gemm.rs @@ -39,10 +39,11 @@ fn main() { let b = LocalLockArray::::new(&world, n * p, Distribution::Block); //col major let c = AtomicArray::::new(&world, m * p, Distribution::Block); //row major //initialize - a.dist_iter_mut() + let _ = a + .dist_iter_mut() .enumerate() .for_each(|(i, x)| *x = i as f32); - b.dist_iter_mut().enumerate().for_each(move |(i, x)| { + let _ = b.dist_iter_mut().enumerate().for_each(move |(i, x)| { //identity matrix let row = i / dim; let col = i % dim; @@ -52,7 +53,7 @@ fn main() { *x = 0 as f32; } }); - c.dist_iter_mut().for_each(|x| x.store(0.0)); + let _ = c.dist_iter_mut().for_each(|x| x.store(0.0)); world.wait_all(); world.barrier(); let a = a.into_read_only(); @@ -88,7 +89,7 @@ fn main() { let a = a.clone(); let b = b.clone(); let c_clone = c.clone(); - nblks_array.dist_iter().for_each(move |block| { + let _ = nblks_array.dist_iter().for_each(move |block| { //iterate over the submatrix cols of b, use dist_iter() so 
that we can launch transfers in parallel // for j_blk in 0..p_blks { // iterate over submatrix rows of b diff --git a/examples/kernels/serial_array_gemm.rs b/examples/kernels/serial_array_gemm.rs index 301658fe..912a0a46 100644 --- a/examples/kernels/serial_array_gemm.rs +++ b/examples/kernels/serial_array_gemm.rs @@ -28,10 +28,11 @@ fn main() { let c = AtomicArray::::new(&world, m * p, Distribution::Block); //row major //initialize matrices - a.dist_iter_mut() + let _ = a + .dist_iter_mut() .enumerate() .for_each(|(i, x)| *x = i as f32); - b.dist_iter_mut().enumerate().for_each(move |(i, x)| { + let _ = b.dist_iter_mut().enumerate().for_each(move |(i, x)| { //identity matrix let row = i / dim; let col = i % dim; @@ -41,7 +42,7 @@ fn main() { *x = 0 as f32; } }); - c.dist_iter_mut().for_each(|x| x.store(0.0)); + let _ = c.dist_iter_mut().for_each(|x| x.store(0.0)); world.wait_all(); world.barrier(); @@ -73,7 +74,7 @@ fn main() { let b_val = b_c.at(j + k * n); sum += a_val.await * b_val.await; } - c_c.store(j + i * m, sum); // could also do c.add(j+i*m,sum), but each element of c will only be updated once so store is slightly faster + let _ = c_c.store(j + i * m, sum); // could also do c.add(j+i*m,sum), but each element of c will only be updated once so store is slightly faster } } }); diff --git a/examples/rdma_examples/rdma_am.rs b/examples/rdma_examples/rdma_am.rs index bfbd0aac..6f36f619 100644 --- a/examples/rdma_examples/rdma_am.rs +++ b/examples/rdma_examples/rdma_am.rs @@ -144,7 +144,7 @@ fn main() { world.barrier(); let mut index = 0; while index * num_pes < ARRAY_LEN { - world.exec_am_all(RdmaLocalMRAM { + let _ = world.exec_am_all(RdmaLocalMRAM { array: local_array.clone(), orig_pe: my_pe, index: index, diff --git a/examples/team_examples/custom_team_arch.rs b/examples/team_examples/custom_team_arch.rs index d94fb2a9..6879b99e 100644 --- a/examples/team_examples/custom_team_arch.rs +++ b/examples/team_examples/custom_team_arch.rs @@ -134,7 +134,7 @@ fn test_team(world: &LamellarWorld, team: Option>, label: &str 1 }; let timer = Instant::now(); - team.exec_am_all(TeamAM { secs }); //everynode that has a handle can launch on a given team; + let _ = team.exec_am_all(TeamAM { secs }); //everynode that has a handle can launch on a given team; team.wait_all(); //wait until all requests return team.barrier(); // barriers only apply to team members, its a no op for non team members timer.elapsed().as_secs_f64() @@ -168,7 +168,7 @@ fn main() { } world.barrier(); let timer = Instant::now(); - world.exec_am_all(TeamAM { secs: 1 }); + let _ = world.exec_am_all(TeamAM { secs: 1 }); world.wait_all(); world.barrier(); let elapsed = timer.elapsed().as_secs_f64(); @@ -213,6 +213,6 @@ fn main() { 3, //block size (num_pes as f64 / 3.0).ceil() as usize, //num pes in team ); - let blk_stride_team = world.create_team_from_arch(arch); + let blk_stride_team = world.create_team_from_arch(arch); test_team(&world, blk_stride_team, "blk stride team"); } diff --git a/examples/team_examples/random_team.rs b/examples/team_examples/random_team.rs index 0051585f..08481088 100644 --- a/examples/team_examples/random_team.rs +++ b/examples/team_examples/random_team.rs @@ -177,7 +177,7 @@ fn main() { team_pe: t, }; println!("launching {:?} to pe {:?}", d, i); - team.exec_am_pe(i, d); + let _ = team.exec_am_pe(i, d); } let p = rand_arch.team_id(my_pe); diff --git a/examples/team_examples/team_am.rs b/examples/team_examples/team_am.rs index 85a3ec13..5bd3bde5 100644 --- a/examples/team_examples/team_am.rs +++ 
b/examples/team_examples/team_am.rs @@ -38,7 +38,7 @@ fn test_team(world: &LamellarWorld, team: Option>, label: &str 1 }; let timer = Instant::now(); - team.exec_am_all(TeamAM { + let _ = team.exec_am_all(TeamAM { secs: secs, orig_pe: my_pe, }); //every node that has a handle can launch on a given team; @@ -76,7 +76,7 @@ fn main() { } world.barrier(); let timer = Instant::now(); - world.exec_am_all(TeamAM { + let _ = world.exec_am_all(TeamAM { secs: 1, orig_pe: my_pe, }); diff --git a/src/active_messaging.rs b/src/active_messaging.rs index afc8ab6c..0559dc29 100644 --- a/src/active_messaging.rs +++ b/src/active_messaging.rs @@ -1177,13 +1177,7 @@ pub trait ActiveMessaging { #[async_trait] pub(crate) trait ActiveMessageEngine { - async fn process_msg( - self, - am: Am, - scheduler: Arc, - stall_mark: usize, - immediate: bool, - ); + async fn process_msg(self, am: Am, stall_mark: usize, immediate: bool); async fn exec_msg( self, diff --git a/src/active_messaging/batching/simple_batcher.rs b/src/active_messaging/batching/simple_batcher.rs index bfb099c7..37d59640 100644 --- a/src/active_messaging/batching/simple_batcher.rs +++ b/src/active_messaging/batching/simple_batcher.rs @@ -86,7 +86,7 @@ impl Batcher for SimpleBatcher { if size == 0 { //first data in batch, schedule a transfer task let batch_id = batch.batch_id.load(Ordering::SeqCst); - // println!("remote batch_id {batch_id} created {dst:?}"); + // println!("remote batch_id {batch_id} created "); let cur_stall_mark = self.stall_mark.clone(); // println!( // "[{:?}] add_remote_am_to_batch submit task", // std::thread::current().id() // ); @@ -104,7 +104,7 @@ impl Batcher for SimpleBatcher { SimpleBatcher::create_tx_task(batch).await; } } else if size >= MAX_BATCH_SIZE { - // println!("remote size: {:?} {dst:?}",size); + // println!("remote size: {:?} ", size); // println!( // "[{:?}] add_remote_am_to_batch submit imm task", // std::thread::current().id() // ); @@ -236,7 +236,7 @@ impl Batcher for SimpleBatcher { if size == 0 { //first data in batch, schedule a transfer task let batch_id = batch.batch_id.load(Ordering::SeqCst); - // println!("unit batch_id {batch_id} created {dst:?}"); + // println!("unit batch_id {batch_id} created "); let cur_stall_mark = self.stall_mark.clone(); // println!( // "[{:?}] add_unit_am_to_batch submit task", // std::thread::current().id() // ); @@ -254,7 +254,7 @@ impl Batcher for SimpleBatcher { SimpleBatcher::create_tx_task(batch).await; } } else if size >= MAX_BATCH_SIZE { - // println!("unit size: {:?} {dst:?}",size); + // println!("unit size: {:?} ", size); // println!( // "[{:?}] add_unit_am_to_batch submit imm task", // std::thread::current().id() // ); @@ -288,7 +288,9 @@ } Cmd::Data => ame.exec_data_am(&msg, data, &mut i, &ser_data).await, Cmd::Unit => ame.exec_unit_am(&msg, data, &mut i).await, - Cmd::BatchedMsg => panic!("should not receive a batched msg within a batched msg"), + Cmd::BatchedMsg => { + panic!("should not receive a batched msg within a Simple Batcher batched msg") + } } } return_ams @@ -312,6 +314,7 @@ impl SimpleBatcher { #[tracing::instrument(skip_all)] async fn create_tx_task(batch: SimpleBatcherInner) { // println!("[{:?}] create_tx_task", std::thread::current().id()); + async_std::task::yield_now().await; // force this to re-enter the task queue so other requests can hopefully come in before sending the batch let (buf, size) = batch.swap(); if size > 0 { @@ -533,7 +536,10 @@ impl SimpleBatcher { team: team.team.clone(), team_addr: team.team.remote_ptr_addr, }; - // println!("[{:?}] exec_am submit task", 
std::thread::current().id()); + // println!( + // "[{:?}] simple batcher exec_am submit task", + // std::thread::current().id() + // ); let am = match am .exec( team.team.world_pe, diff --git a/src/active_messaging/batching/team_am_batcher.rs b/src/active_messaging/batching/team_am_batcher.rs index 60473bb7..d92d5595 100644 --- a/src/active_messaging/batching/team_am_batcher.rs +++ b/src/active_messaging/batching/team_am_batcher.rs @@ -88,8 +88,9 @@ impl TeamAmBatcherInner { .or_insert_with(|| HashMap::new()); if team_batch.len() == 0 { temp_size += *TEAM_HEADER_LEN; - //println!( - // "adding team header {} {} {}", + // println!( + // "[{:?}] adding team header {} {} {}", + // std::thread::current().id(), // temp_size, // *TEAM_HEADER_LEN, // self.size.load(Ordering::SeqCst) @@ -98,8 +99,9 @@ impl TeamAmBatcherInner { let am_batch = team_batch.entry(id).or_insert_with(|| Vec::new()); if am_batch.len() == 0 { temp_size += *BATCHED_AM_HEADER_LEN; - //println!( - // "adding batched header {} {} {}", + // println!( + // "[{:?}] adding batched header {} {} {}", + // std::thread::current().id(), // temp_size, // *BATCHED_AM_HEADER_LEN, // self.size.load(Ordering::SeqCst) @@ -107,8 +109,9 @@ impl TeamAmBatcherInner { } am_batch.push((req_data, am, size)); temp_size += size + *REQ_ID_LEN; - //println!( - // "adding req_id + size header {} {} {} {}", + // println!( + // "[{:?}] adding req_id + size header {} {} {} {}", + // std::thread::current().id(), // temp_size, // *REQ_ID_LEN, // size, @@ -184,6 +187,7 @@ impl Batcher for TeamAmBatcher { am_size: usize, mut stall_mark: usize, ) { + // println!("[{:?}] add_remote_am_to_batch", std::thread::current().id()); let batch = match req_data.dst { Some(dst) => self.batched_ams[dst].clone(), None => self.batched_ams.last().unwrap().clone(), @@ -195,7 +199,10 @@ impl Batcher for TeamAmBatcher { if size == 0 { //first data in batch, schedule a transfer task let batch_id = batch.batch_id.load(Ordering::SeqCst); - // println!("remote batch_id {batch_id} created"); + // println!( + // "[{:?}] remote batch_id {batch_id} created", + // std::thread::current().id() + // ); let cur_stall_mark = self.stall_mark.clone(); while stall_mark != cur_stall_mark.load(Ordering::SeqCst) && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE @@ -216,7 +223,11 @@ impl Batcher for TeamAmBatcher { } } else if size >= MAX_BATCH_SIZE { //batch is full, transfer now - // println!("remote size: {:?}",size); + // println!( + // "[{:?}] remote size: {:?}", + // std::thread::current().id(), + // size + // ); TeamAmBatcher::create_tx_task( batch, req_data.lamellae.clone(), @@ -236,6 +247,7 @@ impl Batcher for TeamAmBatcher { am_size: usize, mut stall_mark: usize, ) { + // println!("[{:?}] add_return_am_to_batch", std::thread::current().id(),); let batch = match req_data.dst { Some(dst) => self.batched_ams[dst].clone(), None => self.batched_ams.last().unwrap().clone(), @@ -247,7 +259,10 @@ impl Batcher for TeamAmBatcher { if size == 0 { //first data in batch, schedule a transfer task let batch_id = batch.batch_id.load(Ordering::SeqCst); - // println!("return batch_id {batch_id} created"); + // println!( + // "[{:?}] return batch_id {batch_id} created", + // std::thread::current().id() + // ); let cur_stall_mark = self.stall_mark.clone(); while stall_mark != cur_stall_mark.load(Ordering::SeqCst) && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE @@ -268,7 +283,11 @@ impl Batcher for TeamAmBatcher { } } else if size >= MAX_BATCH_SIZE { //batch is full, transfer now - // println!("return 
size: {:?}",size); + // println!( + // "[{:?}] return size: {:?}", + // std::thread::current().id(), + // size + // ); TeamAmBatcher::create_tx_task( batch, @@ -288,6 +307,7 @@ impl Batcher for TeamAmBatcher { data_size: usize, mut stall_mark: usize, ) { + // println!("[{:?}] add_data_am_to_batch", std::thread::current().id(),); let batch = match req_data.dst { Some(dst) => self.batched_ams[dst].clone(), None => self.batched_ams.last().unwrap().clone(), @@ -306,7 +326,10 @@ impl Batcher for TeamAmBatcher { if size == 0 { //first data in batch, schedule a transfer task let batch_id = batch.batch_id.load(Ordering::SeqCst); - // println!("data batch_id {batch_id} created"); + // println!( + // "[{:?}] data batch_id {batch_id} created", + // std::thread::current().id() + // ); let cur_stall_mark = self.stall_mark.clone(); while stall_mark != cur_stall_mark.load(Ordering::SeqCst) && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE @@ -327,7 +350,7 @@ impl Batcher for TeamAmBatcher { } } else if size >= MAX_BATCH_SIZE { //batch is full, transfer now - // println!("data size: {:?}",size); + // println!("[{:?}] data size: {:?}", std::thread::current().id(), size); TeamAmBatcher::create_tx_task( batch, req_data.lamellae.clone(), @@ -340,6 +363,7 @@ impl Batcher for TeamAmBatcher { #[tracing::instrument(skip_all)] async fn add_unit_am_to_batch(&self, req_data: ReqMetaData, mut stall_mark: usize) { + // println!("[{:?}] add_unit_am_to_batch", std::thread::current().id(),); let batch = match req_data.dst { Some(dst) => self.batched_ams[dst].clone(), None => self.batched_ams.last().unwrap().clone(), @@ -351,7 +375,10 @@ impl Batcher for TeamAmBatcher { if size == 0 { //first data in batch, schedule a transfer task let batch_id = batch.batch_id.load(Ordering::SeqCst); - // println!("unit batch_id {batch_id} created"); + // println!( + // "[{:?}] unit batch_id {batch_id} created", + // std::thread::current().id() + // ); let cur_stall_mark = self.stall_mark.clone(); while stall_mark != cur_stall_mark.load(Ordering::SeqCst) && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE @@ -372,7 +399,7 @@ impl Batcher for TeamAmBatcher { } } else if size >= MAX_BATCH_SIZE { //batch is full, transfer now - // println!("unit size: {:?}",size); + // println!("[{:?}] unit size: {:?}", std::thread::current().id(), size); TeamAmBatcher::create_tx_task( batch, req_data.lamellae.clone(), @@ -391,15 +418,18 @@ impl Batcher for TeamAmBatcher { lamellae: Arc, ame: &RegisteredActiveMessages, ) -> Vec { + // println!("[{:?}] exec_batched_msg", std::thread::current().id()); let data = ser_data.data_as_bytes(); let mut i = 0; // println!("i: {:?} dl {:?} cl {:?}", i, data.len(), *CMD_LEN); + let mut return_ams = Vec::new(); while i < data.len() { // println!("\ti: {:?} dl {:?} cl {:?}", i, data.len(), *CMD_LEN); let batch: BatchHeader = crate::deserialize(&data[i..i + *BATCH_HEADER_LEN], false).unwrap(); // println!("batch {:?} i: {} len: {}", batch, i, data.len()); i += *BATCH_HEADER_LEN; + // println!("[{:?}] cmd {:?}", std::thread::current().id(), batch.cmd); match batch.cmd { Cmd::Am | Cmd::ReturnAm => { panic!("should not encounter individual am cmds in TeamAmBatcher") @@ -407,12 +437,15 @@ impl Batcher for TeamAmBatcher { Cmd::Data => ame.exec_data_am(&msg, data, &mut i, &ser_data).await, Cmd::Unit => ame.exec_unit_am(&msg, data, &mut i).await, Cmd::BatchedMsg => { - self.exec_batched_am(&msg, batch.cnt, data, &mut i, &lamellae, &ame) - .await; + return_ams.append( + &mut self + .exec_batched_am(&msg, batch.cnt, data, 
&mut i, &lamellae, &ame) + .await, + ); } } } - Vec::new() + return_ams } } @@ -436,6 +469,8 @@ impl TeamAmBatcher { arch: Arc, my_pe: usize, ) { + // println!("[{:?}] create_tx_task", std::thread::current().id()); + async_std::task::yield_now().await; // force this to renter the task queue so other requests can hopefully come in before sending the batch let (am_batch, return_am_batch, non_am_batch, mut size) = batch.swap(); if size > 0 { if am_batch.len() > 0 { @@ -448,7 +483,11 @@ impl TeamAmBatcher { let data_buf = TeamAmBatcher::create_data_buf(header, size, &lamellae).await; let data_slice = data_buf.data_as_bytes(); - //println!("total batch size: {}", size); + // println!( + // "[{:?}] total batch size: {}", + // std::thread::current().id(), + // size + // ); let mut i = 0; TeamAmBatcher::serialize_am_batch(am_batch, data_slice, &mut i, Cmd::Am); TeamAmBatcher::serialize_am_batch(return_am_batch, data_slice, &mut i, Cmd::ReturnAm); @@ -683,6 +722,7 @@ impl TeamAmBatcher { ame: &RegisteredActiveMessages, ) -> Vec { let mut return_ams = Vec::new(); + // println!("exec_batched_am batch_cnt: {}", batch_cnt); for _team in 0..batch_cnt { let team_header: TeamHeader = crate::deserialize(&data[*i..*i + *TEAM_HEADER_LEN], false).unwrap(); @@ -697,7 +737,11 @@ impl TeamAmBatcher { // println!("batched am header: {:?}", batched_am_header); *i += *BATCHED_AM_HEADER_LEN; for _am in 0..batched_am_header.am_cnt { - // println!("am cmd: {:?}", batched_am_header.cmd); + // println!( + // "[{:?}] am cmd: {:?}", + // std::thread::current().id(), + // batched_am_header.cmd + // ); match batched_am_header.cmd { Cmd::Am => return_ams.push( self.exec_am( @@ -729,6 +773,11 @@ impl TeamAmBatcher { } } } + // println!( + // "[{:?}] return_ams: {:?}", + // std::thread::current().id(), + // return_ams + // ); return_ams } @@ -747,6 +796,7 @@ impl TeamAmBatcher { *i += *REQ_ID_LEN; let am = AMS_EXECS.get(&am_id).unwrap()(&data[*i..], team.team.team_pe); *i += am.serialized_size(); + // println!("Team Batcher exec am"); let req_data = ReqMetaData { src: team.team.world_pe, @@ -791,6 +841,7 @@ impl TeamAmBatcher { world: Arc, team: Arc, ) { + // println!("[{:?}] exec_return_am", std::thread::current().id()); let req_id = crate::deserialize(&data[*i..*i + *REQ_ID_LEN], false).unwrap(); *i += *REQ_ID_LEN; let am = AMS_EXECS.get(&am_id).unwrap()(&data[*i..], team.team.team_pe); diff --git a/src/active_messaging/registered_active_message.rs b/src/active_messaging/registered_active_message.rs index fce885ec..63378b14 100644 --- a/src/active_messaging/registered_active_message.rs +++ b/src/active_messaging/registered_active_message.rs @@ -96,16 +96,10 @@ pub(crate) struct UnitHeader { } #[async_trait] -impl ActiveMessageEngine for Arc { +impl ActiveMessageEngine for RegisteredActiveMessages { #[tracing::instrument(skip_all)] - async fn process_msg( - self, - am: Am, - executor: Arc, - stall_mark: usize, - immediate: bool, - ) { - // println!("[{:?}] {am:?}", std::thread::current().id()); + async fn process_msg(self, am: Am, stall_mark: usize, immediate: bool) { + // println!("[{:?}] process_msg {am:?}", std::thread::current().id()); match am { Am::All(req_data, am) => { // println!("{:?}",am.get_id()); @@ -127,6 +121,12 @@ impl ActiveMessageEngine for Arc { ) .await; } else { + // println!( + // "[{:?}] {:?} all {:?}", + // std::thread::current().id(), + // am_id, + // am_size + // ); self.send_am(req_data.clone(), am.clone(), am_id, am_size, Cmd::Am) .await; } @@ -134,8 +134,7 @@ impl ActiveMessageEngine for Arc { let 
world = LamellarTeam::new(None, req_data.world.clone(), true); let team = LamellarTeam::new(Some(world.clone()), req_data.team.clone(), true); if req_data.team.arch.team_pe(req_data.src).is_ok() { - self.clone() - .exec_local_am(req_data, am.as_local(), world, team) + self.exec_local_am(req_data, am.as_local(), world, team) .await; } } @@ -143,8 +142,7 @@ impl ActiveMessageEngine for Arc { if req_data.dst == Some(req_data.src) { let world = LamellarTeam::new(None, req_data.world.clone(), true); let team = LamellarTeam::new(Some(world.clone()), req_data.team.clone(), true); - self.clone() - .exec_local_am(req_data, am.as_local(), world, team) + self.exec_local_am(req_data, am.as_local(), world, team) .await; } else { let am_id = *(AMS_IDS.get(&am.get_id()).unwrap()); @@ -154,6 +152,12 @@ impl ActiveMessageEngine for Arc { .add_remote_am_to_batch(req_data, am, am_id, am_size, stall_mark) .await; } else { + // println!( + // "[{:?}] {:?} pe {:?}", + // std::thread::current().id(), + // am_id, + // am_size + // ); self.send_am(req_data, am, am_id, am_size, Cmd::Am).await; } } @@ -161,7 +165,7 @@ impl ActiveMessageEngine for Arc { Am::Local(req_data, am) => { let world = LamellarTeam::new(None, req_data.world.clone(), true); let team = LamellarTeam::new(Some(world.clone()), req_data.team.clone(), true); - self.clone().exec_local_am(req_data, am, world, team).await; + self.exec_local_am(req_data, am, world, team).await; } Am::Return(req_data, am) => { // println!("Am::Return"); @@ -172,6 +176,12 @@ impl ActiveMessageEngine for Arc { .add_return_am_to_batch(req_data, am, am_id, am_size, stall_mark) .await; } else { + // println!( + // "[{:?}] {:?} return {:?}", + // std::thread::current().id(), + // am_id, + // am_size + // ); self.send_am(req_data, am, am_id, am_size, Cmd::ReturnAm) .await; } @@ -184,6 +194,7 @@ impl ActiveMessageEngine for Arc { .add_data_am_to_batch(req_data, data, data_size, stall_mark) .await; } else { + // println!("[{:?}] data {:?}", std::thread::current().id(), data_size); self.send_data_am(req_data, data, data_size).await; } } @@ -193,6 +204,11 @@ impl ActiveMessageEngine for Arc { .add_unit_am_to_batch(req_data, stall_mark) .await; } else { + // println!( + // "[{:?}] unit {:?}", + // std::thread::current().id(), + // *UNIT_HEADER_LEN + // ); self.send_unit_am(req_data).await; } } @@ -207,13 +223,13 @@ impl ActiveMessageEngine for Arc { lamellae: Arc, executor: Arc, ) { - // println!("exec_msg"); + // println!("[{:?}] exec_msg {:?}", std::thread::current().id(), msg.cmd); let data = ser_data.data_as_bytes(); let mut i = 0; match msg.cmd { Cmd::Am => { let return_am = self.exec_am(&msg, data, &mut i, &lamellae).await; - let process_task = self.process_msg(return_am, executor.clone(), 0, false); + let process_task = self.process_msg(return_am, 0, false); executor.submit_task(process_task); } Cmd::ReturnAm => { @@ -232,7 +248,7 @@ impl ActiveMessageEngine for Arc { .await; let am_tasks = futures::stream::FuturesUnordered::new(); for am in ams.into_iter() { - am_tasks.push(self.clone().process_msg(am, executor.clone(), 0, false)); + am_tasks.push(self.clone().process_msg(am, 0, false)); } executor.submit_task(futures::future::join_all(am_tasks)); } @@ -248,7 +264,7 @@ impl RegisteredActiveMessages { #[tracing::instrument(skip_all)] async fn send_am( - self: &Arc, + &self, req_data: ReqMetaData, am: LamellarArcAm, am_id: AmId, @@ -294,12 +310,7 @@ impl RegisteredActiveMessages { } #[tracing::instrument(skip_all)] - async fn send_data_am( - self: &Arc, - req_data: 
ReqMetaData, - data: LamellarResultArc, - data_size: usize, - ) { + async fn send_data_am(&self, req_data: ReqMetaData, data: LamellarResultArc, data_size: usize) { // println!("send_data_am"); let header = self.create_header(&req_data, Cmd::Data); let mut darcs = vec![]; @@ -334,7 +345,7 @@ impl RegisteredActiveMessages { } #[tracing::instrument(skip_all)] - async fn send_unit_am(self: &Arc, req_data: ReqMetaData) { + async fn send_unit_am(&self, req_data: ReqMetaData) { // println!("send_unit_am"); let header = self.create_header(&req_data, Cmd::Unit); @@ -354,7 +365,7 @@ impl RegisteredActiveMessages { } #[tracing::instrument(skip_all)] - fn create_header(self: &Arc, req_data: &ReqMetaData, cmd: Cmd) -> SerializeHeader { + fn create_header(&self, req_data: &ReqMetaData, cmd: Cmd) -> SerializeHeader { let msg = Msg { src: req_data.team.world_pe as u16, cmd: cmd, @@ -364,7 +375,7 @@ impl RegisteredActiveMessages { #[tracing::instrument(skip_all)] async fn create_data_buf( - self: &Arc, + &self, header: SerializeHeader, size: usize, lamellae: &Arc, @@ -389,7 +400,7 @@ impl RegisteredActiveMessages { #[async_recursion] #[tracing::instrument(skip_all)] pub(crate) async fn exec_local_am( - self: Arc, + &self, req_data: ReqMetaData, am: LamellarArcLocalAm, world: Arc, @@ -407,7 +418,7 @@ impl RegisteredActiveMessages { .await { LamellarReturn::LocalData(data) => { - // println!("local am data return"); + // println!("[{:?}] local am data return", std::thread::current().id()); self.send_data_to_user_handle( req_data.id, req_data.src, @@ -415,13 +426,12 @@ impl RegisteredActiveMessages { ); } LamellarReturn::LocalAm(am) => { - // println!("local am am return"); - self.clone() - .exec_local_am(req_data, am.as_local(), world, team) + // println!("[{:?}] local am am return", std::thread::current().id()); + self.exec_local_am(req_data, am.as_local(), world, team) .await; } LamellarReturn::Unit => { - // println!("local am unit return"); + // println!("[{:?}] local am unit return", std::thread::current().id()); self.send_data_to_user_handle(req_data.id, req_data.src, InternalResult::Unit); } LamellarReturn::RemoteData(_) | LamellarReturn::RemoteAm(_) => { @@ -432,7 +442,7 @@ impl RegisteredActiveMessages { #[tracing::instrument(skip_all)] pub(crate) async fn exec_am( - self: &Arc, + &self, msg: &Msg, data: &[u8], i: &mut usize, @@ -481,7 +491,7 @@ impl RegisteredActiveMessages { #[tracing::instrument(skip_all)] pub(crate) async fn exec_return_am( - self: &Arc, + &self, msg: &Msg, data: &[u8], i: &mut usize, @@ -505,14 +515,13 @@ impl RegisteredActiveMessages { team: team.team.clone(), team_addr: team.team.remote_ptr_addr, }; - self.clone() - .exec_local_am(req_data, am.as_local(), world, team) + self.exec_local_am(req_data, am.as_local(), world, team) .await; } #[tracing::instrument(skip_all)] pub(crate) async fn exec_data_am( - self: &Arc, + &self, msg: &Msg, data_buf: &[u8], i: &mut usize, @@ -538,7 +547,7 @@ impl RegisteredActiveMessages { } #[tracing::instrument(skip_all)] - pub(crate) async fn exec_unit_am(self: &Arc, msg: &Msg, data: &[u8], i: &mut usize) { + pub(crate) async fn exec_unit_am(&self, msg: &Msg, data: &[u8], i: &mut usize) { // println!("exec_unit_am"); let unit_header: UnitHeader = crate::deserialize(&data[*i..*i + *UNIT_HEADER_LEN], false).unwrap(); diff --git a/src/array.rs b/src/array.rs index 386a2aaa..7d1809fe 100644 --- a/src/array.rs +++ b/src/array.rs @@ -428,6 +428,12 @@ impl TeamFrom<(&Vec, Distribution)> for Vec { } } +impl TeamFrom<(Vec, Distribution)> for Vec { + fn 
team_from(vals: (Vec, Distribution), _team: &Pin>) -> Self { + vals.0.to_vec() + } +} + impl TeamTryFrom<&T> for LamellarArrayRdmaInput { fn team_try_from(val: &T, team: &Pin>) -> Result { Ok(LamellarArrayRdmaInput::team_from(val, team)) @@ -503,6 +509,20 @@ impl TeamTryFrom<(&Vec, Distribution)> for Vec { } } +// #[async_trait] +// impl AsyncTeamFrom<(&Vec, Distribution)> for Vec { +// async fn team_from(vals: (&Vec, Distribution), _team: &Pin>) -> Self { +// vals.0.to_vec() +// } +// } + +#[async_trait] +impl AsyncTeamFrom<(Vec, Distribution)> for Vec { + async fn team_from(input: (Vec, Distribution), _team: &Pin>) -> Self { + input.0 + } +} + #[async_trait] /// Provides the same abstraction as the `From` trait in the standard language, but with a `team` parameter so that lamellar memory regions can be allocated /// and to be used within an async context @@ -557,6 +577,7 @@ pub trait TeamFrom { /// Provides the same abstraction as the `From` trait in the standard language, but with a `team` parameter so that lamellar memory regions can be allocated /// and to be used within an async context pub trait AsyncTeamFrom: TeamFrom { + /// Converts to this type from the input type async fn team_from(val: T, team: &Pin>) -> Self; } diff --git a/src/array/iterator/distributed_iterator.rs b/src/array/iterator/distributed_iterator.rs index d41fdf1f..9a56e3e6 100644 --- a/src/array/iterator/distributed_iterator.rs +++ b/src/array/iterator/distributed_iterator.rs @@ -36,19 +36,16 @@ use take::*; pub(crate) use consumer::*; -use crate::array::iterator::one_sided_iterator::OneSidedIterator; -use crate::array::iterator::{IterRequest, Schedule}; +use crate::array::iterator::Schedule; use crate::array::{ operations::ArrayOps, AsyncTeamFrom, AtomicArray, Distribution, GenericAtomicArray, - LamellarArray, LamellarArrayPut, NativeAtomicArray, TeamFrom, UnsafeArray, + LamellarArray, NativeAtomicArray, }; -use crate::lamellar_request::LamellarRequest; use crate::memregion::Dist; use crate::LamellarTeamRT; use crate::active_messaging::SyncSend; -use async_trait::async_trait; use enum_dispatch::enum_dispatch; use futures::Future; use std::marker::PhantomData; diff --git a/src/array/iterator/distributed_iterator/consumer/collect.rs b/src/array/iterator/distributed_iterator/consumer/collect.rs index 31486893..b97d62a0 100644 --- a/src/array/iterator/distributed_iterator/consumer/collect.rs +++ b/src/array/iterator/distributed_iterator/consumer/collect.rs @@ -3,7 +3,7 @@ use crate::array::iterator::consumer::*; use crate::array::iterator::distributed_iterator::{DistributedIterator, Monotonic}; use crate::array::iterator::IterRequest; use crate::array::operations::ArrayOps; -use crate::array::{AsyncTeamFrom, AsyncTeamInto, Distribution, TeamFrom, TeamInto}; +use crate::array::{AsyncTeamFrom, AsyncTeamInto, Distribution, TeamInto}; use crate::lamellar_request::LamellarRequest; use crate::lamellar_team::LamellarTeamRT; use crate::memregion::Dist; diff --git a/src/array/iterator/local_iterator/consumer/collect.rs b/src/array/iterator/local_iterator/consumer/collect.rs index df16f948..72a5068f 100644 --- a/src/array/iterator/local_iterator/consumer/collect.rs +++ b/src/array/iterator/local_iterator/consumer/collect.rs @@ -3,7 +3,7 @@ use crate::array::iterator::consumer::*; use crate::array::iterator::local_iterator::{LocalIterator, Monotonic}; use crate::array::iterator::IterRequest; use crate::array::operations::ArrayOps; -use crate::array::{AsyncTeamFrom, AsyncTeamInto, Distribution, TeamFrom, TeamInto}; +use 
crate::array::{AsyncTeamFrom, AsyncTeamInto, Distribution, TeamInto}; use crate::lamellar_request::LamellarRequest; use crate::lamellar_team::LamellarTeamRT; use crate::memregion::Dist; diff --git a/src/array/unsafe.rs b/src/array/unsafe.rs index 2369c5e2..d818e467 100644 --- a/src/array/unsafe.rs +++ b/src/array/unsafe.rs @@ -272,76 +272,6 @@ impl UnsafeArray { // println!("after buffered ops"); // array.inner.data.print(); } - - async fn async_new>( - team: U, - array_size: usize, - distribution: Distribution, - ) -> UnsafeArray { - let team = team.into().team.clone(); - team.async_barrier().await; - let task_group = LamellarTaskGroup::new(team.clone()); - let my_pe = team.team_pe_id().unwrap(); - let num_pes = team.num_pes(); - let full_array_size = std::cmp::max(array_size, num_pes); - - let elem_per_pe = full_array_size as f64 / num_pes as f64; - let per_pe_size = (full_array_size as f64 / num_pes as f64).ceil() as usize; //we do ceil to ensure enough space an each pe - // println!("new unsafe array {:?} {:?} {:?}", elem_per_pe, num_elems_local, per_pe_size); - let rmr = MemoryRegion::new( - per_pe_size * std::mem::size_of::(), - team.lamellae.clone(), - AllocationType::Global, - ); - unsafe { - for elem in rmr.as_mut_slice().expect("data should exist on pe") { - *elem = 0; - } - } - - let data = Darc::try_new_with_drop( - team.clone(), - UnsafeArrayData { - mem_region: rmr, - array_counters: Arc::new(AMCounters::new()), - team: team.clone(), - task_group: Arc::new(task_group), - my_pe: my_pe, - num_pes: num_pes, - req_cnt: Arc::new(AtomicUsize::new(0)), - }, - crate::darc::DarcMode::UnsafeArray, - None, - ) - .expect("trying to create array on non team member"); - let array = UnsafeArray { - inner: UnsafeArrayInner { - data: data, - distribution: distribution.clone(), - // wait: wait, - orig_elem_per_pe: elem_per_pe, - elem_size: std::mem::size_of::(), - offset: 0, //relative to size of T - size: full_array_size, //relative to size of T - }, - phantom: PhantomData, - }; - // println!("new unsafe"); - // unsafe {println!("size {:?} bytes {:?}",array.inner.size, array.inner.data.mem_region.as_mut_slice().unwrap().len())}; - // println!("elem per pe {:?}", elem_per_pe); - // for i in 0..num_pes{ - // println!("pe: {:?} {:?}",i,array.inner.num_elems_pe(i)); - // } - // array.inner.data.print(); - if full_array_size != array_size { - println!("WARNING: Array size {array_size} is less than number of pes {full_array_size}, each PE will not contain data"); - array.sub_array(0..array_size) - } else { - array - } - // println!("after buffered ops"); - // array.inner.data.print(); - } } impl UnsafeArray { #[doc(alias("One-sided", "onesided"))] diff --git a/src/array/unsafe/iteration/distributed.rs b/src/array/unsafe/iteration/distributed.rs index 4a9668a0..ac562be4 100644 --- a/src/array/unsafe/iteration/distributed.rs +++ b/src/array/unsafe/iteration/distributed.rs @@ -1,7 +1,7 @@ use crate::active_messaging::SyncSend; use crate::array::iterator::distributed_iterator::*; use crate::array::r#unsafe::UnsafeArray; -use crate::array::{ArrayOps, AsyncTeamFrom, AsyncTeamInto, Distribution, LamellarArray, TeamFrom}; +use crate::array::{ArrayOps, AsyncTeamFrom, Distribution, LamellarArray}; use crate::array::iterator::Schedule; use crate::lamellar_team::LamellarTeamRT; diff --git a/src/array/unsafe/operations.rs b/src/array/unsafe/operations.rs index 80d27ed4..15970336 100644 --- a/src/array/unsafe/operations.rs +++ b/src/array/unsafe/operations.rs @@ -395,7 +395,6 @@ impl UnsafeArray { let index_vec 
= index.to_vec(); let the_array: UnsafeArray = self.clone(); // println!("num_reqs {:?}",num_reqs); - let the_array: UnsafeArray = self.clone(); self.inner .data .team diff --git a/src/darc.rs b/src/darc.rs index 9f715137..36f7acf7 100644 --- a/src/darc.rs +++ b/src/darc.rs @@ -1263,46 +1263,6 @@ impl LamellarAM for DroppedWaitAM { let block_on_fut = { DarcInner::block_on_outstanding(wrapped.clone(), DarcMode::Dropped, 0) }; block_on_fut.await; - // wrapped.wait_all(); - // // let inner = unsafe {&*wrapped.inner}; //we dont actually care about the "type" we wrap here, we just need access to the meta data for the darc (but still allow async wait cause T is not send) - // while wrapped.dist_cnt.load(Ordering::SeqCst) != 0 - // || wrapped.local_cnt.load(Ordering::SeqCst) != 0 - // { - // if wrapped.local_cnt.load(Ordering::SeqCst) == 0 { - // wrapped.send_finished(); - // } - // if timeout.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { - // let ref_cnts_slice = std::slice::from_raw_parts_mut( - // wrapped.ref_cnt_addr as *mut usize, - // wrapped.num_pes, - // ); - - // println!("[WARNING] - Potential deadlock detected when trying to free distributed object.\n\ - // The runtime is currently waiting for all remaining references to this distributed object to be dropped.\n\ - // The current status of the object on each pe is {:?} with {:?} remaining local references and {:?} remaining remote references, ref cnts by pe {ref_cnts_slice:?}\n\ - // the deadlock timeout can be set via the LAMELLAR_DEADLOCK_TIMEOUT environment variable, the current timeout is {} seconds\n\ - // To view backtrace set RUST_LIB_BACKTRACE=1\n\ - // {}", - // mode_refs, - // wrapped.local_cnt.load(Ordering::SeqCst), - // wrapped.dist_cnt.load(Ordering::SeqCst), - // *crate::DEADLOCK_TIMEOUT, - // std::backtrace::Backtrace::capture() - // ); - // timeout = std::time::Instant::now(); - // } - // async_std::task::yield_now().await; - // } - // // let team = wrapped.team(); - // let rdma = &self.team.lamellae; - // for pe in self.team.arch.team_iter() { - // // println!("putting {:?} to {:?} @ {:x}",&mode_refs[self.my_pe..=self.my_pe],pe,self.mode_addr + self.my_pe * std::mem::size_of::()); - // rdma.put( - // pe, - // &mode_refs_u8[self.my_pe..=self.my_pe], - // self.mode_addr + self.my_pe * std::mem::size_of::(), - // ); - // } // println!( // "[{:?}] past block_on_outstanding {:x}", diff --git a/src/lamellar_request.rs b/src/lamellar_request.rs index 81a6b317..8e23df41 100755 --- a/src/lamellar_request.rs +++ b/src/lamellar_request.rs @@ -82,6 +82,7 @@ impl LamellarRequestResult { } self.req.update_counters(); + added } } @@ -133,7 +134,8 @@ impl LamellarRequestAddResult for LamellarRequestHandleInner { let _team_reqs = self.team_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); let _world_req = self.world_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); // println!( - // "update counter team {} world {}", + // "[{:?}] update counter team {} world {}", + // std::thread::current().id(), // _team_reqs - 1, // _world_req - 1 // ); @@ -255,7 +257,12 @@ impl LamellarRequestAddResult for LamellarMultiRequestHandleInner { // ); let _team_reqs = self.team_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); let _world_req = self.world_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); - // println!("update counter team {} world {}",_team_reqs-1,_world_req-1); + // println!( + // "[{:?}] multi update counter team {} world {}", + // std::thread::current().id(), + // _team_reqs - 1, + // _world_req - 1 + // ); if let 
Some(tg_outstanding_reqs) = self.tg_outstanding_reqs.clone() { tg_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); } @@ -395,7 +402,12 @@ impl LamellarRequestAddResult for LamellarLocalRequestHandleInner { fn update_counters(&self) { let _team_reqs = self.team_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); let _world_req = self.world_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); - // println!("update counter team {} world {}",_team_reqs-1,_world_req-1); + // println!( + // "[{:?}] local update counter team {} world {}", + // std::thread::current().id(), + // _team_reqs - 1, + // _world_req - 1 + // ); if let Some(tg_outstanding_reqs) = self.tg_outstanding_reqs.clone() { tg_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); } diff --git a/src/lamellar_world.rs b/src/lamellar_world.rs index f3d7726d..4c00646a 100644 --- a/src/lamellar_world.rs +++ b/src/lamellar_world.rs @@ -370,27 +370,38 @@ impl LamellarWorldBuilder { pub fn new() -> LamellarWorldBuilder { // simple_logger::init().unwrap(); // trace!("New world builder"); - let mut executor = match std::env::var("LAMELLAR_EXECUTOR") { + let executor = match std::env::var("LAMELLAR_EXECUTOR") { Ok(val) => { let executor = val.parse::().unwrap(); if executor == 0 { ExecutorType::LamellarWorkStealing + } else if executor == 1 { + #[cfg(feature = "tokio-executor")] + { + ExecutorType::Tokio + } + #[cfg(not(feature = "tokio-executor"))] + { + println!("[LAMELLAR WARNING]: tokio-executor selected but it is not enabled, defaulting to lamellar work stealing executor"); + ExecutorType::LamellarWorkStealing + } + } else { + println!("[LAMELLAR WARNING]: invalid executor selected, defaulting to lamellar work stealing executor"); + ExecutorType::LamellarWorkStealing } - else if scheduler == 1 { - // ExecutorType::NumaWorkStealing - // } else if scheduler == 2 { - // ExecutorType::NumaWorkStealing2 - // } - else { + } + Err(_) => { + #[cfg(feature = "tokio-executor")] + { + ExecutorType::Tokio + } + #[cfg(not(feature = "tokio-executor"))] + { ExecutorType::LamellarWorkStealing } } - Err(_) => ExecutorType::LamellarWorkStealing, }; - #[cfg(feature = "tokio-executor")] - { - executor = ExecutorType::Tokio; - } + println!("executor: {:?}", executor); let num_threads = match std::env::var("LAMELLAR_THREADS") { Ok(n) => { diff --git a/src/scheduler.rs b/src/scheduler.rs index 0ba82e24..72e422d5 100755 --- a/src/scheduler.rs +++ b/src/scheduler.rs @@ -51,10 +51,16 @@ pub(crate) struct ReqId { pub(crate) sub_id: usize, } +/// Indicates the executor backend +/// Default is a work stealing executor +/// If the "tokio-executor" feature is enabled, the tokio executor can also be used +/// allowing seamless integration with tokio based applications #[derive(Debug)] pub enum ExecutorType { + /// The default work stealing executor LamellarWorkStealing, #[cfg(feature = "tokio-executor")] + /// The tokio executor Tokio, // Dyn(impl LamellarExecutor), } @@ -132,13 +138,18 @@ impl Scheduler { let max_ams = self.max_ams.clone(); let am_stall_mark = self.am_stall_mark.fetch_add(1, Ordering::Relaxed); let ame = self.active_message_engine.clone(); - let executor = self.executor.clone(); let am_future = async move { num_ams.fetch_add(1, Ordering::Relaxed); - max_ams.fetch_add(1, Ordering::Relaxed); + let _am_id = max_ams.fetch_add(1, Ordering::Relaxed); // println!("[{:?}] submit work exec req {:?} {:?} TaskId: {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task); - ame.process_msg(am, executor, am_stall_mark, 
false).await; + // println!("[{:?}] submit_am {:?}", std::thread::current().id(), am_id); + ame.process_msg(am, am_stall_mark, false).await; num_ams.fetch_sub(1, Ordering::Relaxed); + // println!( + // "[{:?}] submit_am_done {:?}", + // std::thread::current().id(), + // am_id + // ); // println!("[{:?}] submit work done req {:?} {:?} TaskId: {:?} {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task,reqs); }; self.executor.submit_task(am_future); @@ -150,13 +161,22 @@ impl Scheduler { let max_ams = self.max_ams.clone(); let am_stall_mark = self.am_stall_mark.fetch_add(1, Ordering::Relaxed); let ame = self.active_message_engine.clone(); - let executor = self.executor.clone(); let am_future = async move { num_ams.fetch_add(1, Ordering::Relaxed); - max_ams.fetch_add(1, Ordering::Relaxed); + let _am_id = max_ams.fetch_add(1, Ordering::Relaxed); // println!("[{:?}] submit work exec req {:?} {:?} TaskId: {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task); - ame.process_msg(am, executor, am_stall_mark, false).await; + // println!( + // "[{:?}] submit_am_immediate {:?}", + // std::thread::current().id(), + // am_id + // ); + ame.process_msg(am, am_stall_mark, false).await; num_ams.fetch_sub(1, Ordering::Relaxed); + // println!( + // "[{:?}] submit_am_immediate done {:?}", + // std::thread::current().id(), + // am_id + // ); // println!("[{:?}] submit work done req {:?} {:?} TaskId: {:?} {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task,reqs); }; self.executor.submit_immediate_task(am_future); @@ -169,8 +189,13 @@ impl Scheduler { let executor = self.executor.clone(); let am_future = async move { num_ams.fetch_add(1, Ordering::Relaxed); - max_ams.fetch_add(1, Ordering::Relaxed); + let _am_id = max_ams.fetch_add(1, Ordering::Relaxed); // println!("[{:?}] submit work exec req {:?} {:?} TaskId: {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task); + // println!( + // "[{:?}] submit_remote_am {:?}", + // std::thread::current().id(), + // am_id + // ); if let Some(header) = data.deserialize_header() { let msg = header.msg; ame.exec_msg(msg, data, lamellae, executor).await; @@ -179,6 +204,11 @@ impl Scheduler { panic!("should i be here?"); } num_ams.fetch_sub(1, Ordering::Relaxed); + // println!( + // "[{:?}] submit_remote_am done {:?}", + // std::thread::current().id(), + // am_id + // ); // println!("[{:?}] submit work done req {:?} {:?} TaskId: {:?} {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task,reqs); }; self.executor.submit_task(am_future); @@ -192,9 +222,19 @@ impl Scheduler { let max_tasks = self.max_tasks.clone(); let future = async move { num_tasks.fetch_add(1, Ordering::Relaxed); - max_tasks.fetch_add(1, Ordering::Relaxed); + let _task_id = max_tasks.fetch_add(1, Ordering::Relaxed); + // println!( + // "[{:?}] execing new task {:?}", + // std::thread::current().id(), + // task_id + // ); task.await; num_tasks.fetch_sub(1, Ordering::Relaxed); + // println!( + // "[{:?}] done new task {:?} ", + // std::thread::current().id(), + // task_id + // ); }; self.executor.submit_task(future); } @@ -207,9 +247,19 @@ impl Scheduler { let max_tasks = self.max_tasks.clone(); let future = async move { num_tasks.fetch_add(1, Ordering::Relaxed); - max_tasks.fetch_add(1, Ordering::Relaxed); + let _task_id = 
max_tasks.fetch_add(1, Ordering::Relaxed); + // println!( + // "[{:?}] execing new task immediate {:?}", + // std::thread::current().id(), + // task_id + // ); task.await; num_tasks.fetch_sub(1, Ordering::Relaxed); + // println!( + // "[{:?}] done new task immediate {:?} ", + // std::thread::current().id(), + // task_id + // ); }; self.executor.submit_immediate_task(future); } diff --git a/src/scheduler/numa_work_stealing.rs b/src/scheduler/numa_work_stealing.rs deleted file mode 100644 index c2f5a043..00000000 --- a/src/scheduler/numa_work_stealing.rs +++ /dev/null @@ -1,552 +0,0 @@ -use crate::active_messaging::{ActiveMessageEngine, ActiveMessageEngineType, Am}; -use crate::lamellae::{Des, Lamellae, SerializedData}; -use crate::scheduler::batching::simple_batcher::SimpleBatcher; -use crate::scheduler::batching::team_am_batcher::TeamAmBatcher; -use crate::scheduler::batching::BatcherType; -use crate::scheduler::registered_active_message::RegisteredActiveMessages; -use crate::scheduler::{AmeScheduler, AmeSchedulerQueue, SchedulerQueue}; -// use log::trace; -use core_affinity::CoreId; -use crossbeam::deque::Worker; -use futures::Future; -use futures_lite::FutureExt; -// use parking_lot::RwLock; -use rand::prelude::*; -// use std::collections::HashMap; -use std::collections::HashMap; -use std::panic; -use std::process; -use std::sync::atomic::{AtomicBool, AtomicU8, AtomicUsize, Ordering}; -use std::sync::Arc; //, Weak}; -use std::thread; -use thread_local::ThreadLocal; -// use std::time::Instant; - -#[derive(Debug)] -pub(crate) struct NumaWorkStealingThread { - node_work_inj: Arc>, - _sys_work_inj: Vec>>, - node_work_stealers: Vec>, - _sys_work_stealers: HashMap>>, - work_q: Worker, - work_flag: Arc, - active: Arc, -} - -impl NumaWorkStealingThread { - fn run( - worker: NumaWorkStealingThread, - active_cnt: Arc, - num_tasks: Arc, - id: CoreId, - ) -> thread::JoinHandle<()> { - thread::spawn(move || { - // println!("TestSchdulerWorker thread running"); - core_affinity::set_for_current(id); - active_cnt.fetch_add(1, Ordering::SeqCst); - let mut rng = rand::thread_rng(); - let t = rand::distributions::Uniform::from(0..worker.node_work_stealers.len()); - let mut timer = std::time::Instant::now(); - // let mut cur_tasks = num_tasks.load(Ordering::SeqCst); - while worker.active.load(Ordering::SeqCst) - || !(worker.work_q.is_empty() && worker.node_work_inj.is_empty()) - || num_tasks.load(Ordering::SeqCst) > 1 - { - // let ot = Instant::now(); - // if cur_tasks != num_tasks.load(Ordering::SeqCst){ - // println!( - // "work_q size {:?} work inj size {:?} num_tasks {:?}", - // worker.work_q.len(), - // worker.work_inj.len(), - // num_tasks.load(Ordering::SeqCst) - // ); - // cur_tasks = num_tasks.load(Ordering::SeqCst); - - // } - let omsg = worker.work_q.pop().or_else(|| { - if worker - .work_flag - .compare_exchange(0, 1, Ordering::SeqCst, Ordering::Relaxed) - == Ok(0) - { - let ret = worker - .node_work_inj - .steal_batch_and_pop(&worker.work_q) - .success(); - worker.work_flag.store(0, Ordering::SeqCst); - ret - } else { - worker.node_work_stealers[t.sample(&mut rng)] - .steal() - .success() - } - }); - if let Some(runnable) = omsg { - if !worker.active.load(Ordering::SeqCst) && timer.elapsed().as_secs_f64() > 60.0 - { - println!("runnable {:?}", runnable); - println!( - "work_q size {:?} work inj size {:?} num_tasks {:?}", - worker.work_q.len(), - worker.node_work_inj.len(), - num_tasks.load(Ordering::SeqCst) - ); - timer = std::time::Instant::now(); - } - runnable.run(); - } - if 
!worker.active.load(Ordering::SeqCst) - && timer.elapsed().as_secs_f64() > 60.0 - && (worker.work_q.len() > 0 || worker.node_work_inj.len() > 0) - { - println!( - "work_q size {:?} work inj size {:?} num_tasks {:?}", - worker.work_q.len(), - worker.node_work_inj.len(), - num_tasks.load(Ordering::SeqCst) - ); - timer = std::time::Instant::now(); - } - // if timer.elapsed().as_secs_f64() > 60.0 { - // println!( - // "work_q size {:?} work inj size {:?} num_tasks {:?}", - // worker.work_q.len(), - // worker.node_work_inj.len(), - // num_tasks.load(Ordering::SeqCst) - // ); - // timer = std::time::Instant::now() - // } - } - active_cnt.fetch_sub(1, Ordering::SeqCst); - // println!("TestSchdulerWorker thread shutting down"); - }) - } -} - -/* -create a work injector and stealer for each numa node, -additionally create a threadlocal counter that each thread will use to index -into the to appropriate work injector when submitting work -*/ -#[derive(Debug)] -pub(crate) struct NumaWorkStealingInner { - threads: Vec>, - work_inj: Vec>>, - work_stealers: HashMap>>, - work_flag: Arc, - active: Arc, - active_cnt: Arc, - num_tasks: Arc, - stall_mark: Arc, - local_work_inj: ThreadLocal, - nodes: Vec, -} - -impl AmeSchedulerQueue for NumaWorkStealingInner { - fn submit_am( - //unserialized request - &self, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - ame: Arc, - am: Am, - ) { - let num_tasks = self.num_tasks.clone(); - let stall_mark = self.stall_mark.fetch_add(1, Ordering::Relaxed); - let future = async move { - // println!("exec req {:?}",num_tasks.load(Ordering::Relaxed)); - num_tasks.fetch_add(1, Ordering::Relaxed); - // println!("in submit_req {:?} {:?} {:?} ", pe.clone(), req_data.src, req_data.pe); - ame.process_msg(am, scheduler, stall_mark).await; - // println!("num tasks: {:?}",); - num_tasks.fetch_sub(1, Ordering::Relaxed); - // println!("done req {:?}",num_tasks.load(Ordering::Relaxed)); - }; - let work_inj = self.work_inj[self - .local_work_inj - .get_or(|| AtomicUsize::new(0)) - .fetch_add(1, Ordering::SeqCst) - % self.work_inj.len()] - .clone(); - let schedule = move |runnable| work_inj.push(runnable); - let (runnable, task) = unsafe { async_task::spawn_unchecked(future, schedule) }; //safe as contents are sync+send, and no borrowed variables - runnable.schedule(); - task.detach(); - } - - //this is a serialized request - fn submit_work( - &self, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - ame: Arc, - data: SerializedData, - lamellae: Arc, - ) { - // let work_inj = self.work_inj.clone(); - // println!("submit work {:?}",self.num_tasks.load(Ordering::Relaxed)); - let num_tasks = self.num_tasks.clone(); - let future = async move { - // println!("exec work {:?}",num_tasks.load(Ordering::Relaxed)+1); - num_tasks.fetch_add(1, Ordering::Relaxed); - if let Some(header) = data.deserialize_header() { - let msg = header.msg; - ame.exec_msg(msg, data, lamellae, scheduler).await; - } else { - data.print(); - panic!("should i be here?"); - } - // println!("num tasks: {:?}",); - num_tasks.fetch_sub(1, Ordering::Relaxed); - // println!("done work {:?}",num_tasks.load(Ordering::Relaxed)); - }; - let work_inj = self.work_inj[self - .local_work_inj - .get_or(|| AtomicUsize::new(0)) - .fetch_add(1, Ordering::SeqCst) - % self.work_inj.len()] - .clone(); - let schedule = move |runnable| work_inj.push(runnable); - let (runnable, task) = unsafe { async_task::spawn_unchecked(future, schedule) }; //safe as contents are sync+send, and 
no borrowed variables - runnable.schedule(); - task.detach(); - } - - fn submit_task(&self, future: F) - where - F: Future, - { - // println!("submit task {:?}",self.num_tasks.load(Ordering::Relaxed)); - let num_tasks = self.num_tasks.clone(); - let future2 = async move { - // println!("exec task {:?}",num_tasks.load(Ordering::Relaxed)+1); - num_tasks.fetch_add(1, Ordering::Relaxed); - future.await; - num_tasks.fetch_sub(1, Ordering::Relaxed); - // println!("done task {:?}",num_tasks.load(Ordering::Relaxed)); - }; - let work_inj = self.work_inj[self - .local_work_inj - .get_or(|| AtomicUsize::new(0)) - .fetch_add(1, Ordering::SeqCst) - % self.work_inj.len()] - .clone(); - let schedule = move |runnable| work_inj.push(runnable); - let (runnable, task) = unsafe { async_task::spawn_unchecked(future2, schedule) }; //safe //safe as contents are sync+send... may need to do something to enforce lifetime bounds - runnable.schedule(); - task.detach(); - } - - fn block_on(&self, future: F) -> F::Output { - let work_inj = self.work_inj[self - .local_work_inj - .get_or(|| AtomicUsize::new(0)) - .fetch_add(1, Ordering::SeqCst) - % self.work_inj.len()] - .clone(); - let schedule = move |runnable| work_inj.push(runnable); - let (runnable, mut task) = unsafe { async_task::spawn_unchecked(future, schedule) }; //safe //safe as contents are sync+send... may need to do something to enforce lifetime bounds - let waker = runnable.waker(); - runnable.schedule(); - while !task.is_finished() { - self.exec_task(); - } - let cx = &mut async_std::task::Context::from_waker(&waker); - if let async_std::task::Poll::Ready(output) = task.poll(cx) { - output - } else { - panic!("task not ready"); - } - } - - fn shutdown(&self) { - // println!("work stealing shuting down {:?}", self.active()); - self.active.store(false, Ordering::SeqCst); - // println!("work stealing shuting down {:?}",self.active()); - while self.active_cnt.load(Ordering::Relaxed) > 2 - || self.num_tasks.load(Ordering::Relaxed) > 2 - { - //this should be the recvtask, and alloc_task - std::thread::yield_now() - } - // println!( - // "work stealing shut down {:?} {:?} {:?}", - // self.active(), - // self.active_cnt.load(Ordering::Relaxed), - // self.active_cnt.load(Ordering::Relaxed) - // ); - } - - fn exec_task(&self) { - let mut rng = rand::thread_rng(); - // let c = rand::distributions::Uniform::from(0..self.work_stealers.len()); - // let c = rand::distributions::Uniform::from - let ret = if self - .work_flag - .compare_exchange(0, 1, Ordering::SeqCst, Ordering::Relaxed) - == Ok(0) - { - let ret = self - .nodes - .choose_multiple(&mut rng, self.nodes.len()) - .find_map(|node| self.work_inj[*node % self.nodes.len()].steal().success()); - self.work_flag.store(0, Ordering::SeqCst); - ret - } else { - self.nodes - .choose_multiple(&mut rng, self.nodes.len()) - .find_map(|node| { - self.work_stealers[node] - .choose(&mut rng) - .unwrap() - .steal() - .success() - }) - }; - if let Some(runnable) = ret { - runnable.run(); - } - } - - fn active(&self) -> bool { - // println!("sched active {:?} {:?}",self.active.load(Ordering::SeqCst) , self.num_tasks.load(Ordering::SeqCst)); - self.active.load(Ordering::SeqCst) || self.num_tasks.load(Ordering::SeqCst) > 2 - } -} - -impl SchedulerQueue for NumaWorkStealing { - fn submit_am( - //unserialized request - &self, - am: Am, - ) { - self.inner.submit_am(self, self.ame.clone(), am); - } - - // fn submit_return(&self, src, pe) - - fn submit_work(&self, data: SerializedData, lamellae: Arc) { - self.inner - .submit_work(self, 
self.ame.clone(), data, lamellae); - } - - fn submit_task(&self, future: F) - where - F: Future, - { - self.inner.submit_task(future); - } - - fn exec_task(&self) { - self.inner.exec_task(); - } - - fn submit_task_node(&self, future: F, _node: usize) - where - F: Future, - { - self.inner.submit_task(future); - } - - fn block_on(&self, future: F) -> F::Output - where - F: Future, - { - self.inner.block_on(future) - } - - fn shutdown(&self) { - self.inner.shutdown(); - } - fn active(&self) -> bool { - self.inner.active() - } -} - -impl NumaWorkStealingInner { - pub(crate) fn new(stall_mark: Arc) -> NumaWorkStealingInner { - // println!("new work stealing queue"); - - let mut sched = NumaWorkStealingInner { - threads: Vec::new(), - work_inj: Vec::new(), //Arc::new(crossbeam::deque::Injector::new()), - work_stealers: HashMap::new(), //Vec::new(), - work_flag: Arc::new(AtomicU8::new(0)), - active: Arc::new(AtomicBool::new(true)), - active_cnt: Arc::new(AtomicUsize::new(0)), - num_tasks: Arc::new(AtomicUsize::new(0)), - stall_mark: stall_mark, - local_work_inj: ThreadLocal::new(), - nodes: Vec::new(), - }; - sched.local_work_inj.get_or(|| AtomicUsize::new(0)); - sched.init(); - sched - } - - fn init(&mut self) { - let num_workers = match std::env::var("LAMELLAR_THREADS") { - Ok(n) => n.parse::().unwrap() - 1, - Err(_) => 4, - }; - let core_ids = core_affinity::get_core_ids().unwrap(); - println!("core_ids: {:?}", core_ids); - let mut node_to_cores: HashMap> = HashMap::new(); - let mut core_to_node: HashMap = HashMap::new(); - - let mut cur_worker_cnt = 0; - - if let Ok(nodes) = glob::glob("/sys/devices/system/node/node*") { - for node in nodes { - if let Ok(node_path) = node { - if let Some(node) = format!("{}", node_path.display()).split("/").last() { - if let Some(node) = node.strip_prefix("node") { - if let Ok(node) = node.parse::() { - if let Ok(cpus) = - glob::glob(&format!("{}/cpu*", node_path.display())) - { - let mut cores = Vec::new(); - for cpu in cpus { - if let Ok(cpu) = cpu { - if let Some(cpu) = - format!("{}", cpu.display()).split("/").last() - { - if let Some(cpu) = cpu.strip_prefix("cpu") { - if let Ok(cpu) = cpu.parse::() { - for core_id in core_ids.iter() { - if core_id.id == cpu { - core_to_node.insert(cpu, node); - cores.push(cpu); - cur_worker_cnt += 1; - } - if cur_worker_cnt >= num_workers { - break; - } - } - } - } - } - } - } - if cores.len() > 0 { - node_to_cores.insert(node, cores); - } - if cur_worker_cnt >= num_workers { - break; - } - } - } - } - } - } - } - } - println!("node_to_cores {:?}", node_to_cores); - println!("core_to_node {:?}", core_to_node); - - let mut work_workers = HashMap::new(); - for (node, cores) in &node_to_cores { - let mut node_work_workers: std::vec::Vec< - crossbeam::deque::Worker, - > = vec![]; - let mut node_work_stealers = vec![]; - for _core in cores { - let core_work_worker: crossbeam::deque::Worker = - crossbeam::deque::Worker::new_fifo(); - node_work_stealers.push(core_work_worker.stealer()); - node_work_workers.push(core_work_worker); - } - self.work_inj - .push(Arc::new(crossbeam::deque::Injector::new())); - self.work_stealers.insert(*node, node_work_stealers); - work_workers.insert(node, node_work_workers); - self.nodes.push(*node); - } - - let orig_hook = panic::take_hook(); - panic::set_hook(Box::new(move |panic_info| { - // invoke the default handler and exit the process - orig_hook(panic_info); - process::exit(1); - })); - - let mut inj = 0; - for (node, cores) in &node_to_cores { - let node_work_workers = 
work_workers.get_mut(&node).unwrap(); - for core in cores { - let core_work_worker = node_work_workers.pop().unwrap(); - let worker = NumaWorkStealingThread { - node_work_inj: self.work_inj[inj].clone(), - _sys_work_inj: self.work_inj.clone(), - node_work_stealers: self.work_stealers.get(&node).unwrap().clone(), - _sys_work_stealers: self.work_stealers.clone(), - work_q: core_work_worker, - work_flag: self.work_flag.clone(), - active: self.active.clone(), - }; - self.threads.push(NumaWorkStealingThread::run( - worker, - self.active_cnt.clone(), - self.num_tasks.clone(), - CoreId { id: *core }, - )); - } - inj += 1; - } - - while self.active_cnt.load(Ordering::SeqCst) != self.threads.len() { - std::thread::yield_now(); - } - } -} - -#[derive(Debug)] -pub(crate) struct NumaWorkStealing { - inner: &(impl SchedulerQueue + Sync + std::fmt::Debug), - ame: Arc, -} -impl NumaWorkStealing { - pub(crate) fn new( - num_pes: usize, - // my_pe: usize, - // teams: Arc>>>, - ) -> NumaWorkStealing { - // println!("new work stealing queue"); - let stall_mark = Arc::new(AtomicUsize::new(0)); - let inner = Arc::new(AmeScheduler::NumaWorkStealingInner( - NumaWorkStealingInner::new(stall_mark.clone()), - )); - let batcher = match std::env::var("LAMELLAR_BATCHER") { - Ok(n) => { - let n = n.parse::().unwrap(); - if n == 1 { - BatcherType::Simple(SimpleBatcher::new(num_pes, stall_mark.clone())) - } else { - BatcherType::TeamAm(TeamAmBatcher::new(num_pes, stall_mark.clone())) - } - } - Err(_) => BatcherType::TeamAm(TeamAmBatcher::new(num_pes, stall_mark.clone())), - }; - let sched = NumaWorkStealing { - inner: inner.clone(), - ame: Arc::new(ActiveMessageEngineType::RegisteredActiveMessages( - RegisteredActiveMessages::new(batcher), - )), - }; - sched - } -} - -impl Drop for NumaWorkStealingInner { - //when is this called with respect to world? 
- fn drop(&mut self) { - // println!("dropping work stealing"); - while let Some(thread) = self.threads.pop() { - if thread.thread().id() != std::thread::current().id() { - let _res = thread.join(); - } - } - for val in self.local_work_inj.iter_mut() { - println!("local_work_inj {:?}", val.load(Ordering::SeqCst)); - } - // println!("NumaWorkStealing Scheduler Dropped"); - } -} diff --git a/src/scheduler/numa_work_stealing2.rs b/src/scheduler/numa_work_stealing2.rs deleted file mode 100644 index ec82c3ef..00000000 --- a/src/scheduler/numa_work_stealing2.rs +++ /dev/null @@ -1,569 +0,0 @@ -use crate::active_messaging::{ActiveMessageEngine, ActiveMessageEngineType, Am}; -use crate::lamellae::{Des, Lamellae, SerializedData}; -use crate::scheduler::batching::simple_batcher::SimpleBatcher; -use crate::scheduler::batching::team_am_batcher::TeamAmBatcher; -use crate::scheduler::batching::BatcherType; -use crate::scheduler::registered_active_message::RegisteredActiveMessages; -use crate::scheduler::{AmeScheduler, AmeSchedulerQueue, SchedulerQueue}; -// use log::trace; -use core_affinity::CoreId; -use crossbeam::deque::Worker; -use futures::Future; -use futures_lite::FutureExt; -// use parking_lot::RwLock; -use rand::prelude::*; -use std::collections::HashMap; -use std::panic; -use std::process; -use std::sync::atomic::{AtomicBool, AtomicU8, AtomicUsize, Ordering}; -use std::sync::Arc; //, Weak}; -use std::thread; -// use thread_local::ThreadLocal; -// use std::time::Instant; - -#[derive(Debug)] -pub(crate) struct NumaWorkStealing2Thread { - work_inj: Arc>, - work_stealers: Vec>, - work_q: Worker, - work_flag: Arc, - active: Arc, -} - -impl NumaWorkStealing2Thread { - fn run( - worker: NumaWorkStealing2Thread, - active_cnt: Arc, - num_tasks: Arc, - id: CoreId, - ) -> thread::JoinHandle<()> { - thread::spawn(move || { - // println!("TestSchdulerWorker thread running"); - core_affinity::set_for_current(id); - active_cnt.fetch_add(1, Ordering::SeqCst); - let mut rng = rand::thread_rng(); - let t = rand::distributions::Uniform::from(0..worker.work_stealers.len()); - let mut timer = std::time::Instant::now(); - // let mut cur_tasks = num_tasks.load(Ordering::SeqCst); - while worker.active.load(Ordering::SeqCst) - || !(worker.work_q.is_empty() && worker.work_inj.is_empty()) - || num_tasks.load(Ordering::SeqCst) > 1 - { - // let ot = Instant::now(); - // if cur_tasks != num_tasks.load(Ordering::SeqCst){ - // println!( - // "work_q size {:?} work inj size {:?} num_tasks {:?}", - // worker.work_q.len(), - // worker.work_inj.len(), - // num_tasks.load(Ordering::SeqCst) - // ); - // cur_tasks = num_tasks.load(Ordering::SeqCst); - - // } - let omsg = worker.work_q.pop().or_else(|| { - if worker - .work_flag - .compare_exchange(0, 1, Ordering::SeqCst, Ordering::Relaxed) - == Ok(0) - { - let ret = worker - .work_inj - .steal_batch_and_pop(&worker.work_q) - .success(); - worker.work_flag.store(0, Ordering::SeqCst); - ret - } else { - worker.work_stealers[t.sample(&mut rng)].steal().success() - } - }); - if let Some(runnable) = omsg { - if !worker.active.load(Ordering::SeqCst) && timer.elapsed().as_secs_f64() > 60.0 - { - println!("runnable {:?}", runnable); - println!( - "work_q size {:?} work inj size {:?} num_tasks {:?}", - worker.work_q.len(), - worker.work_inj.len(), - num_tasks.load(Ordering::SeqCst) - ); - timer = std::time::Instant::now(); - } - runnable.run(); - } - if !worker.active.load(Ordering::SeqCst) - && timer.elapsed().as_secs_f64() > 60.0 - && (worker.work_q.len() > 0 || worker.work_inj.len() > 0) 
- { - println!( - "work_q size {:?} work inj size {:?} num_tasks {:?}", - worker.work_q.len(), - worker.work_inj.len(), - num_tasks.load(Ordering::SeqCst) - ); - timer = std::time::Instant::now(); - } - // if timer.elapsed().as_secs_f64() > 60.0 { - // println!( - // "work_q size {:?} work inj size {:?} num_tasks {:?}", - // worker.work_q.len(), - // worker.work_inj.len(), - // num_tasks.load(Ordering::SeqCst) - // ); - // timer = std::time::Instant::now() - // } - } - active_cnt.fetch_sub(1, Ordering::SeqCst); - // println!("TestSchdulerWorker thread shutting down"); - }) - } -} - -#[derive(Debug)] -pub(crate) struct NumaWorkStealing2Inner { - threads: Vec>, - work_inj: Arc>, - work_stealers: Vec>, - work_flag: Arc, - active: Arc, - active_cnt: Arc, - num_tasks: Arc, - stall_mark: Arc, -} - -impl AmeSchedulerQueue for NumaWorkStealing2Inner { - fn submit_am( - //unserialized request - &self, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - ame: Arc, - am: Am, - ) { - let num_tasks = self.num_tasks.clone(); - let stall_mark = self.stall_mark.fetch_add(1, Ordering::Relaxed); - let future = async move { - // println!("exec req {:?}",num_tasks.load(Ordering::Relaxed)); - num_tasks.fetch_add(1, Ordering::Relaxed); - // println!("in submit_req {:?} {:?} {:?} ", pe.clone(), req_data.src, req_data.pe); - ame.process_msg(am, scheduler, stall_mark).await; - // println!("num tasks: {:?}",); - num_tasks.fetch_sub(1, Ordering::Relaxed); - // println!("done req {:?}",num_tasks.load(Ordering::Relaxed)); - }; - let work_inj = self.work_inj.clone(); - let schedule = move |runnable| work_inj.push(runnable); - let (runnable, task) = unsafe { async_task::spawn_unchecked(future, schedule) }; //safe as contents are sync+send, and no borrowed variables - runnable.schedule(); - task.detach(); - } - - //this is a serialized request - fn submit_work( - &self, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - ame: Arc, - data: SerializedData, - lamellae: Arc, - ) { - // let work_inj = self.work_inj.clone(); - // println!("submit work {:?}",self.num_tasks.load(Ordering::Relaxed)); - let num_tasks = self.num_tasks.clone(); - let future = async move { - // println!("exec work {:?}",num_tasks.load(Ordering::Relaxed)+1); - num_tasks.fetch_add(1, Ordering::Relaxed); - if let Some(header) = data.deserialize_header() { - let msg = header.msg; - ame.exec_msg(msg, data, lamellae, scheduler).await; - } else { - data.print(); - panic!("should i be here?"); - } - // println!("num tasks: {:?}",); - num_tasks.fetch_sub(1, Ordering::Relaxed); - // println!("done work {:?}",num_tasks.load(Ordering::Relaxed)); - }; - let work_inj = self.work_inj.clone(); - let schedule = move |runnable| work_inj.push(runnable); - let (runnable, task) = unsafe { async_task::spawn_unchecked(future, schedule) }; //safe as contents are sync+send, and no borrowed variables - runnable.schedule(); - task.detach(); - } - - fn submit_task(&self, future: F) - where - F: Future, - { - // println!("submit task {:?}",self.num_tasks.load(Ordering::Relaxed)); - let num_tasks = self.num_tasks.clone(); - let future2 = async move { - // println!("exec task {:?}",num_tasks.load(Ordering::Relaxed)+1); - num_tasks.fetch_add(1, Ordering::Relaxed); - future.await; - num_tasks.fetch_sub(1, Ordering::Relaxed); - // println!("done task {:?}",num_tasks.load(Ordering::Relaxed)); - }; - let work_inj = self.work_inj.clone(); - let schedule = move |runnable| work_inj.push(runnable); - let (runnable, task) = 
unsafe { async_task::spawn_unchecked(future2, schedule) }; //safe //safe as contents are sync+send... may need to do something to enforce lifetime bounds - runnable.schedule(); - task.detach(); - } - - fn block_on(&self, future: F) -> F::Output - where - F: Future, - { - let work_inj = self.work_inj.clone(); - let schedule = move |runnable| work_inj.push(runnable); - let (runnable, mut task) = unsafe { async_task::spawn_unchecked(future, schedule) }; //safe //safe as contents are sync+send... may need to do something to enforce lifetime bounds - let waker = runnable.waker(); - runnable.schedule(); - while !task.is_finished() { - self.exec_task(); - } - let cx = &mut async_std::task::Context::from_waker(&waker); - if let async_std::task::Poll::Ready(output) = task.poll(cx) { - output - } else { - panic!("task not ready"); - } - } - - fn shutdown(&self) { - // println!("work stealing shuting down {:?}", self.active()); - self.active.store(false, Ordering::SeqCst); - // println!("work stealing shuting down {:?}",self.active()); - while self.active_cnt.load(Ordering::Relaxed) > 2 - || self.num_tasks.load(Ordering::Relaxed) > 2 - { - //this should be the recvtask, and alloc_task - std::thread::yield_now() - } - // println!( - // "work stealing shut down {:?} {:?} {:?}", - // self.active(), - // self.active_cnt.load(Ordering::Relaxed), - // self.active_cnt.load(Ordering::Relaxed) - // ); - } - - fn exec_task(&self) { - let mut rng = rand::thread_rng(); - let t = rand::distributions::Uniform::from(0..self.work_stealers.len()); - let ret = if self - .work_flag - .compare_exchange(0, 1, Ordering::SeqCst, Ordering::Relaxed) - == Ok(0) - { - let ret = self.work_inj.steal().success(); - self.work_flag.store(0, Ordering::SeqCst); - ret - } else { - self.work_stealers[t.sample(&mut rng)].steal().success() - }; - if let Some(runnable) = ret { - runnable.run(); - } - } - - fn active(&self) -> bool { - // println!("sched active {:?} {:?}",self.active.load(Ordering::SeqCst) , self.num_tasks.load(Ordering::SeqCst)); - self.active.load(Ordering::SeqCst) || self.num_tasks.load(Ordering::SeqCst) > 2 - } -} - -impl SchedulerQueue for NumaWorkStealing2 { - fn submit_am( - //unserialized request - &self, - am: Am, - ) { - let node = - CUR_NODE.with(|cur_node| cur_node.fetch_add(1, Ordering::Relaxed) & self.node_mask); - - self.inners[node].submit_am(self, self.ames[node].clone(), am); - } - - // fn submit_return(&self, src, pe) - - fn submit_work(&self, data: SerializedData, lamellae: Arc) { - // let node = if let Some(header) = data.deserialize_header() { - // let msg = header.msg; - // if let ExecType::Am(cmd) = msg.cmd.clone() { - // match cmd { - // Cmd::BatchedDataReturn | Cmd::BatchedAmReturn => { - // println!( - // "got batched return {:x} {:x}", - // msg.req_id.id, - // msg.req_id.id & self.node_mask - // ); - // msg.req_id.id & self.node_mask - // } - // _ => CUR_NODE - // .with(|cur_node| cur_node.fetch_add(1, Ordering::Relaxed) & self.node_mask), - // } - // } else { - // CUR_NODE.with(|cur_node| cur_node.fetch_add(1, Ordering::Relaxed) & self.node_mask) - // } - // } else { - // CUR_NODE.with(|cur_node| cur_node.fetch_add(1, Ordering::Relaxed) & self.node_mask) - // }; - // println!("submit work {:?}", node); - let node = - CUR_NODE.with(|cur_node| cur_node.fetch_add(1, Ordering::Relaxed) & self.node_mask); - self.inners[node].submit_work(self, self.ames[node].clone(), data, lamellae); - } - - fn submit_task(&self, future: F) - where - F: Future, - { - let node = - CUR_NODE.with(|cur_node| 
cur_node.fetch_add(1, Ordering::Relaxed) & self.node_mask); - self.inners[node].submit_task(future); - } - - fn exec_task(&self) { - let node = - CUR_NODE.with(|cur_node| cur_node.fetch_add(1, Ordering::Relaxed) & self.node_mask); - self.inners[node].exec_task(); - } - - fn submit_task_node(&self, future: F, node: usize) - where - F: Future, - { - self.inners[node].submit_task(future); - } - - fn block_on(&self, future: F) -> F::Output - where - F: Future, - { - let node = - CUR_NODE.with(|cur_node| cur_node.fetch_add(1, Ordering::Relaxed) & self.node_mask); - self.inners[node].block_on(future) - } - - fn shutdown(&self) { - for inner in self.inners.iter() { - inner.shutdown(); - } - } - fn active(&self) -> bool { - for inner in self.inners.iter() { - if inner.active() { - return true; - } - } - return false; - } -} - -impl NumaWorkStealing2Inner { - pub(crate) fn new( - stall_mark: Arc, - core_ids: Vec, - ) -> NumaWorkStealing2Inner { - // println!("new work stealing queue"); - - let mut sched = NumaWorkStealing2Inner { - threads: Vec::new(), - work_inj: Arc::new(crossbeam::deque::Injector::new()), - work_stealers: Vec::new(), - work_flag: Arc::new(AtomicU8::new(0)), - active: Arc::new(AtomicBool::new(true)), - active_cnt: Arc::new(AtomicUsize::new(0)), - num_tasks: Arc::new(AtomicUsize::new(0)), - stall_mark: stall_mark, - }; - sched.init(core_ids); - sched - } - - fn init(&mut self, core_ids: Vec) { - let mut work_workers: std::vec::Vec> = - vec![]; - // let num_workers = match std::env::var("LAMELLAR_THREADS") { - // Ok(n) => n.parse::().unwrap(), - // Err(_) => 4, - // }; - for _i in 0..core_ids.len() { - let work_worker: crossbeam::deque::Worker = - crossbeam::deque::Worker::new_fifo(); - self.work_stealers.push(work_worker.stealer()); - work_workers.push(work_worker); - } - - let orig_hook = panic::take_hook(); - panic::set_hook(Box::new(move |panic_info| { - // invoke the default handler and exit the process - orig_hook(panic_info); - process::exit(1); - })); - // let core_ids = core_affinity::get_core_ids().unwrap(); - // println!("core_ids: {:?}",core_ids); - for i in 0..core_ids.len() { - let work_worker = work_workers.pop().unwrap(); - let worker = NumaWorkStealing2Thread { - work_inj: self.work_inj.clone(), - work_stealers: self.work_stealers.clone(), - work_q: work_worker, - work_flag: self.work_flag.clone(), - active: self.active.clone(), - // num_tasks: self.num_tasks.clone(), - }; - self.threads.push(NumaWorkStealing2Thread::run( - worker, - self.active_cnt.clone(), - self.num_tasks.clone(), - core_ids[i % core_ids.len()], - )); - } - while self.active_cnt.load(Ordering::SeqCst) != self.threads.len() { - std::thread::yield_now(); - } - } -} - -thread_local! 
{ - static CUR_NODE: AtomicUsize = AtomicUsize::new(0); -} - -#[derive(Debug)] -pub(crate) struct NumaWorkStealing2 { - inners: Vec<&(impl SchedulerQueue + Sync + std::fmt::Debug)>, - ames: Vec>, - node_mask: usize, -} -impl NumaWorkStealing2 { - pub(crate) fn new( - num_pes: usize, - // my_pe: usize, - // teams: Arc>>>, - ) -> NumaWorkStealing2 { - // println!("new work stealing queue"); - - let num_workers = match std::env::var("LAMELLAR_THREADS") { - Ok(n) => n.parse::().unwrap(), - Err(_) => 4, - }; - let core_ids = core_affinity::get_core_ids().unwrap(); - println!("core_ids: {:?}", core_ids); - let mut node_to_cores: HashMap> = HashMap::new(); - let mut core_to_node: HashMap = HashMap::new(); - - let mut cur_worker_cnt = 0; - - if let Ok(nodes) = glob::glob("/sys/devices/system/node/node*") { - for node in nodes { - if let Ok(node_path) = node { - if let Some(node) = format!("{}", node_path.display()).split("/").last() { - if let Some(node) = node.strip_prefix("node") { - if let Ok(node) = node.parse::() { - if let Ok(cpus) = - glob::glob(&format!("{}/cpu*", node_path.display())) - { - let mut cores = Vec::new(); - for cpu in cpus { - if let Ok(cpu) = cpu { - if let Some(cpu) = - format!("{}", cpu.display()).split("/").last() - { - if let Some(cpu) = cpu.strip_prefix("cpu") { - if let Ok(cpu) = cpu.parse::() { - for core_id in core_ids.iter() { - if core_id.id == cpu { - core_to_node.insert(cpu, node); - cores.push(cpu); - cur_worker_cnt += 1; - } - if cur_worker_cnt >= num_workers { - break; - } - } - } - } - } - } - } - if cores.len() > 0 { - node_to_cores.insert(node, cores); - } - if cur_worker_cnt >= num_workers { - break; - } - } - } - } - } - } - } - } - println!("node_to_cores {:?}", node_to_cores); - println!("core_to_node {:?}", core_to_node); - - let mut inners = vec![]; - let mut ames = vec![]; - - let mut node_mask = node_to_cores.len() - 1; - node_mask |= node_mask >> 1; - node_mask |= node_mask >> 2; - node_mask |= node_mask >> 4; - node_mask |= node_mask >> 8; - node_mask |= node_mask >> 16; - node_mask |= node_mask >> 32; - - // let mut node_i = 0; - let stall_mark = Arc::new(AtomicUsize::new(0)); - for (_node, cores) in node_to_cores.iter() { - let mut core_ids = vec![]; - for core in cores { - core_ids.push(CoreId { id: *core }); - } - let inner = Arc::new(AmeScheduler::NumaWorkStealing2Inner( - NumaWorkStealing2Inner::new(stall_mark.clone(), core_ids), - )); - let batcher = match std::env::var("LAMELLAR_BATCHER") { - Ok(n) => { - let n = n.parse::().unwrap(); - if n == 1 { - BatcherType::Simple(SimpleBatcher::new(num_pes, stall_mark.clone())) - } else { - BatcherType::TeamAm(TeamAmBatcher::new(num_pes, stall_mark.clone())) - } - } - Err(_) => BatcherType::TeamAm(TeamAmBatcher::new(num_pes, stall_mark.clone())), - }; - ames.push(Arc::new(ActiveMessageEngineType::RegisteredActiveMessages( - RegisteredActiveMessages::new(batcher), - ))); - inners.push(inner); - // node_i += 1; - } - - println!("numa node mask: {:x}", node_mask); - - let sched = NumaWorkStealing2 { - inners: inners, - ames: ames, - node_mask: node_mask, - }; - sched - } -} - -impl Drop for NumaWorkStealing2Inner { - //when is this called with respect to world? 
- fn drop(&mut self) { - // println!("dropping work stealing"); - while let Some(thread) = self.threads.pop() { - if thread.thread().id() != std::thread::current().id() { - let _res = thread.join(); - } - } - // for val in self.local_work_inj.iter_mut() { - // println!("local_work_inj {:?}", val.load(Ordering::SeqCst)); - // } - // println!("NumaWorkStealing2 Scheduler Dropped"); - } -} diff --git a/src/scheduler/tokio_executor.rs b/src/scheduler/tokio_executor.rs index f9e14ac1..becd7611 100644 --- a/src/scheduler/tokio_executor.rs +++ b/src/scheduler/tokio_executor.rs @@ -1,22 +1,10 @@ -use crate::scheduler::{LamellarExecutor, SchedulerStatus}; +use crate::scheduler::LamellarExecutor; use tokio::runtime::Runtime; use tracing::*; -use async_task::{Builder, Runnable}; -use core_affinity::CoreId; -use crossbeam::deque::Worker; use futures::Future; -use futures_lite::FutureExt; -use rand::prelude::*; -use std::panic; -use std::process; -use std::sync::atomic::{AtomicU8, AtomicUsize, Ordering}; -use std::sync::Arc; //, Weak}; -use std::thread; - -static TASK_ID: AtomicUsize = AtomicUsize::new(0); #[derive(Debug)] pub(crate) struct TokioRt { diff --git a/tests/array/arithmetic_ops/add_test.rs b/tests/array/arithmetic_ops/add_test.rs index 5bf47967..9f73175a 100644 --- a/tests/array/arithmetic_ops/add_test.rs +++ b/tests/array/arithmetic_ops/add_test.rs @@ -7,25 +7,23 @@ use rand::seq::SliceRandom; macro_rules! initialize_array { (UnsafeArray,$array:ident,$init_val:ident) => { #[allow(unused_unsafe)] - unsafe { - $array.dist_iter_mut().for_each(move |x| *x = $init_val) - }; + let _ = unsafe { $array.dist_iter_mut().for_each(move |x| *x = $init_val) }; $array.wait_all(); $array.barrier(); }; (AtomicArray,$array:ident,$init_val:ident) => { - $array.dist_iter().for_each(move |x| x.store($init_val)); + let _ = $array.dist_iter().for_each(move |x| x.store($init_val)); $array.wait_all(); $array.barrier(); // println!("----------------------------------------------"); }; (LocalLockArray,$array:ident,$init_val:ident) => { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); $array.wait_all(); $array.barrier(); }; (GlobalLockArray,$array:ident,$init_val:ident) => { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); $array.wait_all(); $array.barrier(); }; @@ -92,7 +90,7 @@ macro_rules! add_test{ for idx in 0..array.len(){ for _i in 0..(pe_max_val as usize){ - array.add(idx,(10_usize.pow((my_pe*2)as u32)) as $t); + let _ = array.add(idx,(10_usize.pow((my_pe*2)as u32)) as $t); } } array.wait_all(); @@ -121,7 +119,7 @@ macro_rules! add_test{ indices.shuffle(&mut rng); for idx in indices.iter() {//0..num_updates{ // let idx = rand_idx.sample(&mut rng); - array.add(*idx,(10_usize.pow((my_pe*2)as u32)) as $t); + let _ = array.add(*idx,(10_usize.pow((my_pe*2)as u32)) as $t); } array.wait_all(); array.barrier(); @@ -156,7 +154,7 @@ macro_rules! add_test{ sub_array.barrier(); for idx in 0..sub_array.len(){ for _i in 0..(pe_max_val as usize){ - sub_array.add(idx,(10_usize.pow((my_pe*2)as u32)) as $t); + let _ = sub_array.add(idx,(10_usize.pow((my_pe*2)as u32)) as $t); } } sub_array.wait_all(); @@ -182,7 +180,7 @@ macro_rules! 
add_test{ indices.shuffle(&mut rng); for idx in indices.iter(){ // in 0..num_updates{ // let idx = rand_idx.sample(&mut rng); - sub_array.add(*idx,(10_usize.pow((my_pe*2)as u32)) as $t); + let _ = sub_array.add(*idx,(10_usize.pow((my_pe*2)as u32)) as $t); } sub_array.wait_all(); sub_array.barrier(); @@ -218,7 +216,7 @@ macro_rules! add_test{ sub_array.barrier(); for idx in 0..sub_array.len(){ for _i in 0..(pe_max_val as usize){ - sub_array.add(idx,(10_usize.pow((my_pe*2)as u32)) as $t); + let _ = sub_array.add(idx,(10_usize.pow((my_pe*2)as u32)) as $t); } } sub_array.wait_all(); @@ -244,7 +242,7 @@ macro_rules! add_test{ indices.shuffle(&mut rng); for idx in indices.iter() {//0..num_updates{ // let idx = rand_idx.sample(&mut rng); - sub_array.add(*idx,(10_usize.pow((my_pe*2)as u32)) as $t); + let _ = sub_array.add(*idx,(10_usize.pow((my_pe*2)as u32)) as $t); } sub_array.wait_all(); sub_array.barrier(); @@ -319,10 +317,10 @@ macro_rules! input_test{ #[allow(unused_unsafe)] unsafe { if $dist == lamellar::array::Distribution::Block{ - input_array.dist_iter_mut().enumerate().for_each(move |(i,x)| {println!("i: {:?}",i);*x = i%array_total_len}); + let _ = input_array.dist_iter_mut().enumerate().for_each(move |(i,x)| {println!("i: {:?}",i);*x = i%array_total_len}); } else{ - input_array.dist_iter_mut().enumerate().for_each(move |(i,x)| {println!("i: {:?}",i);*x = i/num_pes}); + let _ = input_array.dist_iter_mut().enumerate().for_each(move |(i,x)| {println!("i: {:?}",i);*x = i/num_pes}); } } input_array.wait_all(); @@ -330,51 +328,51 @@ macro_rules! input_test{ input_array.print(); //individual T------------------------------ for i in 0..array.len(){ - array.batch_add(i,1); + let _ = array.batch_add(i,1); } check_results!($array,array,num_pes,"T"); println!("passed T"); //individual T------------------------------ for i in 0..array.len(){ - array.batch_add(&i,1); + let _ = array.batch_add(&i,1); } check_results!($array,array,num_pes,"&T"); println!("passed &T"); //&[T]------------------------------ let vec=(0..array.len()).collect::>(); let slice = &vec[..]; - array.batch_add(slice,1); + let _ = array.batch_add(slice,1); check_results!($array,array,num_pes,"&[T]"); println!("passed &[T]"); //scoped &[T]------------------------------ { let vec=(0..array.len()).collect::>(); let slice = &vec[..]; - array.batch_add(slice,1); + let _ = array.batch_add(slice,1); } check_results!($array,array,num_pes,"scoped &[T]"); println!("passed scoped &[T]"); // Vec------------------------------ let vec=(0..array.len()).collect::>(); - array.batch_add(vec,1); + let _ = array.batch_add(vec,1); check_results!($array,array,num_pes,"Vec"); println!("passed Vec"); // &Vec------------------------------ let vec=(0..array.len()).collect::>(); - array.batch_add(&vec,1); + let _ = array.batch_add(&vec,1); check_results!($array,array,num_pes,"&Vec"); println!("passed &Vec"); // Scoped Vec------------------------------ { let vec=(0..array.len()).collect::>(); - array.batch_add(vec,1); + let _ = array.batch_add(vec,1); } check_results!($array,array,num_pes,"scoped Vec"); println!("passed scoped Vec"); // Scoped &Vec------------------------------ { let vec=(0..array.len()).collect::>(); - array.batch_add(&vec,1); + let _ = array.batch_add(&vec,1); } check_results!($array,array,num_pes,"scoped &Vec"); println!("passed scoped &Vec"); @@ -387,7 +385,7 @@ macro_rules! 
input_test{ for i in 0..array.len(){ slice[i]=i; } - array.batch_add(slice,1); + let _ = array.batch_add(slice,1); check_results!($array,array,num_pes,"LMR"); println!("passed LMR"); } @@ -402,7 +400,7 @@ macro_rules! input_test{ slice[i]=i; } - array.batch_add(slice,1); + let _ = array.batch_add(slice,1); check_results!($array,array,num_pes,"SMR"); println!("passed SMR"); } @@ -411,7 +409,7 @@ macro_rules! input_test{ // array.add(input_array.clone(),1); // check_results!($array,array,num_pes,"UnsafeArray"); // UnsafeArray------------------------------ - array.batch_add(unsafe{input_array.local_data()},1); + let _ = array.batch_add(unsafe{input_array.local_data()},1); check_results!($array,array,num_pes,"&UnsafeArray"); println!("passed &UnsafeArray"); @@ -420,7 +418,7 @@ macro_rules! input_test{ // array.add(input_array.clone(),1); // check_results!($array,array,num_pes,"ReadOnlyArray"); // ReadOnlyArray------------------------------ - array.batch_add(input_array.local_data(),1); + let _ = array.batch_add(input_array.local_data(),1); check_results!($array,array,num_pes,"&ReadOnlyArray"); println!("passed &ReadOnlyArray"); @@ -429,7 +427,7 @@ macro_rules! input_test{ // array.add(input_array.clone(),1); // check_results!($array,array,num_pes,"AtomicArray"); // AtomicArray------------------------------ - array.batch_add(&input_array.local_data(),1); + let _ = array.batch_add(&input_array.local_data(),1); check_results!($array,array,num_pes,"&AtomicArray"); println!("passed &AtomicArray"); @@ -438,7 +436,7 @@ macro_rules! input_test{ // array.add(input_array.clone(),1); // check_results!($array,array,num_pes,"LocalLockArray"); // LocalLockArray------------------------------ - array.batch_add(&input_array.blocking_read_local_data(),1); + let _ = array.batch_add(&input_array.blocking_read_local_data(),1); check_results!($array,array,num_pes,"&LocalLockArray"); println!("passed &LocalLockArray"); @@ -447,7 +445,7 @@ macro_rules! input_test{ // array.add(input_array.clone(),1); // check_results!($array,array,num_pes,"GlobalLockArray"); // GlobalLockArray------------------------------ - array.batch_add(&input_array.blocking_read_local_data(),1); + let _ = array.batch_add(&input_array.blocking_read_local_data(),1); check_results!($array,array,num_pes,"&GlobalLockArray"); println!("passed &GlobalLockArray"); } diff --git a/tests/array/arithmetic_ops/div_test.rs b/tests/array/arithmetic_ops/div_test.rs index 5c6901e1..47948887 100644 --- a/tests/array/arithmetic_ops/div_test.rs +++ b/tests/array/arithmetic_ops/div_test.rs @@ -2,23 +2,23 @@ use lamellar::array::prelude::*; macro_rules! initialize_array { (UnsafeArray,$array:ident,$init_val:ident) => { unsafe { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); } $array.wait_all(); $array.barrier(); }; (AtomicArray,$array:ident,$init_val:ident) => { - $array.dist_iter().for_each(move |x| x.store($init_val)); + let _ = $array.dist_iter().for_each(move |x| x.store($init_val)); $array.wait_all(); $array.barrier(); }; (LocalLockArray,$array:ident,$init_val:ident) => { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); $array.wait_all(); $array.barrier(); }; (GlobalLockArray,$array:ident,$init_val:ident) => { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); $array.wait_all(); $array.barrier(); }; @@ -79,7 +79,7 @@ macro_rules! 
div_test{ // array.print(); for idx in 0..array.len(){ for _i in 0..(max_updates as usize){ - array.div(idx,2 as $t); + let _ = array.div(idx,2 as $t); } } array.wait_all(); @@ -106,7 +106,7 @@ macro_rules! div_test{ // // sub_array.print(); for idx in 0..sub_array.len(){ for _i in 0..(max_updates as usize){ - sub_array.div(idx,2 as $t); + let _ = sub_array.div(idx,2 as $t); } } sub_array.wait_all(); @@ -132,7 +132,7 @@ macro_rules! div_test{ sub_array.barrier(); for idx in 0..sub_array.len(){ for _i in 0..(max_updates as usize){ - sub_array.div(idx,2 as $t); + let _ = sub_array.div(idx,2 as $t); } } sub_array.wait_all(); diff --git a/tests/array/arithmetic_ops/fetch_add_test.rs b/tests/array/arithmetic_ops/fetch_add_test.rs index 6c91e0fc..80ce4761 100644 --- a/tests/array/arithmetic_ops/fetch_add_test.rs +++ b/tests/array/arithmetic_ops/fetch_add_test.rs @@ -6,12 +6,12 @@ use rand::distributions::Uniform; macro_rules! initialize_array { (UnsafeArray,$array:ident,$init_val:ident) => { - unsafe { $array.dist_iter_mut().for_each(move |x| *x = $init_val) }; + let _ = unsafe { $array.dist_iter_mut().for_each(move |x| *x = $init_val) }; $array.wait_all(); $array.barrier(); }; (AtomicArray,$array:ident,$init_val:ident) => { - $array.dist_iter().enumerate().for_each(move |(_i, x)| { + let _ = $array.dist_iter().enumerate().for_each(move |(_i, x)| { // println!("{:?} {:?}", i, x.load()); x.store($init_val) }); @@ -19,12 +19,12 @@ macro_rules! initialize_array { $array.barrier(); }; (LocalLockArray,$array:ident,$init_val:ident) => { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); $array.wait_all(); $array.barrier(); }; (GlobalLockArray,$array:ident,$init_val:ident) => { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); $array.wait_all(); $array.barrier(); }; @@ -287,7 +287,7 @@ macro_rules! fetch_add_test{ macro_rules! initialize_array2 { (UnsafeArray,$array:ident,$init_val:ident) => { #[allow(unused_unsafe)] - unsafe { + let _ = unsafe { $array .dist_iter_mut() .enumerate() @@ -297,7 +297,7 @@ macro_rules! initialize_array2 { $array.barrier(); }; (AtomicArray,$array:ident,$init_val:ident) => { - $array.dist_iter().enumerate().for_each(move |(i, x)| { + let _ = $array.dist_iter().enumerate().for_each(move |(i, x)| { // println!("{:?} {:?}", i, x.load()); x.store(i) }); @@ -305,7 +305,7 @@ macro_rules! initialize_array2 { $array.barrier(); }; (LocalLockArray,$array:ident,$init_val:ident) => { - $array + let _ = $array .dist_iter_mut() .enumerate() .for_each(move |(i, x)| *x = i); @@ -313,7 +313,7 @@ macro_rules! initialize_array2 { $array.barrier(); }; (GlobalLockArray,$array:ident,$init_val:ident) => { - $array + let _ = $array .dist_iter_mut() .enumerate() .for_each(move |(i, x)| *x = i); @@ -407,11 +407,11 @@ macro_rules! 
input_test{ initialize_array2!($array, array, init_val); if $dist == lamellar::array::Distribution::Block{ #[allow(unused_unsafe)] - unsafe { input_array.dist_iter_mut().enumerate().for_each(move |(i,x)| {/*println!("i: {:?}",i);*/ *x = i%array_total_len});} + let _ = unsafe { input_array.dist_iter_mut().enumerate().for_each(move |(i,x)| {/*println!("i: {:?}",i);*/ *x = i%array_total_len})}; } else{ #[allow(unused_unsafe)] - unsafe { input_array.dist_iter_mut().enumerate().for_each(move |(i,x)| {/*println!("i: {:?}",i);*/ *x = i/num_pes});} + let _ = unsafe { input_array.dist_iter_mut().enumerate().for_each(move |(i,x)| {/*println!("i: {:?}",i);*/ *x = i/num_pes})}; } array.wait_all(); diff --git a/tests/array/arithmetic_ops/fetch_div_test.rs b/tests/array/arithmetic_ops/fetch_div_test.rs index 9e5e97bf..855c5072 100644 --- a/tests/array/arithmetic_ops/fetch_div_test.rs +++ b/tests/array/arithmetic_ops/fetch_div_test.rs @@ -2,27 +2,27 @@ use lamellar::array::prelude::*; macro_rules! initialize_array { (UnsafeArray,$array:ident,$init_val:ident) => { - unsafe { $array.dist_iter_mut().for_each(move |x| *x = $init_val) }; + let _ = unsafe { $array.dist_iter_mut().for_each(move |x| *x = $init_val) }; $array.wait_all(); $array.barrier(); }; (AtomicArray,$array:ident,$init_val:ident) => { - $array.dist_iter().for_each(move |x| x.store($init_val)); + let _ = $array.dist_iter().for_each(move |x| x.store($init_val)); $array.wait_all(); $array.barrier(); }; (GenericAtomicArray,$array:ident,$init_val:ident) => { - $array.dist_iter().for_each(move |x| x.store($init_val)); + let _ = $array.dist_iter().for_each(move |x| x.store($init_val)); $array.wait_all(); $array.barrier(); }; (LocalLockArray,$array:ident,$init_val:ident) => { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); $array.wait_all(); $array.barrier(); }; (GlobalLockArray,$array:ident,$init_val:ident) => { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); $array.wait_all(); $array.barrier(); }; diff --git a/tests/array/arithmetic_ops/fetch_mul_test.rs b/tests/array/arithmetic_ops/fetch_mul_test.rs index 0d15d7ed..94bc1c55 100644 --- a/tests/array/arithmetic_ops/fetch_mul_test.rs +++ b/tests/array/arithmetic_ops/fetch_mul_test.rs @@ -2,22 +2,22 @@ use lamellar::array::prelude::*; macro_rules! 
initialize_array { (UnsafeArray,$array:ident,$init_val:ident) => { - unsafe { $array.dist_iter_mut().for_each(move |x| *x = $init_val) }; + let _ = unsafe { $array.dist_iter_mut().for_each(move |x| *x = $init_val) }; $array.wait_all(); $array.barrier(); }; (AtomicArray,$array:ident,$init_val:ident) => { - $array.dist_iter().for_each(move |x| x.store($init_val)); + let _ = $array.dist_iter().for_each(move |x| x.store($init_val)); $array.wait_all(); $array.barrier(); }; (LocalLockArray,$array:ident,$init_val:ident) => { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); $array.wait_all(); $array.barrier(); }; (GlobalLockArray,$array:ident,$init_val:ident) => { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); $array.wait_all(); $array.barrier(); }; diff --git a/tests/array/arithmetic_ops/fetch_sub_test.rs b/tests/array/arithmetic_ops/fetch_sub_test.rs index e45b41f5..a4d7f340 100644 --- a/tests/array/arithmetic_ops/fetch_sub_test.rs +++ b/tests/array/arithmetic_ops/fetch_sub_test.rs @@ -6,23 +6,23 @@ use rand::distributions::Uniform; macro_rules! initialize_array { (UnsafeArray,$array:ident,$init_val:ident) => { unsafe { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); } $array.wait_all(); $array.barrier(); }; (AtomicArray,$array:ident,$init_val:ident) => { - $array.dist_iter().for_each(move |x| x.store($init_val)); + let _ = $array.dist_iter().for_each(move |x| x.store($init_val)); $array.wait_all(); $array.barrier(); }; (LocalLockArray,$array:ident,$init_val:ident) => { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); $array.wait_all(); $array.barrier(); }; (GlobalLockArray,$array:ident,$init_val:ident) => { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); $array.wait_all(); $array.barrier(); }; diff --git a/tests/array/arithmetic_ops/mul_test.rs b/tests/array/arithmetic_ops/mul_test.rs index 26916bd5..690861aa 100644 --- a/tests/array/arithmetic_ops/mul_test.rs +++ b/tests/array/arithmetic_ops/mul_test.rs @@ -2,23 +2,23 @@ use lamellar::array::prelude::*; macro_rules! initialize_array { (UnsafeArray,$array:ident,$init_val:ident) => { unsafe { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); } $array.wait_all(); $array.barrier(); }; (AtomicArray,$array:ident,$init_val:ident) => { - $array.dist_iter().for_each(move |x| x.store($init_val)); + let _ = $array.dist_iter().for_each(move |x| x.store($init_val)); $array.wait_all(); $array.barrier(); }; (LocalLockArray,$array:ident,$init_val:ident) => { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); $array.wait_all(); $array.barrier(); }; (GlobalLockArray,$array:ident,$init_val:ident) => { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); $array.wait_all(); $array.barrier(); }; @@ -86,7 +86,7 @@ macro_rules! mul_test{ // array.print(); for idx in 0..array.len(){ for _i in 0..(max_updates as usize){ - array.mul(idx,2 as $t); + let _ = array.mul(idx,2 as $t); } } array.wait_all(); @@ -113,7 +113,7 @@ macro_rules! 
mul_test{ // // sub_array.print(); for idx in 0..sub_array.len(){ for _i in 0..(max_updates as usize){ - sub_array.mul(idx,2 as $t); + let _ = sub_array.mul(idx,2 as $t); } } sub_array.wait_all(); @@ -139,7 +139,7 @@ macro_rules! mul_test{ sub_array.barrier(); for idx in 0..sub_array.len(){ for _i in 0..(max_updates as usize){ - sub_array.mul(idx,2 as $t); + let _ = sub_array.mul(idx,2 as $t); } } sub_array.wait_all(); diff --git a/tests/array/arithmetic_ops/sub_test.rs b/tests/array/arithmetic_ops/sub_test.rs index 93bca8e8..2ce5405f 100644 --- a/tests/array/arithmetic_ops/sub_test.rs +++ b/tests/array/arithmetic_ops/sub_test.rs @@ -5,22 +5,22 @@ use rand::distributions::Uniform; macro_rules! initialize_array { (UnsafeArray,$array:ident,$init_val:ident) => { - unsafe { $array.dist_iter_mut().for_each(move |x| *x = $init_val) }; + let _ = unsafe { $array.dist_iter_mut().for_each(move |x| *x = $init_val) }; $array.wait_all(); $array.barrier(); }; (AtomicArray,$array:ident,$init_val:ident) => { - $array.dist_iter().for_each(move |x| x.store($init_val)); + let _ = $array.dist_iter().for_each(move |x| x.store($init_val)); $array.wait_all(); $array.barrier(); }; (LocalLockArray,$array:ident,$init_val:ident) => { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); $array.wait_all(); $array.barrier(); }; (GlobalLockArray,$array:ident,$init_val:ident) => { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); $array.wait_all(); $array.barrier(); }; @@ -85,7 +85,7 @@ macro_rules! sub_test{ for idx in 0..array.len(){ for _i in 0..(pe_max_val as usize){ - array.sub(idx,1 as $t); + let _ = array.sub(idx,1 as $t); } } array.wait_all(); @@ -107,7 +107,7 @@ macro_rules! sub_test{ for _i in 0..num_updates as usize{ let idx = rand_idx.sample(&mut rng); - array.sub(idx,1 as $t); + let _ = array.sub(idx,1 as $t); } array.wait_all(); array.barrier(); @@ -133,7 +133,7 @@ macro_rules! sub_test{ // sub_array.print(); for idx in 0..sub_array.len(){ for _i in 0..(pe_max_val as usize){ - sub_array.sub(idx,1 as $t); + let _ = sub_array.sub(idx,1 as $t); } } sub_array.wait_all(); @@ -155,7 +155,7 @@ macro_rules! sub_test{ for _i in 0..num_updates as usize{ let idx = rand_idx.sample(&mut rng); - sub_array.sub(idx,1 as $t); + let _ = sub_array.sub(idx,1 as $t); } sub_array.wait_all(); sub_array.barrier(); @@ -181,7 +181,7 @@ macro_rules! sub_test{ sub_array.barrier(); for idx in 0..sub_array.len(){ for _i in 0..(pe_max_val as usize){ - sub_array.sub(idx,1 as $t); + let _ = sub_array.sub(idx,1 as $t); } } sub_array.wait_all(); @@ -203,7 +203,7 @@ macro_rules! sub_test{ for _i in 0..num_updates as usize{ let idx = rand_idx.sample(&mut rng); - sub_array.sub(idx,1 as $t); + let _ = sub_array.sub(idx,1 as $t); } sub_array.wait_all(); sub_array.barrier(); diff --git a/tests/array/atomic_ops/compare_exchange_test.rs b/tests/array/atomic_ops/compare_exchange_test.rs index a0aa9047..c991aa4c 100644 --- a/tests/array/atomic_ops/compare_exchange_test.rs +++ b/tests/array/atomic_ops/compare_exchange_test.rs @@ -2,22 +2,22 @@ use lamellar::array::prelude::*; macro_rules! 
initialize_array { (UnsafeArray,$array:ident,$init_val:ident) => { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); $array.wait_all(); $array.barrier(); }; (AtomicArray,$array:ident,$init_val:ident) => { - $array.dist_iter().for_each(move |x| x.store($init_val)); + let _ = $array.dist_iter().for_each(move |x| x.store($init_val)); $array.wait_all(); $array.barrier(); }; (LocalLockArray,$array:ident,$init_val:ident) => { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); $array.wait_all(); $array.barrier(); }; (GlobalLockArray,$array:ident,$init_val:ident) => { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); $array.wait_all(); $array.barrier(); }; diff --git a/tests/array/atomic_ops/load_store_test.rs b/tests/array/atomic_ops/load_store_test.rs index f70cb1f3..0dd5150d 100644 --- a/tests/array/atomic_ops/load_store_test.rs +++ b/tests/array/atomic_ops/load_store_test.rs @@ -2,22 +2,22 @@ use lamellar::array::prelude::*; macro_rules! initialize_array { (UnsafeArray,$array:ident,$init_val:ident) => { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); $array.wait_all(); $array.barrier(); }; (AtomicArray,$array:ident,$init_val:ident) => { - $array.dist_iter().for_each(move |x| x.store($init_val)); + let _ = $array.dist_iter().for_each(move |x| x.store($init_val)); $array.wait_all(); $array.barrier(); }; (LocalLockArray,$array:ident,$init_val:ident) => { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); $array.wait_all(); $array.barrier(); }; (GlobalLockArray,$array:ident,$init_val:ident) => { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); $array.wait_all(); $array.barrier(); }; @@ -64,7 +64,7 @@ macro_rules! load_store_test{ array.barrier(); for idx in 0..array.len(){ if idx%num_pes == my_pe{ - array.store(idx,my_pe as $t); + let _ = array.store(idx,my_pe as $t); } } array.wait_all(); @@ -97,7 +97,7 @@ macro_rules! load_store_test{ sub_array.barrier(); for idx in 0..sub_array.len(){ if idx%num_pes == my_pe{ - sub_array.store(idx,my_pe as $t); + let _ = sub_array.store(idx,my_pe as $t); } } sub_array.wait_all(); @@ -133,7 +133,7 @@ macro_rules! load_store_test{ sub_array.barrier(); for idx in 0..sub_array.len(){ if idx%num_pes == my_pe{ - sub_array.store(idx,my_pe as $t); + let _ = sub_array.store(idx,my_pe as $t); } } sub_array.wait_all(); diff --git a/tests/array/atomic_ops/swap_test.rs b/tests/array/atomic_ops/swap_test.rs index ebc129c8..daec218a 100644 --- a/tests/array/atomic_ops/swap_test.rs +++ b/tests/array/atomic_ops/swap_test.rs @@ -2,22 +2,22 @@ use lamellar::array::prelude::*; macro_rules! 
initialize_array { (UnsafeArray,$array:ident,$init_val:ident) => { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); $array.wait_all(); $array.barrier(); }; (AtomicArray,$array:ident,$init_val:ident) => { - $array.dist_iter().for_each(move |x| x.store($init_val)); + let _ = $array.dist_iter().for_each(move |x| x.store($init_val)); $array.wait_all(); $array.barrier(); }; (LocalLockArray,$array:ident,$init_val:ident) => { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); $array.wait_all(); $array.barrier(); }; (GlobalLockArray,$array:ident,$init_val:ident) => { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); $array.wait_all(); $array.barrier(); }; diff --git a/tests/array/bitwise_ops/and_test.rs b/tests/array/bitwise_ops/and_test.rs index 8ce892de..3de097ec 100644 --- a/tests/array/bitwise_ops/and_test.rs +++ b/tests/array/bitwise_ops/and_test.rs @@ -3,23 +3,23 @@ use lamellar::array::prelude::*; macro_rules! initialize_array { (UnsafeArray,$array:ident,$init_val:ident) => { unsafe { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); } $array.wait_all(); $array.barrier(); }; (AtomicArray,$array:ident,$init_val:ident) => { - $array.dist_iter().for_each(move |x| x.store($init_val)); + let _ = $array.dist_iter().for_each(move |x| x.store($init_val)); $array.wait_all(); $array.barrier(); }; (LocalLockArray,$array:ident,$init_val:ident) => { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); $array.wait_all(); $array.barrier(); }; (GlobalLockArray,$array:ident,$init_val:ident) => { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); $array.wait_all(); $array.barrier(); }; @@ -67,7 +67,7 @@ macro_rules! and_test{ array.barrier(); let my_val = !(1 as $t << my_pe); for idx in 0..array.len(){ - array.bit_and(idx,my_val); + let _ = array.bit_and(idx,my_val); } array.wait_all(); @@ -95,7 +95,7 @@ macro_rules! and_test{ sub_array.barrier(); // sub_array.print(); for idx in 0..sub_array.len(){ - sub_array.bit_and(idx,my_val); + let _ = sub_array.bit_and(idx,my_val); } sub_array.wait_all(); sub_array.barrier(); @@ -123,7 +123,7 @@ macro_rules! and_test{ let sub_array = array.sub_array(start_i..end_i); sub_array.barrier(); for idx in 0..sub_array.len(){ - sub_array.bit_and(idx,my_val); + let _ = sub_array.bit_and(idx,my_val); } sub_array.wait_all(); sub_array.barrier(); diff --git a/tests/array/bitwise_ops/fetch_and_test.rs b/tests/array/bitwise_ops/fetch_and_test.rs index 1789ac03..253b495c 100644 --- a/tests/array/bitwise_ops/fetch_and_test.rs +++ b/tests/array/bitwise_ops/fetch_and_test.rs @@ -3,23 +3,23 @@ use lamellar::array::prelude::*; macro_rules! 
initialize_array { (UnsafeArray,$array:ident,$init_val:ident) => { unsafe { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); } $array.wait_all(); $array.barrier(); }; (AtomicArray,$array:ident,$init_val:ident) => { - $array.dist_iter().for_each(move |x| x.store($init_val)); + let _ = $array.dist_iter().for_each(move |x| x.store($init_val)); $array.wait_all(); $array.barrier(); }; (LocalLockArray,$array:ident,$init_val:ident) => { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); $array.wait_all(); $array.barrier(); }; (GlobalLockArray,$array:ident,$init_val:ident) => { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); $array.wait_all(); $array.barrier(); }; diff --git a/tests/array/bitwise_ops/fetch_or_test.rs b/tests/array/bitwise_ops/fetch_or_test.rs index d1c76d69..12673c6a 100644 --- a/tests/array/bitwise_ops/fetch_or_test.rs +++ b/tests/array/bitwise_ops/fetch_or_test.rs @@ -3,23 +3,23 @@ use lamellar::array::prelude::*; macro_rules! initialize_array { (UnsafeArray,$array:ident,$init_val:ident) => { unsafe { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); } $array.wait_all(); $array.barrier(); }; (AtomicArray,$array:ident,$init_val:ident) => { - $array.dist_iter().for_each(move |x| x.store($init_val)); + let _ = $array.dist_iter().for_each(move |x| x.store($init_val)); $array.wait_all(); $array.barrier(); }; (LocalLockArray,$array:ident,$init_val:ident) => { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); $array.wait_all(); $array.barrier(); }; (GlobalLockArray,$array:ident,$init_val:ident) => { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); $array.wait_all(); $array.barrier(); }; diff --git a/tests/array/bitwise_ops/fetch_xor_test.rs b/tests/array/bitwise_ops/fetch_xor_test.rs index fae327c4..a8824169 100644 --- a/tests/array/bitwise_ops/fetch_xor_test.rs +++ b/tests/array/bitwise_ops/fetch_xor_test.rs @@ -3,23 +3,23 @@ use lamellar::array::prelude::*; macro_rules! initialize_array { (UnsafeArray,$array:ident,$init_val:ident) => { unsafe { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); } $array.wait_all(); $array.barrier(); }; (AtomicArray,$array:ident,$init_val:ident) => { - $array.dist_iter().for_each(move |x| x.store($init_val)); + let _ = $array.dist_iter().for_each(move |x| x.store($init_val)); $array.wait_all(); $array.barrier(); }; (LocalLockArray,$array:ident,$init_val:ident) => { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); $array.wait_all(); $array.barrier(); }; (GlobalLockArray,$array:ident,$init_val:ident) => { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); $array.wait_all(); $array.barrier(); }; diff --git a/tests/array/bitwise_ops/or_test.rs b/tests/array/bitwise_ops/or_test.rs index b3c22641..3c3b37fb 100644 --- a/tests/array/bitwise_ops/or_test.rs +++ b/tests/array/bitwise_ops/or_test.rs @@ -3,23 +3,23 @@ use lamellar::array::prelude::*; macro_rules! 
initialize_array { (UnsafeArray,$array:ident,$init_val:ident) => { unsafe { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); } $array.wait_all(); $array.barrier(); }; (AtomicArray,$array:ident,$init_val:ident) => { - $array.dist_iter().for_each(move |x| x.store($init_val)); + let _ = $array.dist_iter().for_each(move |x| x.store($init_val)); $array.wait_all(); $array.barrier(); }; (LocalLockArray,$array:ident,$init_val:ident) => { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); $array.wait_all(); $array.barrier(); }; (GlobalLockArray,$array:ident,$init_val:ident) => { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); $array.wait_all(); $array.barrier(); }; @@ -67,7 +67,7 @@ macro_rules! or_test{ array.barrier(); let my_val = 1 as $t << my_pe; for idx in 0..array.len(){ - array.bit_or(idx,my_val); + let _ = array.bit_or(idx,my_val); } array.wait_all(); @@ -95,7 +95,7 @@ macro_rules! or_test{ sub_array.barrier(); // sub_array.print(); for idx in 0..sub_array.len(){ - sub_array.bit_or(idx,my_val); + let _ = sub_array.bit_or(idx,my_val); } sub_array.wait_all(); sub_array.barrier(); @@ -123,7 +123,7 @@ macro_rules! or_test{ let sub_array = array.sub_array(start_i..end_i); sub_array.barrier(); for idx in 0..sub_array.len(){ - sub_array.bit_or(idx,my_val); + let _ = sub_array.bit_or(idx,my_val); } sub_array.wait_all(); sub_array.barrier(); diff --git a/tests/array/bitwise_ops/xor_test.rs b/tests/array/bitwise_ops/xor_test.rs index ec8ad457..cb4820a1 100644 --- a/tests/array/bitwise_ops/xor_test.rs +++ b/tests/array/bitwise_ops/xor_test.rs @@ -3,23 +3,23 @@ use lamellar::array::prelude::*; macro_rules! initialize_array { (UnsafeArray,$array:ident,$init_val:ident) => { unsafe { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); } $array.wait_all(); $array.barrier(); }; (AtomicArray,$array:ident,$init_val:ident) => { - $array.dist_iter().for_each(move |x| x.store($init_val)); + let _ = $array.dist_iter().for_each(move |x| x.store($init_val)); $array.wait_all(); $array.barrier(); }; (LocalLockArray,$array:ident,$init_val:ident) => { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); $array.wait_all(); $array.barrier(); }; (GlobalLockArray,$array:ident,$init_val:ident) => { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); $array.wait_all(); $array.barrier(); }; @@ -67,7 +67,7 @@ macro_rules! xor_test{ array.barrier(); let my_val = 1 as $t << my_pe; for idx in 0..array.len(){ - array.bit_xor(idx,my_val); + let _ = array.bit_xor(idx,my_val); } array.wait_all(); @@ -95,7 +95,7 @@ macro_rules! xor_test{ sub_array.barrier(); // sub_array.print(); for idx in 0..sub_array.len(){ - sub_array.bit_xor(idx,my_val); + let _ = sub_array.bit_xor(idx,my_val); } sub_array.wait_all(); sub_array.barrier(); @@ -123,7 +123,7 @@ macro_rules! 
xor_test{ let sub_array = array.sub_array(start_i..end_i); sub_array.barrier(); for idx in 0..sub_array.len(){ - sub_array.bit_xor(idx,my_val); + let _ = sub_array.bit_xor(idx,my_val); } sub_array.wait_all(); sub_array.barrier(); diff --git a/tests/array/rdma/blocking_get_test.rs b/tests/array/rdma/blocking_get_test.rs index 58716d24..ea32b569 100644 --- a/tests/array/rdma/blocking_get_test.rs +++ b/tests/array/rdma/blocking_get_test.rs @@ -24,7 +24,7 @@ fn initialize_mem_region( macro_rules! initialize_array { (UnsafeArray,$array:ident,$t:ty) => { unsafe { - $array + let _ = $array .dist_iter_mut() .enumerate() .for_each(move |(i, x)| *x = i as $t); @@ -32,21 +32,21 @@ macro_rules! initialize_array { $array.wait_all(); }; (AtomicArray,$array:ident,$t:ty) => { - $array + let _ = $array .dist_iter() .enumerate() .for_each(move |(i, x)| x.store(i as $t)); $array.wait_all(); }; (LocalLockArray,$array:ident,$t:ty) => { - $array + let _ = $array .dist_iter_mut() .enumerate() .for_each(move |(i, x)| *x = i as $t); $array.wait_all(); }; (GlobalLockArray,$array:ident,$t:ty) => { - $array + let _ = $array .dist_iter_mut() .enumerate() .for_each(move |(i, x)| *x = i as $t); @@ -55,7 +55,8 @@ macro_rules! initialize_array { (ReadOnlyArray,$array:ident,$t:ty) => { let temp = $array.into_unsafe(); unsafe { - temp.dist_iter_mut() + let _ = temp + .dist_iter_mut() .enumerate() .for_each(move |(i, x)| *x = i as $t); } @@ -68,7 +69,7 @@ macro_rules! initialize_array_range { (UnsafeArray,$array:ident,$t:ty,$range:expr) => {{ let subarray = $array.sub_array($range); unsafe { - subarray + let _ = subarray .dist_iter_mut() .enumerate() .for_each(move |(i, x)| *x = i as $t); @@ -77,7 +78,7 @@ macro_rules! initialize_array_range { }}; (AtomicArray,$array:ident,$t:ty,$range:expr) => {{ let subarray = $array.sub_array($range); - subarray + let _ = subarray .dist_iter() .enumerate() .for_each(move |(i, x)| x.store(i as $t)); @@ -85,7 +86,7 @@ macro_rules! initialize_array_range { }}; (LocalLockArray,$array:ident,$t:ty,$range:expr) => {{ let subarray = $array.sub_array($range); - subarray + let _ = subarray .dist_iter_mut() .enumerate() .for_each(move |(i, x)| *x = i as $t); @@ -93,7 +94,7 @@ macro_rules! initialize_array_range { }}; (GlobalLockArray,$array:ident,$t:ty,$range:expr) => {{ let subarray = $array.sub_array($range); - subarray + let _ = subarray .dist_iter_mut() .enumerate() .for_each(move |(i, x)| *x = i as $t); @@ -103,7 +104,7 @@ macro_rules! initialize_array_range { let temp = $array.into_unsafe(); let subarray = temp.sub_array($range); unsafe { - subarray + let _ = subarray .dist_iter_mut() .enumerate() .for_each(move |(i, x)| *x = i as $t); diff --git a/tests/array/rdma/get_test.rs b/tests/array/rdma/get_test.rs index 326262e6..18641567 100644 --- a/tests/array/rdma/get_test.rs +++ b/tests/array/rdma/get_test.rs @@ -18,7 +18,7 @@ fn initialize_mem_region( macro_rules! initialize_array { (UnsafeArray,$array:ident,$t:ty) => { unsafe { - $array + let _ = $array .dist_iter_mut() .enumerate() .for_each(move |(i, x)| *x = i as $t); @@ -26,21 +26,21 @@ macro_rules! 
initialize_array { } }; (AtomicArray,$array:ident,$t:ty) => { - $array + let _ = $array .dist_iter() .enumerate() .for_each(move |(i, x)| x.store(i as $t)); $array.wait_all(); }; (LocalLockArray,$array:ident,$t:ty) => { - $array + let _ = $array .dist_iter_mut() .enumerate() .for_each(move |(i, x)| *x = i as $t); $array.wait_all(); }; (GlobalLockArray,$array:ident,$t:ty) => { - $array + let _ = $array .dist_iter_mut() .enumerate() .for_each(move |(i, x)| *x = i as $t); @@ -51,7 +51,8 @@ macro_rules! initialize_array { let temp = $array.into_unsafe(); // println!("unsafe"); unsafe { - temp.dist_iter_mut() + let _ = temp + .dist_iter_mut() .enumerate() .for_each(move |(i, x)| *x = i as $t); temp.wait_all(); @@ -64,7 +65,7 @@ macro_rules! initialize_array_range { (UnsafeArray,$array:ident,$t:ty,$range:expr) => {{ unsafe { let subarray = $array.sub_array($range); - subarray + let _ = subarray .dist_iter_mut() .enumerate() .for_each(move |(i, x)| *x = i as $t); @@ -73,7 +74,7 @@ macro_rules! initialize_array_range { }}; (AtomicArray,$array:ident,$t:ty,$range:expr) => {{ let subarray = $array.sub_array($range); - subarray + let _ = subarray .dist_iter() .enumerate() .for_each(move |(i, x)| x.store(i as $t)); @@ -81,7 +82,7 @@ macro_rules! initialize_array_range { }}; (LocalLockArray,$array:ident,$t:ty,$range:expr) => {{ let subarray = $array.sub_array($range); - subarray + let _ = subarray .dist_iter_mut() .enumerate() .for_each(move |(i, x)| *x = i as $t); @@ -89,7 +90,7 @@ macro_rules! initialize_array_range { }}; (GlobalLockArray,$array:ident,$t:ty,$range:expr) => {{ let subarray = $array.sub_array($range); - subarray + let _ = subarray .dist_iter_mut() .enumerate() .for_each(move |(i, x)| *x = i as $t); @@ -101,7 +102,7 @@ macro_rules! initialize_array_range { // println!("unsafe"); unsafe { let subarray = temp.sub_array($range); - subarray + let _ = subarray .dist_iter_mut() .enumerate() .for_each(move |(i, x)| *x = i as $t); @@ -141,7 +142,7 @@ macro_rules! get_test{ let num_txs = mem_seg_len/tx_size; for tx in (0..num_txs){ // unsafe{println!("tx_size {:?} tx {:?} sindex: {:?} eindex: {:?} {:?}",tx_size,tx, tx*tx_size,std::cmp::min(mem_seg_len,(tx+1)*tx_size),&shared_mem_region.sub_region(tx*tx_size..std::cmp::min(mem_seg_len,(tx+1)*tx_size)).as_slice());} - unsafe {array.get(tx*tx_size,&shared_mem_region.sub_region(tx*tx_size..std::cmp::min(mem_seg_len,(tx+1)*tx_size)));} + unsafe {let _ = array.get(tx*tx_size,&shared_mem_region.sub_region(tx*tx_size..std::cmp::min(mem_seg_len,(tx+1)*tx_size)));} } array.wait_all(); array.barrier(); @@ -181,7 +182,7 @@ macro_rules! get_test{ let num_txs = half_len/tx_size; for tx in (0..num_txs){ // unsafe{println!("tx_size {:?} tx {:?} sindex: {:?} eindex: {:?} {:?}",tx_size,tx, tx*tx_size,std::cmp::min(half_len,(tx+1)*tx_size),&shared_mem_region.sub_region(tx*tx_size..std::cmp::min(half_len,(tx+1)*tx_size)).as_slice());} - unsafe {sub_array.get(tx*tx_size,&shared_mem_region.sub_region(tx*tx_size..std::cmp::min(half_len,(tx+1)*tx_size)));} + unsafe {let _ = sub_array.get(tx*tx_size,&shared_mem_region.sub_region(tx*tx_size..std::cmp::min(half_len,(tx+1)*tx_size)));} } sub_array.wait_all(); sub_array.barrier(); @@ -226,7 +227,7 @@ macro_rules! 
get_test{ let num_txs = len/tx_size; for tx in (0..num_txs){ // unsafe{println!("tx_size {:?} tx {:?} sindex: {:?} eindex: {:?} {:?}",tx_size,tx, tx*tx_size,std::cmp::min(len,(tx+1)*tx_size),&shared_mem_region.sub_region(tx*tx_size..std::cmp::min(mem_seg_len,(tx+1)*tx_size)).as_slice());} - unsafe {sub_array.get(tx*tx_size,&shared_mem_region.sub_region(tx*tx_size..std::cmp::min(len,(tx+1)*tx_size))); } + unsafe {let _ = sub_array.get(tx*tx_size,&shared_mem_region.sub_region(tx*tx_size..std::cmp::min(len,(tx+1)*tx_size))); } } sub_array.wait_all(); sub_array.barrier(); diff --git a/tests/array/rdma/put_test.rs b/tests/array/rdma/put_test.rs index c0c049df..ff66fd62 100644 --- a/tests/array/rdma/put_test.rs +++ b/tests/array/rdma/put_test.rs @@ -18,17 +18,17 @@ fn initialize_mem_region( macro_rules! initialize_array { (UnsafeArray,$array:ident,$init_val:ident) => { unsafe { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); } }; (AtomicArray,$array:ident,$init_val:ident) => { - $array.dist_iter().for_each(move |x| x.store($init_val)); + let _ = $array.dist_iter().for_each(move |x| x.store($init_val)); }; (LocalLockArray,$array:ident,$init_val:ident) => { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); }; (GlobalLockArray,$array:ident,$init_val:ident) => { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); }; } @@ -57,7 +57,7 @@ macro_rules! put_test{ for tx in (my_pe..num_txs).step_by(num_pes){ // unsafe{println!("tx_size {:?} tx {:?} sindex: {:?} eindex: {:?} {:?}",tx_size,tx, tx*tx_size,std::cmp::min(mem_seg_len,(tx+1)*tx_size),&shared_mem_region.sub_region(tx*tx_size..std::cmp::min(mem_seg_len,(tx+1)*tx_size)).as_slice());} #[allow(unused_unsafe)] - unsafe {array.put(tx*tx_size,&shared_mem_region.sub_region(tx*tx_size..std::cmp::min(mem_seg_len,(tx+1)*tx_size)));} + unsafe {let _ = array.put(tx*tx_size,&shared_mem_region.sub_region(tx*tx_size..std::cmp::min(mem_seg_len,(tx+1)*tx_size)));} } array.wait_all(); array.barrier(); @@ -91,7 +91,7 @@ macro_rules! put_test{ for tx in (my_pe..num_txs).step_by(num_pes){ // unsafe{println!("tx_size {:?} tx {:?} sindex: {:?} eindex: {:?} {:?}",tx_size,tx, tx*tx_size,std::cmp::min(half_len,(tx+1)*tx_size),&shared_mem_region.sub_region(tx*tx_size..std::cmp::min(half_len,(tx+1)*tx_size)).as_slice());} #[allow(unused_unsafe)] - unsafe {sub_array.put(tx*tx_size,&shared_mem_region.sub_region(tx*tx_size..std::cmp::min(half_len,(tx+1)*tx_size)));} + unsafe {let _ = sub_array.put(tx*tx_size,&shared_mem_region.sub_region(tx*tx_size..std::cmp::min(half_len,(tx+1)*tx_size)));} } array.wait_all(); sub_array.barrier(); @@ -128,7 +128,7 @@ macro_rules! put_test{ for tx in (my_pe..num_txs).step_by(num_pes){ // unsafe{println!("tx_size {:?} tx {:?} sindex: {:?} eindex: {:?} {:?}",tx_size,tx, tx*tx_size,std::cmp::min(len,(tx+1)*tx_size),&shared_mem_region.sub_region(tx*tx_size..std::cmp::min(mem_seg_len,(tx+1)*tx_size)).as_slice());} #[allow(unused_unsafe)] - unsafe {sub_array.put(tx*tx_size,&shared_mem_region.sub_region(tx*tx_size..std::cmp::min(len,(tx+1)*tx_size)));} + unsafe {let _ = sub_array.put(tx*tx_size,&shared_mem_region.sub_region(tx*tx_size..std::cmp::min(len,(tx+1)*tx_size)));} } array.wait_all(); sub_array.barrier(); From dff04c7460b20ea1b43253b67fdf2220f90f2d3b Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Fri, 26 Jan 2024 22:05:55 -0800 Subject: [PATCH 008/116] refactoring to suppport different executor backends + tokio backend --- Cargo.toml | 5 +- examples/array_examples/global_lock_array.rs | 6 +- .../global_lock_atomic_array_put_bw.rs | 2 +- .../local_lock_atomic_array_put_bw.rs | 3 +- examples/darc_examples/darc.rs | 6 +- .../safe_parallel_blocked_array_gemm.rs | 2 +- src/active_messaging.rs | 90 ++- src/active_messaging/batching.rs | 88 ++- .../batching/simple_batcher.rs | 207 +++--- .../batching/team_am_batcher.rs | 345 +++++----- .../registered_active_message.rs | 100 ++- src/array.rs | 34 +- src/array/atomic.rs | 10 +- src/array/generic_atomic.rs | 15 +- src/array/global_lock_atomic.rs | 362 +++++----- src/array/global_lock_atomic/iteration.rs | 15 +- .../distributed_iterator/consumer/count.rs | 4 +- .../distributed_iterator/consumer/reduce.rs | 2 +- .../iterator/one_sided_iterator/buffered.rs | 4 - src/array/local_lock_atomic.rs | 354 ++++------ src/array/local_lock_atomic/iteration.rs | 21 +- src/array/native_atomic.rs | 15 +- src/array/operations.rs | 71 +- src/array/read_only.rs | 15 +- src/array/unsafe.rs | 21 +- src/array/unsafe/operations.rs | 103 ++- src/barrier.rs | 29 +- src/darc.rs | 28 +- src/darc/global_rw_darc.rs | 407 +++++------ src/darc/local_rw_darc.rs | 337 +++++---- src/lamellae/command_queues.rs | 5 +- src/lamellae/rofi_lamellae.rs | 2 +- src/lamellae/shmem_lamellae.rs | 2 +- src/lamellar_request.rs | 2 +- src/lamellar_task_group.rs | 2 +- src/lamellar_team.rs | 12 +- src/lamellar_world.rs | 62 +- src/lib.rs | 2 +- src/scheduler.rs | 337 ++++++--- src/scheduler/numa_work_stealing.rs | 7 +- src/scheduler/numa_work_stealing2.rs | 2 +- src/scheduler/tokio.rs | 88 +++ src/scheduler/work_stealing.rs | 637 +++--------------- tests/array/arithmetic_ops/add_test.rs | 4 +- tests/array/arithmetic_ops/fetch_add_test.rs | 4 +- 45 files changed, 1692 insertions(+), 2177 deletions(-) create mode 100644 src/scheduler/tokio.rs diff --git a/Cargo.toml b/Cargo.toml index e33345bd..fd38e93d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -53,7 +53,7 @@ pin-project = "1.0.12" serde_with = "3.0.0" pin-weak = "1.1.0" async-lock = "2.8.0" -itertools = "0.12.1" +tokio = { version = "1.35.1", features = ["full"] , optional = true} [dev-dependencies] @@ -75,13 +75,14 @@ members = ["impl"] #features are strictly additive.... 
can't have mutual exclusitivity [features] enable-rofi=["rofisys", "libc"] +tokio-executor=["tokio"] slurm-test=[] default=[] [profile.release] opt-level=3 -lto=true +lto=false codegen-units=1 debug = true diff --git a/examples/array_examples/global_lock_array.rs b/examples/array_examples/global_lock_array.rs index 81c0420c..8b904396 100644 --- a/examples/array_examples/global_lock_array.rs +++ b/examples/array_examples/global_lock_array.rs @@ -9,7 +9,7 @@ fn main() { let array = GlobalLockArray::::new(&world, 100, Distribution::Block); let s = Instant::now(); - let local_data = array.block_on(array.read_local_data()); + let local_data = array.blocking_read_local_data(); println!( "PE{my_pe} time: {:?} {:?}", s.elapsed().as_secs_f64(), @@ -19,7 +19,7 @@ fn main() { drop(local_data); //release the lock world.barrier(); - let mut local_data = array.block_on(array.write_local_data()); + let mut local_data = array.blocking_write_local_data(); println!( "PE{my_pe} time: {:?} got write lock", s.elapsed().as_secs_f64() @@ -31,7 +31,7 @@ fn main() { array.print(); println!("PE{my_pe} time: {:?} done", s.elapsed().as_secs_f64()); - let mut local_data = array.block_on(array.collective_write_local_data()); + let mut local_data = array.blocking_collective_write_local_data(); println!( "PE{my_pe} time: {:?} got collective write lock", s.elapsed().as_secs_f64() diff --git a/examples/bandwidths/global_lock_atomic_array_put_bw.rs b/examples/bandwidths/global_lock_atomic_array_put_bw.rs index 7ae804b4..2c072a7e 100644 --- a/examples/bandwidths/global_lock_atomic_array_put_bw.rs +++ b/examples/bandwidths/global_lock_atomic_array_put_bw.rs @@ -67,7 +67,7 @@ fn main() { let cur_t = timer.elapsed().as_secs_f64(); if my_pe == 0 { for j in (0..2_u64.pow(exp) as usize).step_by(num_bytes as usize) { - let local_data = array.block_on(array.read_local_data()); + let local_data = array.blocking_read_local_data(); while *(&local_data[(j + num_bytes as usize) - 1]) == 255 as u8 { println!( "this should not happen {:?}", diff --git a/examples/bandwidths/local_lock_atomic_array_put_bw.rs b/examples/bandwidths/local_lock_atomic_array_put_bw.rs index 19b963a5..d75402eb 100644 --- a/examples/bandwidths/local_lock_atomic_array_put_bw.rs +++ b/examples/bandwidths/local_lock_atomic_array_put_bw.rs @@ -67,7 +67,8 @@ fn main() { let cur_t = timer.elapsed().as_secs_f64(); if my_pe == num_pes - 1 { for j in (0..2_u64.pow(exp) as usize).step_by(num_bytes as usize) { - let local_data = array.block_on(array.read_local_data()); + let array_clone = array.clone(); + let local_data = array.blocking_read_local_data(); while *(&local_data[(j + num_bytes as usize) - 1]) == 255 as u8 { println!( "this should not happen {:?}", diff --git a/examples/darc_examples/darc.rs b/examples/darc_examples/darc.rs index ffa32011..db50e19b 100644 --- a/examples/darc_examples/darc.rs +++ b/examples/darc_examples/darc.rs @@ -61,10 +61,10 @@ fn main() { let global_darc = GlobalRwDarc::new(world.team(), 0).unwrap(); println!("here 2"); - let read_lock = world.block_on(global_darc.read()); + let read_lock = global_darc.blocking_read(); println!("I have the read lock!!!! {:?}", my_pe); drop(read_lock); - let write_lock = world.block_on(global_darc.write()); + let write_lock = global_darc.blocking_write(); println!("I have the write lock!!!! 
{:?}", my_pe); std::thread::sleep(std::time::Duration::from_secs(1)); drop(write_lock); @@ -112,7 +112,7 @@ fn main() { println!("here 8"); } else { // println!("here"); - *(*world.block_on(local_darc.write())) += 1; + *local_darc.blocking_write() += 1; } } // -------- diff --git a/examples/kernels/safe_parallel_blocked_array_gemm.rs b/examples/kernels/safe_parallel_blocked_array_gemm.rs index dd171fdd..a5ed9544 100644 --- a/examples/kernels/safe_parallel_blocked_array_gemm.rs +++ b/examples/kernels/safe_parallel_blocked_array_gemm.rs @@ -197,7 +197,7 @@ fn main() { ); } - let mut c_slice = c.block_on(c.write_local_data()); //this locks the array + let mut c_slice = c.blocking_write_local_data(); //this locks the array for row in 0..blocksize { let row_offset = (i_blk * blocksize + row) * n; diff --git a/src/active_messaging.rs b/src/active_messaging.rs index ea0b37d5..afc8ab6c 100644 --- a/src/active_messaging.rs +++ b/src/active_messaging.rs @@ -638,7 +638,7 @@ use crate::lamellar_arch::IdError; use crate::lamellar_request::{InternalResult, LamellarRequestResult}; use crate::lamellar_team::{LamellarTeam, LamellarTeamRT}; use crate::memregion::one_sided::NetMemRegionHandle; -use crate::scheduler::{ReqId, SchedulerQueue}; +use crate::scheduler::{Executor, LamellarExecutor, ReqId}; // use log::trace; use async_trait::async_trait; use futures::Future; @@ -856,9 +856,6 @@ pub(crate) enum Am { Return(ReqMetaData, LamellarArcAm), //req data, am to return and execute Data(ReqMetaData, LamellarResultArc), //req data, data to return Unit(ReqMetaData), //req data - _BatchedReturn(ReqMetaData, LamellarArcAm, ReqId), //req data, am to return and execute, batch id - _BatchedData(ReqMetaData, LamellarResultArc, ReqId), //req data, data to return, batch id - _BatchedUnit(ReqMetaData, ReqId), //req data, batch id } impl std::fmt::Debug for Am { @@ -870,9 +867,6 @@ impl std::fmt::Debug for Am { Am::Return(_, _) => write!(f, "Return"), Am::Data(_, _) => write!(f, "Data"), Am::Unit(_) => write!(f, "Unit"), - Am::_BatchedReturn(_, _, _) => write!(f, "BatchedReturn"), - Am::_BatchedData(_, _, _) => write!(f, "BatchedData"), - Am::_BatchedUnit(_, _) => write!(f, "BatchedUnit"), } } } @@ -1178,27 +1172,25 @@ pub trait ActiveMessaging { /// world_clone.exec_am_all(Am{val: buf[0] as usize}).await; /// }); ///``` - fn block_on(&self, f: F) -> F::Output - where - F: Future; + fn block_on(&self, f: F) -> F::Output; } #[async_trait] pub(crate) trait ActiveMessageEngine { async fn process_msg( - &self, + self, am: Am, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, + scheduler: Arc, stall_mark: usize, immediate: bool, ); async fn exec_msg( - &self, + self, msg: Msg, ser_data: SerializedData, lamellae: Arc, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, + scheduler: Arc, ); fn get_team_and_world( @@ -1232,39 +1224,39 @@ pub(crate) trait ActiveMessageEngine { } } -#[derive(Debug)] -pub(crate) enum ActiveMessageEngineType { - RegisteredActiveMessages(Arc), -} +// #[derive(Debug)] +// pub(crate) enum ActiveMessageEngineType { +// RegisteredActiveMessages(RegisteredActiveMessages), +// } -#[async_trait] -impl ActiveMessageEngine for ActiveMessageEngineType { - async fn process_msg( - &self, - am: Am, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - stall_mark: usize, - immediate: bool, - ) { - match self { - ActiveMessageEngineType::RegisteredActiveMessages(remote_am) => { - remote_am - .process_msg(am, scheduler, 
stall_mark, immediate) - .await; - } - } - } - async fn exec_msg( - &self, - msg: Msg, - ser_data: SerializedData, - lamellae: Arc, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - ) { - match self { - ActiveMessageEngineType::RegisteredActiveMessages(remote_am) => { - remote_am.exec_msg(msg, ser_data, lamellae, scheduler).await; - } - } - } -} +// #[async_trait] +// impl ActiveMessageEngine for ActiveMessageEngineType { +// async fn process_msg( +// self, +// am: Am, +// executor: Arc, +// stall_mark: usize, +// immediate: bool, +// ) { +// match self { +// ActiveMessageEngineType::RegisteredActiveMessages(remote_am) => { +// remote_am +// .process_msg(am, executor, stall_mark, immediate) +// .await; +// } +// } +// } +// async fn exec_msg( +// self, +// msg: Msg, +// ser_data: SerializedData, +// lamellae: Arc, +// executor: Arc, +// ) { +// match self { +// ActiveMessageEngineType::RegisteredActiveMessages(remote_am) => { +// remote_am.exec_msg(msg, ser_data, lamellae, executor).await; +// } +// } +// } +// } diff --git a/src/active_messaging/batching.rs b/src/active_messaging/batching.rs index 78447239..11882eb3 100644 --- a/src/active_messaging/batching.rs +++ b/src/active_messaging/batching.rs @@ -30,50 +30,41 @@ impl std::fmt::Debug for LamellarData { #[async_trait] pub(crate) trait Batcher { - fn add_remote_am_to_batch( + async fn add_remote_am_to_batch( &self, req_data: ReqMetaData, am: LamellarArcAm, am_id: AmId, am_size: usize, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, stall_mark: usize, ); - fn add_return_am_to_batch( + async fn add_return_am_to_batch( &self, req_data: ReqMetaData, am: LamellarArcAm, am_id: AmId, am_size: usize, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, stall_mark: usize, ); - fn add_data_am_to_batch( + async fn add_data_am_to_batch( &self, req_data: ReqMetaData, data: LamellarResultArc, data_size: usize, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - stall_mark: usize, - ); - fn add_unit_am_to_batch( - &self, - req_data: ReqMetaData, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, stall_mark: usize, ); + async fn add_unit_am_to_batch(&self, req_data: ReqMetaData, stall_mark: usize); async fn exec_batched_msg( &self, msg: Msg, ser_data: SerializedData, lamellae: Arc, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - ame: &Arc, - ); + ame: &RegisteredActiveMessages, + ) -> Vec; } -#[derive(Debug)] +#[derive(Debug, Clone)] pub(crate) enum BatcherType { Simple(SimpleBatcher), TeamAm(TeamAmBatcher), @@ -81,75 +72,79 @@ pub(crate) enum BatcherType { #[async_trait] impl Batcher for BatcherType { - //#[tracing::instrument(skip_all)] - fn add_remote_am_to_batch( + // #[tracing::instrument(skip_all)] + async fn add_remote_am_to_batch( &self, req_data: ReqMetaData, am: LamellarArcAm, am_id: AmId, am_size: usize, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, stall_mark: usize, ) { match self { BatcherType::Simple(batcher) => { - batcher.add_remote_am_to_batch(req_data, am, am_id, am_size, scheduler, stall_mark) + batcher + .add_remote_am_to_batch(req_data, am, am_id, am_size, stall_mark) + .await } BatcherType::TeamAm(batcher) => { - batcher.add_remote_am_to_batch(req_data, am, am_id, am_size, scheduler, stall_mark) + batcher + .add_remote_am_to_batch(req_data, am, am_id, am_size, stall_mark) + .await } } } 
- //#[tracing::instrument(skip_all)] - fn add_return_am_to_batch( + // #[tracing::instrument(skip_all)] + async fn add_return_am_to_batch( &self, req_data: ReqMetaData, am: LamellarArcAm, am_id: AmId, am_size: usize, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, stall_mark: usize, ) { match self { BatcherType::Simple(batcher) => { - batcher.add_return_am_to_batch(req_data, am, am_id, am_size, scheduler, stall_mark) + batcher + .add_return_am_to_batch(req_data, am, am_id, am_size, stall_mark) + .await } BatcherType::TeamAm(batcher) => { - batcher.add_return_am_to_batch(req_data, am, am_id, am_size, scheduler, stall_mark) + batcher + .add_return_am_to_batch(req_data, am, am_id, am_size, stall_mark) + .await } } } - //#[tracing::instrument(skip_all)] - fn add_data_am_to_batch( + // #[tracing::instrument(skip_all)] + async fn add_data_am_to_batch( &self, req_data: ReqMetaData, data: LamellarResultArc, data_size: usize, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, stall_mark: usize, ) { match self { BatcherType::Simple(batcher) => { - batcher.add_data_am_to_batch(req_data, data, data_size, scheduler, stall_mark) + batcher + .add_data_am_to_batch(req_data, data, data_size, stall_mark) + .await } BatcherType::TeamAm(batcher) => { - batcher.add_data_am_to_batch(req_data, data, data_size, scheduler, stall_mark) + batcher + .add_data_am_to_batch(req_data, data, data_size, stall_mark) + .await } } } - //#[tracing::instrument(skip_all)] - fn add_unit_am_to_batch( - &self, - req_data: ReqMetaData, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - stall_mark: usize, - ) { + // #[tracing::instrument(skip_all)] + async fn add_unit_am_to_batch(&self, req_data: ReqMetaData, stall_mark: usize) { match self { BatcherType::Simple(batcher) => { - batcher.add_unit_am_to_batch(req_data, scheduler, stall_mark) + batcher.add_unit_am_to_batch(req_data, stall_mark).await } BatcherType::TeamAm(batcher) => { - batcher.add_unit_am_to_batch(req_data, scheduler, stall_mark) + batcher.add_unit_am_to_batch(req_data, stall_mark).await } } } @@ -159,19 +154,14 @@ impl Batcher for BatcherType { msg: Msg, ser_data: SerializedData, lamellae: Arc, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - ame: &Arc, - ) { + ame: &RegisteredActiveMessages, + ) -> Vec { match self { BatcherType::Simple(batcher) => { - batcher - .exec_batched_msg(msg, ser_data, lamellae, scheduler, ame) - .await; + batcher.exec_batched_msg(msg, ser_data, lamellae, ame).await } BatcherType::TeamAm(batcher) => { - batcher - .exec_batched_msg(msg, ser_data, lamellae, scheduler, ame) - .await; + batcher.exec_batched_msg(msg, ser_data, lamellae, ame).await } } } diff --git a/src/active_messaging/batching/simple_batcher.rs b/src/active_messaging/batching/simple_batcher.rs index 1d27654e..c63c3fa6 100644 --- a/src/active_messaging/batching/simple_batcher.rs +++ b/src/active_messaging/batching/simple_batcher.rs @@ -52,7 +52,7 @@ impl SimpleBatcherInner { } } -#[derive(Debug)] +#[derive(Debug, Clone)] pub(crate) struct SimpleBatcher { batched_ams: Arc>, stall_mark: Arc, @@ -60,14 +60,13 @@ pub(crate) struct SimpleBatcher { #[async_trait] impl Batcher for SimpleBatcher { - //#[tracing::instrument(skip_all)] - fn add_remote_am_to_batch( + // #[tracing::instrument(skip_all)] + async fn add_remote_am_to_batch( &self, req_data: ReqMetaData, am: LamellarArcAm, am_id: AmId, am_size: usize, - scheduler: impl SchedulerQueue + 
Sync + Send + Clone + std::fmt::Debug + 'static, mut stall_mark: usize, ) { // println!("add_remote_am_to_batch"); @@ -93,37 +92,34 @@ impl Batcher for SimpleBatcher { // "[{:?}] add_remote_am_to_batch submit task", // std::thread::current().id() // ); - scheduler.submit_task(async move { - while stall_mark != cur_stall_mark.load(Ordering::SeqCst) - && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE - && batch_id == batch.batch_id.load(Ordering::SeqCst) - { - stall_mark = cur_stall_mark.load(Ordering::Relaxed); - async_std::task::yield_now().await; - } - if batch_id == batch.batch_id.load(Ordering::SeqCst) { - //this batch is still valid - SimpleBatcher::create_tx_task(batch).await; - } - }); + while stall_mark != cur_stall_mark.load(Ordering::SeqCst) + && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE + && batch_id == batch.batch_id.load(Ordering::SeqCst) + { + stall_mark = cur_stall_mark.load(Ordering::Relaxed); + async_std::task::yield_now().await; + } + if batch_id == batch.batch_id.load(Ordering::SeqCst) { + //this batch is still valid + SimpleBatcher::create_tx_task(batch).await; + } } else if size >= MAX_BATCH_SIZE { // println!("remote size: {:?} {dst:?}",size); // println!( // "[{:?}] add_remote_am_to_batch submit imm task", // std::thread::current().id() // ); - scheduler.submit_immediate_task(SimpleBatcher::create_tx_task(batch)); + SimpleBatcher::create_tx_task(batch).await; } } - //#[tracing::instrument(skip_all)] - fn add_return_am_to_batch( + // #[tracing::instrument(skip_all)] + async fn add_return_am_to_batch( &self, req_data: ReqMetaData, am: LamellarArcAm, am_id: AmId, am_size: usize, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, mut stall_mark: usize, ) { // println!("add_return_am_to_batch"); @@ -149,36 +145,33 @@ impl Batcher for SimpleBatcher { // "[{:?}] add_rerturn_am_to_batch submit task", // std::thread::current().id() // ); - scheduler.submit_task(async move { - while stall_mark != cur_stall_mark.load(Ordering::SeqCst) - && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE - && batch_id == batch.batch_id.load(Ordering::SeqCst) - { - stall_mark = cur_stall_mark.load(Ordering::Relaxed); - async_std::task::yield_now().await; - } - if batch_id == batch.batch_id.load(Ordering::SeqCst) { - //this batch is still valid - SimpleBatcher::create_tx_task(batch).await; - } - }); + while stall_mark != cur_stall_mark.load(Ordering::SeqCst) + && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE + && batch_id == batch.batch_id.load(Ordering::SeqCst) + { + stall_mark = cur_stall_mark.load(Ordering::Relaxed); + async_std::task::yield_now().await; + } + if batch_id == batch.batch_id.load(Ordering::SeqCst) { + //this batch is still valid + SimpleBatcher::create_tx_task(batch).await; + } } else if size >= MAX_BATCH_SIZE { // println!("return size: {:?} {dst:?}",size); // println!( // "[{:?}] add_return_am_to_batch submit imm task", // std::thread::current().id() // ); - scheduler.submit_immediate_task(SimpleBatcher::create_tx_task(batch)); + SimpleBatcher::create_tx_task(batch).await; } } - //#[tracing::instrument(skip_all)] - fn add_data_am_to_batch( + // #[tracing::instrument(skip_all)] + async fn add_data_am_to_batch( &self, req_data: ReqMetaData, data: LamellarResultArc, data_size: usize, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, mut stall_mark: usize, ) { // println!("add_data_am_to_batch"); @@ -207,36 +200,29 @@ impl Batcher for SimpleBatcher { // "[{:?}] add_data_am_to_batch submit task", 
// std::thread::current().id() // ); - scheduler.submit_task(async move { - while stall_mark != cur_stall_mark.load(Ordering::SeqCst) - && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE - && batch_id == batch.batch_id.load(Ordering::SeqCst) - { - stall_mark = cur_stall_mark.load(Ordering::Relaxed); - async_std::task::yield_now().await; - } - if batch_id == batch.batch_id.load(Ordering::SeqCst) { - //this batch is still valid - SimpleBatcher::create_tx_task(batch).await; - } - }); + while stall_mark != cur_stall_mark.load(Ordering::SeqCst) + && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE + && batch_id == batch.batch_id.load(Ordering::SeqCst) + { + stall_mark = cur_stall_mark.load(Ordering::Relaxed); + async_std::task::yield_now().await; + } + if batch_id == batch.batch_id.load(Ordering::SeqCst) { + //this batch is still valid + SimpleBatcher::create_tx_task(batch).await; + } } else if size >= MAX_BATCH_SIZE { // println!("data size: {:?} {dst:?}",size); // println!( // "[{:?}] add_data_am_to_batch submit imm task", // std::thread::current().id() // ); - scheduler.submit_immediate_task(SimpleBatcher::create_tx_task(batch)); + SimpleBatcher::create_tx_task(batch).await; } } - //#[tracing::instrument(skip_all)] - fn add_unit_am_to_batch( - &self, - req_data: ReqMetaData, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - mut stall_mark: usize, - ) { + // #[tracing::instrument(skip_all)] + async fn add_unit_am_to_batch(&self, req_data: ReqMetaData, mut stall_mark: usize) { // println!("add_unit_am_to_batch"); //let dst =req_data.dst; let batch = match req_data.dst { @@ -256,26 +242,24 @@ impl Batcher for SimpleBatcher { // "[{:?}] add_unit_am_to_batch submit task", // std::thread::current().id() // ); - scheduler.submit_task(async move { - while stall_mark != cur_stall_mark.load(Ordering::SeqCst) - && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE - && batch_id == batch.batch_id.load(Ordering::SeqCst) - { - stall_mark = cur_stall_mark.load(Ordering::Relaxed); - async_std::task::yield_now().await; - } - if batch_id == batch.batch_id.load(Ordering::SeqCst) { - //this batch is still valid - SimpleBatcher::create_tx_task(batch).await; - } - }); + while stall_mark != cur_stall_mark.load(Ordering::SeqCst) + && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE + && batch_id == batch.batch_id.load(Ordering::SeqCst) + { + stall_mark = cur_stall_mark.load(Ordering::Relaxed); + async_std::task::yield_now().await; + } + if batch_id == batch.batch_id.load(Ordering::SeqCst) { + //this batch is still valid + SimpleBatcher::create_tx_task(batch).await; + } } else if size >= MAX_BATCH_SIZE { // println!("unit size: {:?} {dst:?}",size); // println!( // "[{:?}] add_unit_am_to_batch submit imm task", // std::thread::current().id() // ); - scheduler.submit_immediate_task(SimpleBatcher::create_tx_task(batch)); + SimpleBatcher::create_tx_task(batch).await; } } @@ -285,28 +269,29 @@ impl Batcher for SimpleBatcher { msg: Msg, ser_data: SerializedData, lamellae: Arc, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - ame: &Arc, - ) { + ame: &RegisteredActiveMessages, + ) -> Vec { let data = ser_data.data_as_bytes(); let mut i = 0; // println!("executing batched msg {:?}", data.len()); - + let mut return_ams = Vec::new(); while i < data.len() { let cmd: Cmd = crate::deserialize(&data[i..i + *CMD_LEN], false).unwrap(); i += *CMD_LEN; // let temp_i = i; // println!("cmd {:?}", cmd); match cmd { - Cmd::Am => self.exec_am(&msg, data, &mut 
i, &lamellae, scheduler.clone(), ame), + Cmd::Am => return_ams.push(self.exec_am(&msg, data, &mut i, &lamellae, ame).await), Cmd::ReturnAm => { - self.exec_return_am(&msg, data, &mut i, &lamellae, scheduler.clone(), ame) + self.exec_return_am(&msg, data, &mut i, &lamellae, ame) + .await } Cmd::Data => ame.exec_data_am(&msg, data, &mut i, &ser_data).await, Cmd::Unit => ame.exec_unit_am(&msg, data, &mut i).await, Cmd::BatchedMsg => panic!("should not recieve a batched msg within a batched msg"), } } + return_ams } } @@ -520,16 +505,15 @@ impl SimpleBatcher { data.unwrap() } - //#[tracing::instrument(skip_all)] - fn exec_am( + // #[tracing::instrument(skip_all)] + async fn exec_am( &self, msg: &Msg, data: &[u8], i: &mut usize, lamellae: &Arc, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - ame: &Arc, - ) { + ame: &RegisteredActiveMessages, + ) -> Am { // println!("exec_am"); let am_header: AmHeader = crate::deserialize(&data[*i..*i + *AM_HEADER_LEN], false).unwrap(); @@ -550,39 +534,35 @@ impl SimpleBatcher { team_addr: team.team.remote_ptr_addr, }; // println!("[{:?}] exec_am submit task", std::thread::current().id()); - let scheduler_clone = scheduler.clone(); - let ame_clone = ame.clone(); - scheduler.submit_task(async move { - let am = match am - .exec( - team.team.world_pe, - team.team.num_world_pes, - false, - world.clone(), - team.clone(), - ) - .await - { - LamellarReturn::Unit => Am::Unit(req_data), - LamellarReturn::RemoteData(data) => Am::Data(req_data, data), - LamellarReturn::RemoteAm(am) => Am::Return(req_data, am), - LamellarReturn::LocalData(_) | LamellarReturn::LocalAm(_) => { - panic!("Should not be returning local data or AM from remote am"); - } - }; - ame_clone.process_msg(am, scheduler_clone, 0, false).await; - }); + let am = match am + .exec( + team.team.world_pe, + team.team.num_world_pes, + false, + world.clone(), + team.clone(), + ) + .await + { + LamellarReturn::Unit => Am::Unit(req_data), + LamellarReturn::RemoteData(data) => Am::Data(req_data, data), + LamellarReturn::RemoteAm(am) => Am::Return(req_data, am), + LamellarReturn::LocalData(_) | LamellarReturn::LocalAm(_) => { + panic!("Should not be returning local data or AM from remote am"); + } + }; + // ame.process_msg(am, 0, false).await; + am } - //#[tracing::instrument(skip_all)] - fn exec_return_am( + // #[tracing::instrument(skip_all)] + async fn exec_return_am( &self, msg: &Msg, data: &[u8], i: &mut usize, lamellae: &Arc, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - ame: &Arc, + ame: &RegisteredActiveMessages, ) { // println!("exec_return_am"); let am_header: AmHeader = @@ -606,9 +586,8 @@ impl SimpleBatcher { // "[{:?}] exec_return_am submit task", // std::thread::current().id() // ); - scheduler.submit_task( - ame.clone() - .exec_local_am(req_data, am.as_local(), world, team), - ); + ame.clone() + .exec_local_am(req_data, am.as_local(), world, team) + .await; } } diff --git a/src/active_messaging/batching/team_am_batcher.rs b/src/active_messaging/batching/team_am_batcher.rs index 84281662..67bb7b09 100644 --- a/src/active_messaging/batching/team_am_batcher.rs +++ b/src/active_messaging/batching/team_am_batcher.rs @@ -167,7 +167,7 @@ impl TeamAmBatcherInner { } } -#[derive(Debug)] +#[derive(Debug, Clone)] pub(crate) struct TeamAmBatcher { batched_ams: Arc>, stall_mark: Arc, @@ -175,14 +175,13 @@ pub(crate) struct TeamAmBatcher { #[async_trait] impl Batcher for TeamAmBatcher { - //#[tracing::instrument(skip_all)] - fn 
add_remote_am_to_batch( + // #[tracing::instrument(skip_all)] + async fn add_remote_am_to_batch( &self, req_data: ReqMetaData, am: LamellarArcAm, am_id: AmId, am_size: usize, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, mut stall_mark: usize, ) { let batch = match req_data.dst { @@ -198,49 +197,43 @@ impl Batcher for TeamAmBatcher { let batch_id = batch.batch_id.load(Ordering::SeqCst); // println!("remote batch_id {batch_id} created"); let cur_stall_mark = self.stall_mark.clone(); - scheduler.submit_task(async move { - while stall_mark != cur_stall_mark.load(Ordering::SeqCst) - && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE - && batch_id == batch.batch_id.load(Ordering::SeqCst) - { - stall_mark = cur_stall_mark.load(Ordering::Relaxed); - async_std::task::yield_now().await; - } - if batch_id == batch.batch_id.load(Ordering::SeqCst) { - //this batch is still valid - TeamAmBatcher::create_tx_task( - batch, - // stall_mark, - // scheduler, - req_data.lamellae.clone(), - req_data.team.arch.clone(), - req_data.team.world_pe, - ) - .await; - } - }); + while stall_mark != cur_stall_mark.load(Ordering::SeqCst) + && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE + && batch_id == batch.batch_id.load(Ordering::SeqCst) + { + stall_mark = cur_stall_mark.load(Ordering::Relaxed); + async_std::task::yield_now().await; + } + if batch_id == batch.batch_id.load(Ordering::SeqCst) { + //this batch is still valid + TeamAmBatcher::create_tx_task( + batch, + req_data.lamellae.clone(), + req_data.team.arch.clone(), + req_data.team.world_pe, + ) + .await; + } } else if size >= MAX_BATCH_SIZE { //batch is full, transfer now // println!("remote size: {:?}",size); - scheduler.submit_immediate_task(TeamAmBatcher::create_tx_task( + TeamAmBatcher::create_tx_task( batch, - // stall_mark, - // scheduler, req_data.lamellae.clone(), req_data.team.arch.clone(), req_data.team.world_pe, - )); + ) + .await; } } - //#[tracing::instrument(skip_all)] - fn add_return_am_to_batch( + // #[tracing::instrument(skip_all)] + async fn add_return_am_to_batch( &self, req_data: ReqMetaData, am: LamellarArcAm, am_id: AmId, am_size: usize, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, mut stall_mark: usize, ) { let batch = match req_data.dst { @@ -256,48 +249,43 @@ impl Batcher for TeamAmBatcher { let batch_id = batch.batch_id.load(Ordering::SeqCst); // println!("return batch_id {batch_id} created"); let cur_stall_mark = self.stall_mark.clone(); - scheduler.submit_task(async move { - while stall_mark != cur_stall_mark.load(Ordering::SeqCst) - && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE - && batch_id == batch.batch_id.load(Ordering::SeqCst) - { - stall_mark = cur_stall_mark.load(Ordering::Relaxed); - async_std::task::yield_now().await; - } - if batch_id == batch.batch_id.load(Ordering::SeqCst) { - //this batch is still valid - TeamAmBatcher::create_tx_task( - batch, - // stall_mark, - // scheduler, - req_data.lamellae.clone(), - req_data.team.arch.clone(), - req_data.team.world_pe, - ) - .await; - } - }); + while stall_mark != cur_stall_mark.load(Ordering::SeqCst) + && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE + && batch_id == batch.batch_id.load(Ordering::SeqCst) + { + stall_mark = cur_stall_mark.load(Ordering::Relaxed); + async_std::task::yield_now().await; + } + if batch_id == batch.batch_id.load(Ordering::SeqCst) { + //this batch is still valid + TeamAmBatcher::create_tx_task( + batch, + req_data.lamellae.clone(), + 
req_data.team.arch.clone(), + req_data.team.world_pe, + ) + .await; + } } else if size >= MAX_BATCH_SIZE { //batch is full, transfer now // println!("return size: {:?}",size); - scheduler.submit_immediate_task(TeamAmBatcher::create_tx_task( + + TeamAmBatcher::create_tx_task( batch, - // stall_mark, - // scheduler, req_data.lamellae.clone(), req_data.team.arch.clone(), req_data.team.world_pe, - )); + ) + .await; } } - //#[tracing::instrument(skip_all)] - fn add_data_am_to_batch( + // #[tracing::instrument(skip_all)] + async fn add_data_am_to_batch( &self, req_data: ReqMetaData, data: LamellarResultArc, data_size: usize, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, mut stall_mark: usize, ) { let batch = match req_data.dst { @@ -320,48 +308,38 @@ impl Batcher for TeamAmBatcher { let batch_id = batch.batch_id.load(Ordering::SeqCst); // println!("data batch_id {batch_id} created"); let cur_stall_mark = self.stall_mark.clone(); - scheduler.submit_task(async move { - while stall_mark != cur_stall_mark.load(Ordering::SeqCst) - && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE - && batch_id == batch.batch_id.load(Ordering::SeqCst) - { - stall_mark = cur_stall_mark.load(Ordering::Relaxed); - async_std::task::yield_now().await; - } - if batch_id == batch.batch_id.load(Ordering::SeqCst) { - //this batch is still valid - TeamAmBatcher::create_tx_task( - batch, - // stall_mark, - // scheduler, - req_data.lamellae.clone(), - req_data.team.arch.clone(), - req_data.team.world_pe, - ) - .await; - } - }); + while stall_mark != cur_stall_mark.load(Ordering::SeqCst) + && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE + && batch_id == batch.batch_id.load(Ordering::SeqCst) + { + stall_mark = cur_stall_mark.load(Ordering::Relaxed); + async_std::task::yield_now().await; + } + if batch_id == batch.batch_id.load(Ordering::SeqCst) { + //this batch is still valid + TeamAmBatcher::create_tx_task( + batch, + req_data.lamellae.clone(), + req_data.team.arch.clone(), + req_data.team.world_pe, + ) + .await; + } } else if size >= MAX_BATCH_SIZE { //batch is full, transfer now // println!("data size: {:?}",size); - scheduler.submit_immediate_task(TeamAmBatcher::create_tx_task( + TeamAmBatcher::create_tx_task( batch, - // stall_mark, - // scheduler, req_data.lamellae.clone(), req_data.team.arch.clone(), req_data.team.world_pe, - )); + ) + .await; } } - //#[tracing::instrument(skip_all)] - fn add_unit_am_to_batch( - &self, - req_data: ReqMetaData, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - mut stall_mark: usize, - ) { + // #[tracing::instrument(skip_all)] + async fn add_unit_am_to_batch(&self, req_data: ReqMetaData, mut stall_mark: usize) { let batch = match req_data.dst { Some(dst) => self.batched_ams[dst].clone(), None => self.batched_ams.last().unwrap().clone(), @@ -375,38 +353,33 @@ impl Batcher for TeamAmBatcher { let batch_id = batch.batch_id.load(Ordering::SeqCst); // println!("unit batch_id {batch_id} created"); let cur_stall_mark = self.stall_mark.clone(); - scheduler.submit_task(async move { - while stall_mark != cur_stall_mark.load(Ordering::SeqCst) - && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE - && batch_id == batch.batch_id.load(Ordering::SeqCst) - { - stall_mark = cur_stall_mark.load(Ordering::Relaxed); - async_std::task::yield_now().await; - } - if batch_id == batch.batch_id.load(Ordering::SeqCst) { - //this batch is still valid - TeamAmBatcher::create_tx_task( - batch, - // stall_mark, - // scheduler, - 
req_data.lamellae.clone(), - req_data.team.arch.clone(), - req_data.team.world_pe, - ) - .await; - } - }); + while stall_mark != cur_stall_mark.load(Ordering::SeqCst) + && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE + && batch_id == batch.batch_id.load(Ordering::SeqCst) + { + stall_mark = cur_stall_mark.load(Ordering::Relaxed); + async_std::task::yield_now().await; + } + if batch_id == batch.batch_id.load(Ordering::SeqCst) { + //this batch is still valid + TeamAmBatcher::create_tx_task( + batch, + req_data.lamellae.clone(), + req_data.team.arch.clone(), + req_data.team.world_pe, + ) + .await; + } } else if size >= MAX_BATCH_SIZE { //batch is full, transfer now // println!("unit size: {:?}",size); - scheduler.submit_immediate_task(TeamAmBatcher::create_tx_task( + TeamAmBatcher::create_tx_task( batch, - // stall_mark, - // scheduler, req_data.lamellae.clone(), req_data.team.arch.clone(), req_data.team.world_pe, - )); + ) + .await; } } @@ -416,9 +389,8 @@ impl Batcher for TeamAmBatcher { msg: Msg, ser_data: SerializedData, lamellae: Arc, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - ame: &Arc, - ) { + ame: &RegisteredActiveMessages, + ) -> Vec { let data = ser_data.data_as_bytes(); let mut i = 0; // println!("i: {:?} dl {:?} cl {:?}", i, data.len(), *CMD_LEN); @@ -435,18 +407,12 @@ impl Batcher for TeamAmBatcher { Cmd::Data => ame.exec_data_am(&msg, data, &mut i, &ser_data).await, Cmd::Unit => ame.exec_unit_am(&msg, data, &mut i).await, Cmd::BatchedMsg => { - self.exec_batched_am( - &msg, - batch.cnt, - data, - &mut i, - &lamellae, - scheduler.clone(), - &ame, - ); + self.exec_batched_am(&msg, batch.cnt, data, &mut i, &lamellae, &ame) + .await; } } } + Vec::new() } } @@ -706,17 +672,17 @@ impl TeamAmBatcher { data.unwrap() } - //#[tracing::instrument(skip_all)] - fn exec_batched_am( + // #[tracing::instrument(skip_all)] + async fn exec_batched_am( &self, msg: &Msg, batch_cnt: usize, data: &[u8], i: &mut usize, lamellae: &Arc, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - ame: &Arc, - ) { + ame: &RegisteredActiveMessages, + ) -> Vec { + let mut return_ams = Vec::new(); for _team in 0..batch_cnt { let team_header: TeamHeader = crate::deserialize(&data[*i..*i + *TEAM_HEADER_LEN], false).unwrap(); @@ -733,48 +699,50 @@ impl TeamAmBatcher { for _am in 0..batched_am_header.am_cnt { // println!("am cmd: {:?}", batched_am_header.cmd); match batched_am_header.cmd { - Cmd::Am => self.exec_am( - msg, - data, - i, - lamellae, - scheduler.clone(), - ame, - batched_am_header.am_id, - world.clone(), - team.clone(), - ), - Cmd::ReturnAm => self.exec_return_am( - msg, - data, - i, - lamellae, - scheduler.clone(), - ame, - batched_am_header.am_id, - world.clone(), - team.clone(), + Cmd::Am => return_ams.push( + self.exec_am( + msg, + data, + i, + lamellae, + batched_am_header.am_id, + world.clone(), + team.clone(), + ) + .await, ), + Cmd::ReturnAm => { + self.exec_return_am( + msg, + data, + i, + lamellae, + ame, + batched_am_header.am_id, + world.clone(), + team.clone(), + ) + .await + } _ => panic!("unhandled cmd"), } } } } + return_ams } - //#[tracing::instrument(skip_all)] - fn exec_am( + // #[tracing::instrument(skip_all)] + async fn exec_am( &self, msg: &Msg, data: &[u8], i: &mut usize, lamellae: &Arc, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - ame: &Arc, am_id: AmId, world: Arc, team: Arc, - ) { + ) -> Am { let req_id = crate::deserialize(&data[*i..*i + *REQ_ID_LEN], 
false).unwrap(); *i += *REQ_ID_LEN; let am = AMS_EXECS.get(&am_id).unwrap()(&data[*i..], team.team.team_pe); @@ -789,39 +757,36 @@ impl TeamAmBatcher { team: team.team.clone(), team_addr: team.team.remote_ptr_addr, }; - let scheduler_clone = scheduler.clone(); - let ame_clone = ame.clone(); - scheduler.submit_task(async move { - let am = match am - .exec( - team.team.world_pe, - team.team.num_world_pes, - false, - world.clone(), - team.clone(), - ) - .await - { - LamellarReturn::Unit => Am::Unit(req_data), - LamellarReturn::RemoteData(data) => Am::Data(req_data, data), - LamellarReturn::RemoteAm(am) => Am::Return(req_data, am), - LamellarReturn::LocalData(_) | LamellarReturn::LocalAm(_) => { - panic!("Should not be returning local data or AM from remote am"); - } - }; - ame_clone.process_msg(am, scheduler_clone, 0, false).await; - }); + + let am = match am + .exec( + team.team.world_pe, + team.team.num_world_pes, + false, + world.clone(), + team.clone(), + ) + .await + { + LamellarReturn::Unit => Am::Unit(req_data), + LamellarReturn::RemoteData(data) => Am::Data(req_data, data), + LamellarReturn::RemoteAm(am) => Am::Return(req_data, am), + LamellarReturn::LocalData(_) | LamellarReturn::LocalAm(_) => { + panic!("Should not be returning local data or AM from remote am"); + } + }; + am + // ame.process_msg(am, 0, false).await; } - //#[tracing::instrument(skip_all)] - fn exec_return_am( + // #[tracing::instrument(skip_all)] + async fn exec_return_am( &self, msg: &Msg, data: &[u8], i: &mut usize, lamellae: &Arc, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - ame: &Arc, + ame: &RegisteredActiveMessages, am_id: AmId, world: Arc, team: Arc, @@ -840,9 +805,9 @@ impl TeamAmBatcher { team: team.team.clone(), team_addr: team.team.remote_ptr_addr, }; - scheduler.submit_task( - ame.clone() - .exec_local_am(req_data, am.as_local(), world, team), - ); + + ame.clone() + .exec_local_am(req_data, am.as_local(), world, team) + .await; } } diff --git a/src/active_messaging/registered_active_message.rs b/src/active_messaging/registered_active_message.rs index d3a54c8b..e64b8874 100644 --- a/src/active_messaging/registered_active_message.rs +++ b/src/active_messaging/registered_active_message.rs @@ -6,7 +6,6 @@ use crate::lamellae::{ SerializedData, SubData, }; -use crate::scheduler::SchedulerQueue; use async_recursion::async_recursion; // use log::trace; use std::sync::Arc; @@ -62,7 +61,7 @@ pub struct RegisteredAm { } crate::inventory::collect!(RegisteredAm); -#[derive(Debug)] +#[derive(Debug, Clone)] pub(crate) struct RegisteredActiveMessages { batcher: BatcherType, } @@ -100,9 +99,9 @@ pub(crate) struct UnitHeader { impl ActiveMessageEngine for Arc { //#[tracing::instrument(skip_all)] async fn process_msg( - &self, + self, am: Am, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, + executor: Arc, stall_mark: usize, immediate: bool, ) { @@ -118,14 +117,15 @@ impl ActiveMessageEngine for Arc { { // println!(" {} {} {}, {}, {}",req_data.team.lamellae.backend() != Backend::Local,req_data.team.num_pes() > 1, req_data.team.team_pe_id().is_err(),(req_data.team.num_pes() > 1 || req_data.team.team_pe_id().is_err()),req_data.team.lamellae.backend() != Backend::Local && (req_data.team.num_pes() > 1 || req_data.team.team_pe_id().is_err()) ); if am_size < crate::active_messaging::BATCH_AM_SIZE && !immediate { - self.batcher.add_remote_am_to_batch( - req_data.clone(), - am.clone(), - am_id, - am_size, - scheduler, - stall_mark, - ); + self.batcher + 
.add_remote_am_to_batch( + req_data.clone(), + am.clone(), + am_id, + am_size, + stall_mark, + ) + .await; } else { self.send_am(req_data.clone(), am.clone(), am_id, am_size, Cmd::Am) .await; @@ -150,9 +150,9 @@ impl ActiveMessageEngine for Arc { let am_id = *(AMS_IDS.get(&am.get_id()).unwrap()); let am_size = am.serialized_size(); if am_size < crate::active_messaging::BATCH_AM_SIZE && !immediate { - self.batcher.add_remote_am_to_batch( - req_data, am, am_id, am_size, scheduler, stall_mark, - ); + self.batcher + .add_remote_am_to_batch(req_data, am, am_id, am_size, stall_mark) + .await; } else { self.send_am(req_data, am, am_id, am_size, Cmd::Am).await; } @@ -168,9 +168,9 @@ impl ActiveMessageEngine for Arc { let am_id = *(AMS_IDS.get(&am.get_id()).unwrap()); let am_size = am.serialized_size(); if am_size < crate::active_messaging::BATCH_AM_SIZE && !immediate { - self.batcher.add_return_am_to_batch( - req_data, am, am_id, am_size, scheduler, stall_mark, - ); + self.batcher + .add_return_am_to_batch(req_data, am, am_id, am_size, stall_mark) + .await; } else { self.send_am(req_data, am, am_id, am_size, Cmd::ReturnAm) .await; @@ -181,7 +181,8 @@ impl ActiveMessageEngine for Arc { let data_size = data.serialized_size(); if data_size < crate::active_messaging::BATCH_AM_SIZE && !immediate { self.batcher - .add_data_am_to_batch(req_data, data, data_size, scheduler, stall_mark); + .add_data_am_to_batch(req_data, data, data_size, stall_mark) + .await; } else { self.send_data_am(req_data, data, data_size).await; } @@ -189,60 +190,31 @@ impl ActiveMessageEngine for Arc { Am::Unit(req_data) => { if *UNIT_HEADER_LEN < crate::active_messaging::BATCH_AM_SIZE && !immediate { self.batcher - .add_unit_am_to_batch(req_data, scheduler, stall_mark); + .add_unit_am_to_batch(req_data, stall_mark) + .await; } else { self.send_unit_am(req_data).await; } } - Am::_BatchedReturn(_req_data, _func, _batch_id) => { - // let func_id = *(AMS_IDS.get(&func.get_id()).unwrap()); - // let func_size = func.serialized_size(); - // if func_size <= crate::active_messaging::BATCH_AM_SIZE { - // self.batcher - // .add_batched_return_am_to_batch( - // req_data, func, func_id, func_size, batch_id, scheduler,stall_mark - // ) - // .await; - // } else { - // self.send_batched_return_am( - // req_data, func, func_id, func_size, batch_id, scheduler, - // ) - // .await; - // } - } - Am::_BatchedData(_req_data, _data, _batch_id) => { - // let data_size = data.serialized_size(); - // if data_size <= crate::active_messaging::BATCH_AM_SIZE { - // self.add_batched_data_am_to_batch( - // req_data, data, data_size, batch_id, scheduler,stall_mark - // ) - // .await; - // } else { - // self.send_batched_data_am(req_data, data, data_size, batch_id, scheduler) - // .await; - // } - } - Am::_BatchedUnit(_req_data, _batch_id) => { - // self.add_batched_unit_am_to_batch(req_data, batch_id, scheduler,stall_mark) - // .await; - } } } //#[tracing::instrument(skip_all)] async fn exec_msg( - &self, + self, msg: Msg, ser_data: SerializedData, lamellae: Arc, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, + executor: Arc, ) { // println!("exec_msg"); let data = ser_data.data_as_bytes(); let mut i = 0; match msg.cmd { Cmd::Am => { - self.exec_am(&msg, data, &mut i, &lamellae, scheduler).await; + let return_am = self.exec_am(&msg, data, &mut i, &lamellae).await; + let process_task = self.process_msg(return_am, executor.clone(), 0, false); + executor.submit_task(process_task); } Cmd::ReturnAm => { self.exec_return_am(&msg, 
data, &mut i, &lamellae).await; @@ -254,9 +226,15 @@ impl ActiveMessageEngine for Arc { self.exec_unit_am(&msg, data, &mut i).await; } Cmd::BatchedMsg => { - self.batcher - .exec_batched_msg(msg, ser_data, lamellae, scheduler, self) + let ams = self + .batcher + .exec_batched_msg(msg, ser_data, lamellae, &self) .await; + let am_tasks = futures::stream::FuturesUnordered::new(); + for am in ams.into_iter() { + am_tasks.push(self.clone().process_msg(am, executor.clone(), 0, false)); + } + executor.submit_task(futures::future::join_all(am_tasks)); } } } @@ -459,8 +437,7 @@ impl RegisteredActiveMessages { data: &[u8], i: &mut usize, lamellae: &Arc, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - ) { + ) -> Am { // println!("exec_am"); let am_header: AmHeader = crate::deserialize(&data[*i..*i + *AM_HEADER_LEN], false).unwrap(); @@ -498,9 +475,8 @@ impl RegisteredActiveMessages { panic!("Should not be returning local data or AM from remote am"); } }; - self.process_msg(am, scheduler, 0, false).await; //0 just means we will force a stall_count loop - // scheduler.submit_am(am); - //TODO: compare against: scheduler.submit_am(ame, am).await; + am + // self.process_msg(am, 0, false).await; //0 just means we will force a stall_count loop } //#[tracing::instrument(skip_all)] diff --git a/src/array.rs b/src/array.rs index a25a58f3..0dbea2d8 100644 --- a/src/array.rs +++ b/src/array.rs @@ -663,7 +663,7 @@ impl crate::active_messaging::DarcSerde for LamellarWriteArra } impl LamellarArrayReduce for LamellarReadArray { - fn reduce(&self, reduction: &str) -> Pin>> { + fn reduce(&self, reduction: &str) -> Pin + Send>> { match self { LamellarReadArray::UnsafeArray(array) => unsafe { array.reduce(reduction) }, LamellarReadArray::AtomicArray(array) => array.reduce(reduction), @@ -677,7 +677,7 @@ impl LamellarArrayReduce for LamellarReadArray impl LamellarArrayArithmeticReduce for LamellarReadArray { - fn sum(&self) -> Pin>> { + fn sum(&self) -> Pin + Send>> { match self { LamellarReadArray::UnsafeArray(array) => unsafe { array.sum() }, LamellarReadArray::AtomicArray(array) => array.sum(), @@ -686,7 +686,7 @@ impl LamellarArrayArithmeticR LamellarReadArray::ReadOnlyArray(array) => array.sum(), } } - fn prod(&self) -> Pin>> { + fn prod(&self) -> Pin + Send>> { match self { LamellarReadArray::UnsafeArray(array) => unsafe { array.prod() }, LamellarReadArray::AtomicArray(array) => array.prod(), @@ -699,7 +699,7 @@ impl LamellarArrayArithmeticR impl LamellarArrayCompareReduce for LamellarReadArray { - fn max(&self) -> Pin>> { + fn max(&self) -> Pin + Send>> { match self { LamellarReadArray::UnsafeArray(array) => unsafe { array.max() }, LamellarReadArray::AtomicArray(array) => array.max(), @@ -708,7 +708,7 @@ impl LamellarArrayCompa LamellarReadArray::ReadOnlyArray(array) => array.max(), } } - fn min(&self) -> Pin>> { + fn min(&self) -> Pin + Send>> { match self { LamellarReadArray::UnsafeArray(array) => unsafe { array.min() }, LamellarReadArray::AtomicArray(array) => array.min(), @@ -720,7 +720,7 @@ impl LamellarArrayCompa } impl LamellarArrayReduce for LamellarWriteArray { - fn reduce(&self, reduction: &str) -> Pin>> { + fn reduce(&self, reduction: &str) -> Pin + Send>> { match self { LamellarWriteArray::UnsafeArray(array) => unsafe { array.reduce(reduction) }, LamellarWriteArray::AtomicArray(array) => array.reduce(reduction), @@ -732,7 +732,7 @@ impl LamellarArrayReduce for LamellarWriteArray LamellarArrayArithmeticReduce for LamellarWriteArray { - fn sum(&self) -> Pin>> { + fn 
sum(&self) -> Pin + Send>> { match self { LamellarWriteArray::UnsafeArray(array) => unsafe { array.sum() }, LamellarWriteArray::AtomicArray(array) => array.sum(), @@ -740,7 +740,7 @@ impl LamellarArrayArithmeticR LamellarWriteArray::GlobalLockArray(array) => array.sum(), } } - fn prod(&self) -> Pin>> { + fn prod(&self) -> Pin + Send>> { match self { LamellarWriteArray::UnsafeArray(array) => unsafe { array.prod() }, LamellarWriteArray::AtomicArray(array) => array.prod(), @@ -753,7 +753,7 @@ impl LamellarArrayArithmeticR impl LamellarArrayCompareReduce for LamellarWriteArray { - fn max(&self) -> Pin>> { + fn max(&self) -> Pin + Send>> { match self { LamellarWriteArray::UnsafeArray(array) => unsafe { array.max() }, LamellarWriteArray::AtomicArray(array) => array.max(), @@ -761,7 +761,7 @@ impl LamellarArrayCompa LamellarWriteArray::GlobalLockArray(array) => array.max(), } } - fn min(&self) -> Pin>> { + fn min(&self) -> Pin + Send>> { match self { LamellarWriteArray::UnsafeArray(array) => unsafe { array.min() }, LamellarWriteArray::AtomicArray(array) => array.min(), @@ -991,9 +991,7 @@ pub trait LamellarArray: private::LamellarArrayPrivate { /// let result = array.block_on(request); //block until am has executed /// // we also could have used world.block_on() or team.block_on() ///``` - fn block_on(&self, f: F) -> F::Output - where - F: Future; + fn block_on(&self, f: F) -> F::Output; #[doc(alias("One-sided", "onesided"))] /// Given a global index, calculate the PE and offset on that PE where the element actually resides. @@ -1610,7 +1608,7 @@ where /// let sum = array.block_on(array.reduce("sum")); // equivalent to calling array.sum() /// assert_eq!(array.len()*num_pes,sum); ///``` - fn reduce(&self, reduction: &str) -> Pin>>; + fn reduce(&self, reduction: &str) -> Pin + Send>>; } /// Interface for common arithmetic based reductions @@ -1643,7 +1641,7 @@ where /// let sum = array.block_on(array.sum()); /// assert_eq!(array.len()*num_pes,sum); ///``` - fn sum(&self) -> Pin>>; + fn sum(&self) -> Pin + Send>>; #[doc(alias("One-sided", "onesided"))] /// Perform a production reduction on the entire distributed array, returning the value to the calling PE. @@ -1668,7 +1666,7 @@ where /// let prod = array.block_on(array.prod()); /// assert_eq!((1..=array.len()).product::(),prod); ///``` - fn prod(&self) -> Pin>>; + fn prod(&self) -> Pin + Send>>; } /// Interface for common compare based reductions @@ -1696,7 +1694,7 @@ where /// let max = array.block_on(array.max()); /// assert_eq!((array.len()-1)*2,max); ///``` - fn max(&self) -> Pin>>; + fn max(&self) -> Pin + Send>>; #[doc(alias("One-sided", "onesided"))] /// Find the min element in the entire destributed array, returning to the calling PE @@ -1718,7 +1716,7 @@ where /// let min = array.block_on(array.min()); /// assert_eq!(0,min); ///``` - fn min(&self) -> Pin>>; + fn min(&self) -> Pin + Send>>; } /// This procedural macro is used to enable the execution of user defined reductions on LamellarArrays. 
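
// Illustrative sketch (not part of this patch): the reduction traits above now
// return `Pin<Box<dyn Future<Output = T> + Send>>`, and the batching paths
// earlier in this section now hand the work they produce back to the caller
// instead of submitting it themselves. Both changes point at the same
// requirement: futures must be `Send` so an arbitrary executor backend
// (work-stealing, tokio, ...) can drive them. The `Executor` trait,
// `submit_task`, and `process_work` names below are made-up stand-ins for
// illustration, not the crate's actual API; the sketch assumes the `futures`
// crate is available.
//
// use std::future::Future;
// use std::pin::Pin;
//
// type BoxFuture = Pin<Box<dyn Future<Output = ()> + Send + 'static>>;
//
// trait Executor: Send + Sync {
//     fn submit_task(&self, task: BoxFuture);
// }
//
// // Trivial backend that drives each submitted task on its own thread; a real
// // backend would enqueue the future on its runtime instead.
// struct ThreadExecutor;
// impl Executor for ThreadExecutor {
//     fn submit_task(&self, task: BoxFuture) {
//         std::thread::spawn(move || futures::executor::block_on(task));
//     }
// }
//
// // Stand-in for a returned active message produced while executing a batch.
// async fn process_work(id: usize) {
//     println!("processing returned work item {id}");
// }
//
// fn main() {
//     let executor = ThreadExecutor;
//     // Mirror of the new pattern: collect the futures a batch produced and
//     // submit them to the executor as one joined task.
//     let work: Vec<_> = (0..4).map(process_work).collect();
//     executor.submit_task(Box::pin(async move {
//         futures::future::join_all(work).await;
//     }));
//     // Let the detached task finish in this toy example.
//     std::thread::sleep(std::time::Duration::from_millis(50));
// }
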
diff --git a/src/array/atomic.rs b/src/array/atomic.rs index 9a4a68f4..896876b0 100644 --- a/src/array/atomic.rs +++ b/src/array/atomic.rs @@ -1160,7 +1160,7 @@ impl From for AtomicArray { } impl LamellarArrayReduce for AtomicArray { - fn reduce(&self, reduction: &str) -> Pin>> { + fn reduce(&self, reduction: &str) -> Pin + Send>> { match self { AtomicArray::NativeAtomicArray(array) => array.reduce(reduction), AtomicArray::GenericAtomicArray(array) => array.reduce(reduction), @@ -1171,13 +1171,13 @@ impl LamellarArrayReduce for AtomicArray { impl LamellarArrayArithmeticReduce for AtomicArray { - fn sum(&self) -> Pin>> { + fn sum(&self) -> Pin + Send>> { match self { AtomicArray::NativeAtomicArray(array) => array.sum(), AtomicArray::GenericAtomicArray(array) => array.sum(), } } - fn prod(&self) -> Pin>> { + fn prod(&self) -> Pin + Send>> { match self { AtomicArray::NativeAtomicArray(array) => array.prod(), AtomicArray::GenericAtomicArray(array) => array.prod(), @@ -1187,13 +1187,13 @@ impl LamellarArrayArithmeticR impl LamellarArrayCompareReduce for AtomicArray { - fn max(&self) -> Pin>> { + fn max(&self) -> Pin + Send>> { match self { AtomicArray::NativeAtomicArray(array) => array.max(), AtomicArray::GenericAtomicArray(array) => array.max(), } } - fn min(&self) -> Pin>> { + fn min(&self) -> Pin + Send>> { match self { AtomicArray::NativeAtomicArray(array) => array.min(), AtomicArray::GenericAtomicArray(array) => array.min(), diff --git a/src/array/generic_atomic.rs b/src/array/generic_atomic.rs index 2924f8d8..382059a4 100644 --- a/src/array/generic_atomic.rs +++ b/src/array/generic_atomic.rs @@ -712,10 +712,7 @@ impl LamellarArray for GenericAtomicArray { self.array.wait_all() // println!("done in wait all {:?}",std::time::SystemTime::now()); } - fn block_on(&self, f: F) -> F::Output - where - F: Future, - { + fn block_on(&self, f: F) -> F::Output { self.array.block_on(f) } fn pe_and_offset_for_global_index(&self, index: usize) -> Option<(usize, usize)> { @@ -795,7 +792,7 @@ impl ArrayPrint for GenericAtomicArray { } impl LamellarArrayReduce for GenericAtomicArray { - fn reduce(&self, op: &str) -> Pin>> { + fn reduce(&self, op: &str) -> Pin + Send>> { self.array .reduce_data(op, self.clone().into()) .into_future() @@ -804,20 +801,20 @@ impl LamellarArrayReduce for GenericAtomicArray LamellarArrayArithmeticReduce for GenericAtomicArray { - fn sum(&self) -> Pin>> { + fn sum(&self) -> Pin + Send>> { self.reduce("sum") } - fn prod(&self) -> Pin>> { + fn prod(&self) -> Pin + Send>> { self.reduce("prod") } } impl LamellarArrayCompareReduce for GenericAtomicArray { - fn max(&self) -> Pin>> { + fn max(&self) -> Pin + Send>> { self.reduce("max") } - fn min(&self) -> Pin>> { + fn min(&self) -> Pin + Send>> { self.reduce("min") } } diff --git a/src/array/global_lock_atomic.rs b/src/array/global_lock_atomic.rs index 4d288499..6b9ff9ef 100644 --- a/src/array/global_lock_atomic.rs +++ b/src/array/global_lock_atomic.rs @@ -25,7 +25,7 @@ use std::ops::{Deref, DerefMut}; /// Direct RDMA operations can occur if the appropriate lock is held. #[lamellar_impl::AmDataRT(Clone, Debug)] pub struct GlobalLockArray { - lock: GlobalRwDarc<()>, + pub(crate) lock: GlobalRwDarc<()>, pub(crate) array: UnsafeArray, } @@ -70,27 +70,26 @@ impl GlobalLockByteArrayWeak { /// /// When the instance is dropped the lock is released. 
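
// Illustrative sketch (not part of this patch) of the ownership change applied
// to the local-data guards below: instead of borrowing `&'a mut [T]` from the
// array (which forces a lifetime parameter on the guard), the guard owns a
// handle to the data plus an owned lock guard and exposes the slice through
// Deref/DerefMut. The sketch uses tokio's owned guards purely for illustration
// (assuming tokio with the "sync" feature and the `futures` crate); the actual
// guards below hold a cloned array and a GlobalRwDarc guard instead.
//
// use std::ops::{Deref, DerefMut};
// use std::sync::Arc;
// use tokio::sync::{OwnedRwLockWriteGuard, RwLock};
//
// struct MutLocalData {
//     // Owning the guard (no lifetime) lets this struct be returned from async
//     // fns and held across awaits, which a lifetime-bound slice cannot do.
//     guard: OwnedRwLockWriteGuard<Vec<u64>>,
// }
//
// impl Deref for MutLocalData {
//     type Target = [u64];
//     fn deref(&self) -> &Self::Target {
//         &self.guard
//     }
// }
//
// impl DerefMut for MutLocalData {
//     fn deref_mut(&mut self) -> &mut Self::Target {
//         &mut self.guard
//     }
// }
//
// async fn write_local_data(data: &Arc<RwLock<Vec<u64>>>) -> MutLocalData {
//     MutLocalData {
//         guard: data.clone().write_owned().await,
//     }
// }
//
// fn main() {
//     let data = Arc::new(RwLock::new(vec![0u64; 8]));
//     let mut local = futures::executor::block_on(write_local_data(&data));
//     local[0] = 42; // DerefMut gives mutable slice access while the lock is held
//     assert_eq!(local[0], 42);
// } // the guard drops here, releasing the write lock
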
#[derive(Debug)] -pub struct GlobalLockMutLocalData<'a, T: Dist> { - data: &'a mut [T], - _index: usize, +pub struct GlobalLockMutLocalData { + pub(crate) array: GlobalLockArray, _lock_guard: GlobalRwDarcWriteGuard<()>, } -// impl Drop for GlobalLockMutLocalData<'_, T>{ +// impl Drop for GlobalLockMutLocalData{ // fn drop(&mut self){ // println!("release lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); // } // } -impl Deref for GlobalLockMutLocalData<'_, T> { +impl Deref for GlobalLockMutLocalData { type Target = [T]; fn deref(&self) -> &Self::Target { - self.data + unsafe { self.array.array.local_as_mut_slice() } } } -impl DerefMut for GlobalLockMutLocalData<'_, T> { +impl DerefMut for GlobalLockMutLocalData { fn deref_mut(&mut self) -> &mut Self::Target { - self.data + unsafe { self.array.array.local_as_mut_slice() } } } @@ -103,27 +102,26 @@ impl DerefMut for GlobalLockMutLocalData<'_, T> { /// /// When each PE drops its instance, the lock is release. #[derive(Debug)] -pub struct GlobalLockCollectiveMutLocalData<'a, T: Dist> { - data: &'a mut [T], - _index: usize, +pub struct GlobalLockCollectiveMutLocalData { + pub(crate) array: GlobalLockArray, _lock_guard: GlobalRwDarcCollectiveWriteGuard<()>, } -// impl Drop for GlobalLockCollectiveMutLocalData<'_, T>{ +// impl Drop for GlobalLockCollectiveMutLocalData{ // fn drop(&mut self){ // println!("release lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); // } // } -impl Deref for GlobalLockCollectiveMutLocalData<'_, T> { +impl Deref for GlobalLockCollectiveMutLocalData { type Target = [T]; fn deref(&self) -> &Self::Target { - self.data + unsafe { self.array.array.local_as_mut_slice() } } } -impl DerefMut for GlobalLockCollectiveMutLocalData<'_, T> { +impl DerefMut for GlobalLockCollectiveMutLocalData { fn deref_mut(&mut self) -> &mut Self::Target { - self.data + unsafe { self.array.array.local_as_mut_slice() } } } @@ -135,33 +133,29 @@ impl DerefMut for GlobalLockCollectiveMutLocalData<'_, T> { /// (allowing for the safe deref into `&[T]`), preventing any local or remote write access. /// /// When the instance is dropped the lock is released. -pub struct GlobalLockLocalData<'a, T: Dist> { +pub struct GlobalLockLocalData { pub(crate) array: GlobalLockArray, - pub(crate) data: &'a [T], - index: usize, lock: GlobalRwDarc<()>, lock_guard: GlobalRwDarcReadGuard<()>, } -impl<'a, T: Dist + std::fmt::Debug> std::fmt::Debug for GlobalLockLocalData<'a, T> { +impl std::fmt::Debug for GlobalLockLocalData { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{:?}", self.data) + write!(f, "{:?}", self.deref()) } } -impl<'a, T: Dist> Clone for GlobalLockLocalData<'a, T> { +impl Clone for GlobalLockLocalData { fn clone(&self) -> Self { GlobalLockLocalData { array: self.array.clone(), - data: self.data, - index: self.index, lock: self.lock.clone(), lock_guard: self.lock_guard.clone(), } } } -impl<'a, T: Dist> GlobalLockLocalData<'a, T> { +impl GlobalLockLocalData { /// Convert into a smaller sub range of the local data, the original read lock is transfered to the new sub data to mainitain safety guarantees /// /// # Examples @@ -177,27 +171,30 @@ impl<'a, T: Dist> GlobalLockLocalData<'a, T> { /// let sub_data = local_data.clone().into_sub_data(10,20); // clone() essentially increases the references to the read lock by 1. 
/// assert_eq!(local_data[10],sub_data[0]); ///``` - pub fn into_sub_data(self, start: usize, end: usize) -> GlobalLockLocalData<'a, T> { + pub fn into_sub_data(self, start: usize, end: usize) -> GlobalLockLocalData { GlobalLockLocalData { - array: self.array.clone(), - data: &self.data[start..end], - index: 0, + array: self.array.sub_array(start..end), lock: self.lock, lock_guard: self.lock_guard, } } } -impl<'a, T: Dist + serde::Serialize> serde::Serialize for GlobalLockLocalData<'a, T> { +impl serde::Serialize for GlobalLockLocalData { fn serialize(&self, serializer: S) -> Result where S: serde::Serializer, { - self.data.serialize(serializer) + unsafe { self.array.array.local_as_mut_slice() }.serialize(serializer) } } -impl<'a, T: Dist> Iterator for GlobalLockLocalData<'a, T> { +pub struct GlobalLockLocalDataIter<'a, T: Dist> { + data: &'a [T], + index: usize, +} + +impl<'a, T: Dist> Iterator for GlobalLockLocalDataIter<'a, T> { type Item = &'a T; fn next(&mut self) -> Option { if self.index < self.data.len() { @@ -209,11 +206,22 @@ impl<'a, T: Dist> Iterator for GlobalLockLocalData<'a, T> { } } -impl Deref for GlobalLockLocalData<'_, T> { +impl<'a, T: Dist> IntoIterator for &'a GlobalLockLocalData { + type Item = &'a T; + type IntoIter = GlobalLockLocalDataIter<'a, T>; + fn into_iter(self) -> Self::IntoIter { + GlobalLockLocalDataIter { + data: unsafe { self.array.array.local_as_mut_slice() }, + index: 0, + } + } +} + +impl Deref for GlobalLockLocalData { type Target = [T]; fn deref(&self) -> &Self::Target { - self.data + unsafe { self.array.array.local_as_mut_slice() } } } @@ -268,36 +276,37 @@ impl GlobalLockArray { } } - // #[doc(alias("One-sided", "onesided"))] - // /// Return the calling PE's local data as a [GlobalLockLocalData], which allows safe immutable access to local elements. - // /// - // /// Calling this function will result in a local read lock being captured on the array - // /// - // /// This function is blocking and intended to be called from non asynchronous contexts. - // /// Calling within an asynchronous block may lead to deadlock. - // /// - // /// # One-sided Operation - // /// Only returns local data on the calling PE - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); - // /// - // /// let local_data = array.read_local_data(); - // /// println!("PE{my_pe} data: {local_data:?}"); - // ///``` - // pub fn read_local_data(&self) -> GlobalLockLocalData<'_, T> { - // GlobalLockLocalData { - // array: self.clone(), - // data: unsafe { self.array.local_as_mut_slice() }, - // index: 0, - // lock: self.lock.clone(), - // lock_guard: self.lock.read(), - // } - // } + #[doc(alias("One-sided", "onesided"))] + /// Return the calling PE's local data as a [GlobalLockLocalData], which allows safe immutable access to local elements. + /// + /// Calling this function will result in a local read lock being captured on the array + /// + /// This function is blocking and intended to be called from non asynchronous contexts. + /// Calling within an asynchronous block may lead to deadlock. 
+ /// + /// # One-sided Operation + /// Only returns local data on the calling PE + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// + /// let local_data = array.blocking_read_local_data(); + /// println!("PE{my_pe} data: {local_data:?}"); + ///``` + pub fn blocking_read_local_data(&self) -> GlobalLockLocalData { + let self_clone: GlobalLockArray = self.clone(); + self.block_on(async move { + GlobalLockLocalData { + array: self_clone.clone(), + lock: self_clone.lock.clone(), + lock_guard: self_clone.lock.read().await, + } + }) + } #[doc(alias("One-sided", "onesided"))] /// Return the calling PE's local data as a [GlobalLockLocalData], which allows safe immutable access to local elements. @@ -314,52 +323,54 @@ impl GlobalLockArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); + /// world.clone().block_on(async move { /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); /// - /// let local_data = array.block_on(array.read_local_data()); + /// let local_data = array.read_local_data().await; /// println!("PE{my_pe} data: {local_data:?}"); + /// }); ///``` - pub async fn read_local_data(&self) -> GlobalLockLocalData<'_, T> { + pub async fn read_local_data(&self) -> GlobalLockLocalData { GlobalLockLocalData { array: self.clone(), - data: unsafe { self.array.local_as_mut_slice() }, - index: 0, lock: self.lock.clone(), lock_guard: self.lock.read().await, } } - // #[doc(alias("One-sided", "onesided"))] - // /// Return the calling PE's local data as a [GlobalLockMutLocalData], which allows safe mutable access to local elements. - // /// - // /// Calling this function will result in the global write lock being captured on the array. - // ///. - // /// This function is blocking and intended to be called from non asynchronous contexts. - // /// Calling within an asynchronous block may lead to deadlock. - // /// - // /// # One-sided Operation - // /// Only returns (mutable) local data on the calling PE - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); - // /// - // /// let local_data = array.write_local_data(); - // /// println!("PE{my_pe} data: {local_data:?}"); - // ///``` - // pub fn write_local_data(&self) -> GlobalLockMutLocalData<'_, T> { - // let lock = self.lock.write(); - // let data = GlobalLockMutLocalData { - // data: unsafe { self.array.local_as_mut_slice() }, - // _index: 0, - // _lock_guard: lock, - // }; - // // println!("got lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); - // data - // } + #[doc(alias("One-sided", "onesided"))] + /// Return the calling PE's local data as a [GlobalLockMutLocalData], which allows safe mutable access to local elements. + /// + /// Calling this function will result in the global write lock being captured on the array. + ///. + /// This function is blocking and intended to be called from non asynchronous contexts. + /// Calling within an asynchronous block may lead to deadlock. 
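
// Illustrative sketch (not part of this patch) of the blocking_*/async pairing
// used by the new lock accessors: the blocking variant simply drives the async
// variant to completion with a block_on call, which is why it is intended only
// for non-async contexts. If called from inside a task on the same executor,
// block_on can occupy the worker thread that the current lock holder needs in
// order to make progress and release the lock, producing the deadlock warned
// about above. `SharedCounter` and its methods are made-up names for
// illustration (assuming tokio's sync primitives and the `futures` crate).
//
// use std::sync::Arc;
// use tokio::sync::{OwnedRwLockReadGuard, RwLock};
//
// #[derive(Clone)]
// struct SharedCounter {
//     inner: Arc<RwLock<u64>>,
// }
//
// impl SharedCounter {
//     // Async accessor: safe to call from within other async code.
//     async fn read(&self) -> OwnedRwLockReadGuard<u64> {
//         self.inner.clone().read_owned().await
//     }
//
//     // Blocking accessor for synchronous callers, mirroring the
//     // blocking_read_local_data()/blocking_write_local_data() pattern; calling
//     // it from inside an async task risks stalling the executor thread.
//     fn blocking_read(&self) -> OwnedRwLockReadGuard<u64> {
//         futures::executor::block_on(self.read())
//     }
// }
//
// fn main() {
//     let counter = SharedCounter { inner: Arc::new(RwLock::new(7)) };
//     let guard = counter.blocking_read(); // fine: not running inside an executor
//     assert_eq!(*guard, 7);
// }
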
+ /// + /// # One-sided Operation + /// Only returns (mutable) local data on the calling PE + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// + /// let local_data = array.blocking_write_local_data(); + /// println!("PE{my_pe} data: {local_data:?}"); + ///``` + pub fn blocking_write_local_data(&self) -> GlobalLockMutLocalData { + let self_clone: GlobalLockArray = self.clone(); + self.block_on(async move { + let lock = self_clone.lock.write().await; + let data = GlobalLockMutLocalData { + array: self_clone, + _lock_guard: lock, + }; + // println!("got lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); + data + }) + } #[doc(alias("One-sided", "onesided"))] /// Return the calling PE's local data as a [GlobalLockMutLocalData], which allows safe mutable access to local elements. @@ -376,23 +387,23 @@ impl GlobalLockArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// world.clone().block_on(async move { + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); /// - /// let local_data = array.block_on(array.write_local_data()); - /// println!("PE{my_pe} data: {local_data:?}"); + /// let local_data = array.write_local_data().await; + /// println!("PE{my_pe} data: {local_data:?}"); + /// }); ///``` - pub async fn write_local_data(&self) -> GlobalLockMutLocalData<'_, T> { + pub async fn write_local_data(&self) -> GlobalLockMutLocalData { let lock = self.lock.write().await; let data = GlobalLockMutLocalData { - data: unsafe { self.array.local_as_mut_slice() }, - _index: 0, + array: self.clone(), _lock_guard: lock, }; // println!("got lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); data } - #[doc(alias("Collective"))] /// Return the calling PE's local data as a [GlobalLockMutLocalData], which allows safe mutable access to local elements. /// /// Calling this function will result in the collective write lock being captured on the array @@ -409,97 +420,54 @@ impl GlobalLockArray { /// let my_pe = world.my_pe(); /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); /// - /// let local_data = array.block_on(array.collective_write_local_data()); + /// let local_data = array.blocking_collective_write_local_data(); /// println!("PE{my_pe} data: {local_data:?}"); ///``` - pub async fn collective_write_local_data(&self) -> GlobalLockCollectiveMutLocalData<'_, T> { + pub fn blocking_collective_write_local_data(&self) -> GlobalLockCollectiveMutLocalData { + let self_clone: GlobalLockArray = self.clone(); + self.block_on(async move { + let lock = self_clone.lock.collective_write().await; + let data = GlobalLockCollectiveMutLocalData { + array: self_clone, + _lock_guard: lock, + }; + // println!("got lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); + data + }) + } + + #[doc(alias("Collective"))] + /// Return the calling PE's local data as a [GlobalLockMutLocalData], which allows safe mutable access to local elements. 
+ /// + /// Calling this function will result in the collective write lock being captured on the array + /// + /// # Collective Operation + /// All PEs associated with this array must enter the call, otherwise deadlock will occur. + /// Upon return every PE will hold a special collective write lock so that they can all access their local data simultaneous + /// This lock prevents any other access from occuring on the array until it is dropped on all the PEs. + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// world.clone().block_on(async move { + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// + /// let local_data = array.collective_write_local_data().await; + /// println!("PE{my_pe} data: {local_data:?}"); + /// }); + ///``` + pub async fn collective_write_local_data(&self) -> GlobalLockCollectiveMutLocalData { let lock = self.lock.collective_write().await; let data = GlobalLockCollectiveMutLocalData { - data: unsafe { self.array.local_as_mut_slice() }, - _index: 0, + array: self.clone(), _lock_guard: lock, }; // println!("got lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); data } - // #[doc(hidden)] //todo create a custom macro to emit a warning saying use read_local_slice/write_local_slice intead - // pub(crate) async fn local_as_slice(&self) -> GlobalLockLocalData<'_, T> { - // let the_lock = self.lock.read().await; - // GlobalLockLocalData { - // array: self.clone(), - // data: unsafe { self.array.local_as_mut_slice() }, - // index: 0, - // lock: self.lock.clone(), - // lock_guard: the_lock, - // } - // } - // #[doc(hidden)] - // pub unsafe fn local_as_mut_slice(&self) -> &mut [T] { - // self.array.local_as_mut_slice() - // } - - // #[doc(hidden)] - // pub(crate) async fn local_as_mut_slice(&self) -> GlobalLockMutLocalData<'_, T> { - // let the_lock = self.lock.write().await; - // let lock = GlobalLockMutLocalData { - // data: unsafe { self.array.local_as_mut_slice() }, - // _index: 0, - // _lock_guard: the_lock, - // }; - // // println!("have lla write lock"); - // // println!("got lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); - // lock - // } - - // #[doc(alias("One-sided", "onesided"))] - // /// Return the calling PE's local data as a [GlobalLockLocalData], which allows safe immutable access to local elements. - // /// - // /// Calling this function will result in a local read lock being captured on the array - // /// - // /// While this call is safe, it may be more clear to use the [read_local_data()][GlobalLockArray::read_local_data] function. 
- // /// - // /// # One-sided Operation - // /// Only returns local data on the calling PE - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); - // /// - // /// let local_data = array.block_on(array.local_data()); - // /// println!("PE{my_pe} data: {local_data:?}"); - // ///``` - // pub async fn local_data(&self) -> GlobalLockLocalData<'_, T> { - // self.local_as_slice().await - // } - - // #[doc(alias("One-sided", "onesided"))] - // /// Return the calling PE's local data as a [GlobalLockMutLocalData], which allows safe immutable access to local elements. - // /// - // /// Calling this function will result in a local read lock being captured on the array - // /// - // /// While this call is safe, it may be more clear to use the [write_local_data()][GlobalLockArray::write_local_data] function. - // /// - // /// # One-sided Operation - // /// Only returns local data on the calling PE - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); - // /// - // /// let local_data = array.block_on(array.mut_local_data()); - // /// println!("PE{my_pe} data: {local_data:?}"); - // ///``` - // pub async fn mut_local_data(&self) -> GlobalLockMutLocalData<'_, T> { - // self.local_as_mut_slice().await - // } - #[doc(hidden)] pub unsafe fn __local_as_slice(&self) -> &[T] { self.array.local_as_mut_slice() @@ -834,10 +802,7 @@ impl LamellarArray for GlobalLockArray { self.array.wait_all() // println!("done in wait all {:?}",std::time::SystemTime::now()); } - fn block_on(&self, f: F) -> F::Output - where - F: Future, - { + fn block_on(&self, f: F) -> F::Output { self.array.block_on(f) } fn pe_and_offset_for_global_index(&self, index: usize) -> Option<(usize, usize)> { @@ -938,8 +903,9 @@ impl LamellarRequest for GlobalLockArrayReduceHandle { } impl LamellarArrayReduce for GlobalLockArray { - fn reduce(&self, op: &str) -> Pin>> { - let lock = self.array.block_on(self.lock.read()); + fn reduce(&self, op: &str) -> Pin + Send>> { + let lock: GlobalRwDarc<()> = self.lock.clone(); + let lock = self.array.block_on(async move { lock.read().await }); Box::new(GlobalLockArrayReduceHandle { req: self.array.reduce_data(op, self.clone().into()), _lock_guard: lock, @@ -950,20 +916,20 @@ impl LamellarArrayReduce for GlobalLockArray { impl LamellarArrayArithmeticReduce for GlobalLockArray { - fn sum(&self) -> Pin>> { + fn sum(&self) -> Pin + Send>> { self.reduce("sum") } - fn prod(&self) -> Pin>> { + fn prod(&self) -> Pin + Send>> { self.reduce("prod") } } impl LamellarArrayCompareReduce for GlobalLockArray { - fn max(&self) -> Pin>> { + fn max(&self) -> Pin + Send>> { self.reduce("max") } - fn min(&self) -> Pin>> { + fn min(&self) -> Pin + Send>> { self.reduce("min") } } diff --git a/src/array/global_lock_atomic/iteration.rs b/src/array/global_lock_atomic/iteration.rs index df71e494..0aeafafd 100644 --- a/src/array/global_lock_atomic/iteration.rs +++ b/src/array/global_lock_atomic/iteration.rs @@ -349,7 +349,8 @@ impl LamellarArrayIterators for GlobalLockArray { type OnesidedIter = OneSidedIter<'static, T, Self>; fn dist_iter(&self) -> Self::DistIter { - let lock = 
self.array.block_on(self.lock.read()); + let lock: GlobalRwDarc<()> = self.lock.clone(); + let lock = self.array.block_on(async move { lock.read().await }); self.barrier(); GlobalLockDistIter { data: self.clone(), @@ -361,7 +362,8 @@ impl LamellarArrayIterators for GlobalLockArray { } fn local_iter(&self) -> Self::LocalIter { - let lock = self.array.block_on(self.lock.read()); + let lock: GlobalRwDarc<()> = self.lock.clone(); + let lock = self.array.block_on(async move { lock.read().await }); GlobalLockLocalIter { data: self.clone(), lock: lock, @@ -389,7 +391,11 @@ impl LamellarArrayMutIterators for GlobalLockArray { type LocalIter = GlobalLockLocalIterMut; fn dist_iter_mut(&self) -> Self::DistIter { - let lock = Arc::new(self.array.block_on(self.lock.collective_write())); + let lock: GlobalRwDarc<()> = self.lock.clone(); + let lock = Arc::new( + self.array + .block_on(async move { lock.collective_write().await }), + ); self.barrier(); // println!("dist_iter thread {:?} got lock",std::thread::current().id()); GlobalLockDistIterMut { @@ -402,7 +408,8 @@ impl LamellarArrayMutIterators for GlobalLockArray { } fn local_iter_mut(&self) -> Self::LocalIter { - let lock = Arc::new(self.array.block_on(self.lock.write())); + let lock: GlobalRwDarc<()> = self.lock.clone(); + let lock = Arc::new(self.array.block_on(async move { lock.write().await })); GlobalLockLocalIterMut { data: self.clone(), lock: lock, diff --git a/src/array/iterator/distributed_iterator/consumer/count.rs b/src/array/iterator/distributed_iterator/consumer/count.rs index 6b32ab6b..66c4434d 100644 --- a/src/array/iterator/distributed_iterator/consumer/count.rs +++ b/src/array/iterator/distributed_iterator/consumer/count.rs @@ -4,7 +4,6 @@ use crate::array::iterator::distributed_iterator::DistributedIterator; use crate::array::iterator::{private::*, IterRequest}; use crate::lamellar_request::LamellarRequest; use crate::lamellar_team::LamellarTeamRT; -use crate::scheduler::SchedulerQueue; use crate::Darc; use async_trait::async_trait; @@ -80,7 +79,7 @@ impl LamellarAm for UpdateCntAm { } impl RemoteIterCountHandle { - async fn reduce_remote_counts(&self, local_cnt: usize, cnt: Darc) -> usize { + async fn reduce_remote_counts(self, local_cnt: usize, cnt: Darc) -> usize { self.team .exec_am_all(UpdateCntAm { remote_cnt: local_cnt, @@ -119,6 +118,7 @@ impl IterRequest for RemoteIterCountHandle { .sum::(); self.team .scheduler + .clone() .block_on(self.reduce_remote_counts(count, cnt)) } } diff --git a/src/array/iterator/distributed_iterator/consumer/reduce.rs b/src/array/iterator/distributed_iterator/consumer/reduce.rs index 5cc8493b..c74a5ed5 100644 --- a/src/array/iterator/distributed_iterator/consumer/reduce.rs +++ b/src/array/iterator/distributed_iterator/consumer/reduce.rs @@ -81,7 +81,7 @@ where F: Fn(T, T) -> T + SyncSend + Clone + 'static, { fn reduce_remote_vals(&self, local_val: Option) -> Option { - self.team.barrier(); + self.team.tasking_barrier(); let local_vals = UnsafeArray::>::new(&self.team, self.team.num_pes, Distribution::Block); unsafe { diff --git a/src/array/iterator/one_sided_iterator/buffered.rs b/src/array/iterator/one_sided_iterator/buffered.rs index 8a42178a..09650d96 100644 --- a/src/array/iterator/one_sided_iterator/buffered.rs +++ b/src/array/iterator/one_sided_iterator/buffered.rs @@ -1,7 +1,5 @@ use crate::array::iterator::one_sided_iterator::*; use crate::array::LamellarArrayRequest; -// use crate::LamellarArray; -// use crate::scheduler::SchedulerQueue; use crate::memregion::OneSidedMemoryRegion; use 
std::collections::VecDeque; use std::ops::Deref; @@ -91,8 +89,6 @@ impl Deref for BufferedItem { } } - - impl OneSidedIterator for Buffered where I: OneSidedIterator + Send, diff --git a/src/array/local_lock_atomic.rs b/src/array/local_lock_atomic.rs index 670ee959..d52141ed 100644 --- a/src/array/local_lock_atomic.rs +++ b/src/array/local_lock_atomic.rs @@ -74,27 +74,26 @@ impl LocalLockByteArrayWeak { /// /// When the instance is dropped the lock is released. #[derive(Debug)] -pub struct LocalLockMutLocalData<'a, T: Dist> { - data: &'a mut [T], - _index: usize, - _lock_guard: RwLockWriteGuardArc>, +pub struct LocalLockMutLocalData { + array: LocalLockArray, + _lock_guard: RwLockWriteGuardArc<()>, } -// impl Drop for LocalLockMutLocalData<'_, T> { +// impl Drop for LocalLockMutLocalData { // fn drop(&mut self) { // // println!("release lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); // } // } -impl Deref for LocalLockMutLocalData<'_, T> { +impl Deref for LocalLockMutLocalData { type Target = [T]; fn deref(&self) -> &Self::Target { - self.data + unsafe { self.array.array.local_as_mut_slice() } } } -impl DerefMut for LocalLockMutLocalData<'_, T> { +impl DerefMut for LocalLockMutLocalData { fn deref_mut(&mut self) -> &mut Self::Target { - self.data + unsafe { self.array.array.local_as_mut_slice() } } } @@ -107,28 +106,24 @@ impl DerefMut for LocalLockMutLocalData<'_, T> { /// /// When the instance is dropped the lock is released. #[derive(Debug)] -pub struct LocalLockLocalData<'a, T: Dist> { +pub struct LocalLockLocalData { pub(crate) array: LocalLockArray, - pub(crate) data: &'a [T], - index: usize, lock: LocalRwDarc<()>, - lock_guard: Arc>>, + lock_guard: Arc>, } -impl<'a, T: Dist> Clone for LocalLockLocalData<'a, T> { +impl<'a, T: Dist> Clone for LocalLockLocalData { fn clone(&self) -> Self { // println!("getting read lock in LocalLockLocalData clone"); LocalLockLocalData { array: self.array.clone(), - data: self.data, - index: self.index, lock: self.lock.clone(), lock_guard: self.lock_guard.clone(), } } } -// impl<'a, T: Dist> Drop for LocalLockLocalData<'a, T> { +// impl<'a, T: Dist> Drop for LocalLockLocalData { // fn drop(&mut self) { // println!( // "dropping read lock {:?}", @@ -137,13 +132,13 @@ impl<'a, T: Dist> Clone for LocalLockLocalData<'a, T> { // } // } -// impl<'a, T: Dist> Drop for LocalLockMutLocalData<'a, T> { +// impl<'a, T: Dist> Drop for LocalLockMutLocalData { // fn drop(&mut self) { // println!("dropping write lock"); // } // } -impl<'a, T: Dist> LocalLockLocalData<'a, T> { +impl<'a, T: Dist> LocalLockLocalData { /// Convert into a smaller sub range of the local data, the original read lock is transfered to the new sub data to mainitain safety guarantees /// /// # Examples @@ -159,27 +154,30 @@ impl<'a, T: Dist> LocalLockLocalData<'a, T> { /// let sub_data = local_data.clone().into_sub_data(10,20); // clone() essentially increases the references to the read lock by 1. 
/// assert_eq!(local_data[10],sub_data[0]); ///``` - pub fn into_sub_data(self, start: usize, end: usize) -> LocalLockLocalData<'a, T> { + pub fn into_sub_data(self, start: usize, end: usize) -> LocalLockLocalData { LocalLockLocalData { - array: self.array.clone(), - data: &self.data[start..end], - index: 0, + array: self.array.sub_array(start..end), lock: self.lock.clone(), lock_guard: self.lock_guard.clone(), } } } -impl<'a, T: Dist + serde::Serialize> serde::Serialize for LocalLockLocalData<'a, T> { +impl<'a, T: Dist + serde::Serialize> serde::Serialize for LocalLockLocalData { fn serialize(&self, serializer: S) -> Result where S: serde::Serializer, { - self.data.serialize(serializer) + unsafe { self.array.array.local_as_mut_slice() }.serialize(serializer) } } -impl<'a, T: Dist> Iterator for LocalLockLocalData<'a, T> { +pub struct LocalLockLocalDataIter<'a, T: Dist> { + data: &'a [T], + index: usize, +} + +impl<'a, T: Dist> Iterator for LocalLockLocalDataIter<'a, T> { type Item = &'a T; fn next(&mut self) -> Option { if self.index < self.data.len() { @@ -191,11 +189,22 @@ impl<'a, T: Dist> Iterator for LocalLockLocalData<'a, T> { } } -impl Deref for LocalLockLocalData<'_, T> { +impl<'a, T: Dist> IntoIterator for &'a LocalLockLocalData { + type Item = &'a T; + type IntoIter = LocalLockLocalDataIter<'a, T>; + fn into_iter(self) -> Self::IntoIter { + LocalLockLocalDataIter { + data: unsafe { self.array.array.local_as_mut_slice() }, + index: 0, + } + } +} + +impl Deref for LocalLockLocalData { type Target = [T]; fn deref(&self) -> &Self::Target { - self.data + unsafe { self.array.array.local_as_mut_slice() } } } @@ -251,34 +260,35 @@ impl LocalLockArray { } } - // #[doc(alias("One-sided", "onesided"))] - // /// Return the calling PE's local data as a [LocalLockLocalData], which allows safe immutable access to local elements. - // /// - // /// Calling this function will result in a local read lock being captured on the array - // /// - // /// # One-sided Operation - // /// Only returns local data on the calling PE - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); - // /// - // /// let local_data = array.read_local_data(); - // /// println!("PE{my_pe} data: {local_data:?}"); - // ///``` - // pub fn read_local_data(&self) -> LocalLockLocalData<'_, T> { - // // println!("getting read lock in read_local_local"); - // LocalLockLocalData { - // array: self.clone(), - // data: unsafe { self.array.local_as_mut_slice() }, - // index: 0, - // lock: self.lock.clone(), - // lock_guard: Arc::new(self.lock.read()), - // } - // } + #[doc(alias("One-sided", "onesided"))] + /// Return the calling PE's local data as a [LocalLockLocalData], which allows safe immutable access to local elements. 
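Since `LocalLockLocalData` now exposes element iteration through the new `IntoIterator` impl on a reference (backed by `LocalLockLocalDataIter`) rather than implementing `Iterator` itself, here is a minimal usage sketch, assuming a `LocalLockArray<usize>` named `array` as in the surrounding doc examples:

```rust
let local_data = array.blocking_read_local_data(); // read lock held while the guard lives
for elem in &local_data {
    // `elem: &usize`, yielded by the new LocalLockLocalDataIter
    println!("{elem}");
}
drop(local_data); // release the read lock
```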
+ /// + /// Calling this function will result in a local read lock being captured on the array + /// + /// # One-sided Operation + /// Only returns local data on the calling PE + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// + /// let local_data = array.blocking_read_local_data(); + /// println!("PE{my_pe} data: {local_data:?}"); + ///``` + pub fn blocking_read_local_data(&self) -> LocalLockLocalData { + // println!("getting read lock in read_local_local"); + let self_clone: LocalLockArray = self.clone(); + self.block_on(async move { + LocalLockLocalData { + array: self_clone.clone(), + lock: self_clone.lock.clone(), + lock_guard: Arc::new(self_clone.lock.read().await), + } + }) + } /// TODO: UPDATE /// Return the calling PE's local data as a [LocalLockLocalData], which allows safe immutable access to local elements. @@ -293,51 +303,53 @@ impl LocalLockArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// world.clone().block_on(async move { + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); /// - /// let local_data = array.block_on(array.read_local_data()); - /// println!("PE{my_pe} data: {local_data:?}"); + /// let local_data = array.read_local_data().await; + /// println!("PE{my_pe} data: {local_data:?}"); + /// }); ///``` - pub async fn read_local_data(&self) -> LocalLockLocalData<'_, T> { + pub async fn read_local_data(&self) -> LocalLockLocalData { // println!("getting read lock in read_local_local"); LocalLockLocalData { array: self.clone(), - data: unsafe { self.array.local_as_mut_slice() }, - index: 0, lock: self.lock.clone(), lock_guard: Arc::new(self.lock.read().await), } } - // #[doc(alias("One-sided", "onesided"))] - // /// Return the calling PE's local data as a [LocalLockMutLocalData], which allows safe mutable access to local elements. - // /// - // /// Calling this function will result in the local write lock being captured on the array - // /// - // /// # One-sided Operation - // /// Only returns (mutable) local data on the calling PE - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); - // /// - // /// let local_data = array.write_local_data(); - // /// println!("PE{my_pe} data: {local_data:?}"); - // ///``` - // pub fn write_local_data(&self) -> LocalLockMutLocalData<'_, T> { - // // println!("getting write lock in write_local_data"); - // let lock = self.lock.write(); - // let data = LocalLockMutLocalData { - // data: unsafe { self.array.local_as_mut_slice() }, - // _index: 0, - // _lock_guard: lock, - // }; - // // println!("got lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); - // data - // } + #[doc(alias("One-sided", "onesided"))] + /// Return the calling PE's local data as a [LocalLockMutLocalData], which allows safe mutable access to local elements. 
+ /// + /// Calling this function will result in the local write lock being captured on the array + /// + /// # One-sided Operation + /// Only returns (mutable) local data on the calling PE + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// + /// let local_data = array.blocking_write_local_data(); + /// println!("PE{my_pe} data: {local_data:?}"); + ///``` + pub fn blocking_write_local_data(&self) -> LocalLockMutLocalData { + // println!("getting write lock in write_local_data"); + let self_clone: LocalLockArray = self.clone(); + self.block_on(async move { + let lock = self_clone.lock.write().await; + let data = LocalLockMutLocalData { + array: self_clone, + _lock_guard: lock, + }; + // println!("got lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); + data + }) + } #[doc(alias("One-sided", "onesided"))] /// TODO: UPDATE @@ -353,150 +365,24 @@ impl LocalLockArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// world.clone().block_on(async move { + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); /// - /// let local_data = array.block_on(array.write_local_data()); - /// println!("PE{my_pe} data: {local_data:?}"); + /// let local_data = array.write_local_data().await; + /// println!("PE{my_pe} data: {local_data:?}"); + /// }); ///``` - pub async fn write_local_data(&self) -> LocalLockMutLocalData<'_, T> { + pub async fn write_local_data(&self) -> LocalLockMutLocalData { // println!("getting write lock in write_local_data"); let lock = self.lock.write().await; let data = LocalLockMutLocalData { - data: unsafe { self.array.local_as_mut_slice() }, - _index: 0, + array: self.clone(), _lock_guard: lock, }; // println!("got lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); data } - // #[doc(hidden)] //todo create a custom macro to emit a warning saying use read_local_slice/write_local_slice intead - // pub(crate) async fn local_as_slice(&self) -> LocalLockLocalData<'_, T> { - // // println!("getting read lock in local_as_slice"); - // let lock = LocalLockLocalData { - // array: self.clone(), - // data: unsafe { self.array.local_as_mut_slice() }, - // index: 0, - // lock: self.lock.clone(), - // lock_guard: Arc::new(self.lock.read().await), - // }; - // // println!("got read lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); - // lock - // } - // #[doc(hidden)] - // pub unsafe fn local_as_mut_slice(&self) -> &mut [T] { - // self.array.local_as_mut_slice() - // } - - // #[doc(hidden)] - // pub(crate) async fn local_as_mut_slice(&self) -> LocalLockMutLocalData<'_, T> { - // // println!("getting write lock in local_as_mut_slice"); - // let the_lock = self.lock.write().await; - // let lock = LocalLockMutLocalData { - // data: unsafe { self.array.local_as_mut_slice() }, - // _index: 0, - // _lock_guard: the_lock, - // }; - // // println!("have lla write lock"); - // // println!("got write lock! 
{:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); - // lock - // } - - // #[doc(alias("One-sided", "onesided"))] - // /// Return the calling PE's local data as a [LocalLockLocalData], which allows safe immutable access to local elements. - // /// - // /// Calling this function will result in a local read lock being captured on the array - // /// - // /// While this call is safe, it may be more clear to use the [read_local_data()][LocalLockArray::read_local_data] function. - // /// - // /// # One-sided Operation - // /// Only returns local data on the calling PE - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); - // /// - // /// let local_data = array.local_data(); - // /// println!("PE{my_pe} data: {local_data:?}"); - // ///``` - // pub fn local_data(&self) -> LocalLockLocalData<'_, T> { - // self.local_as_slice() - // } - - // /// Return the calling PE's local data as a [LocalLockLocalData], which allows safe immutable access to local elements. - // /// - // /// Calling this function will result in a local read lock being captured on the array - // /// - // /// While this call is safe, it may be more clear to use the [read_local_data()][LocalLockArray::read_local_data] function. - // /// - // /// # One-sided Operation - // /// Only returns local data on the calling PE - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); - // /// - // /// let local_data = array.block_on(array.local_data()); - // /// println!("PE{my_pe} data: {local_data:?}"); - // ///``` - // pub async fn local_data(&self) -> LocalLockLocalData<'_, T> { - // self.read_local_data().await - // } - - // #[doc(alias("One-sided", "onesided"))] - // /// Return the calling PE's local data as a [LocalLockMutLocalData], which allows safe immutable access to local elements. - // /// - // /// Calling this function will result in a local read lock being captured on the array - // /// - // /// While this call is safe, it may be more clear to use the [write_local_data()][LocalLockArray::write_local_data] function. - // /// - // /// # One-sided Operation - // /// Only returns local data on the calling PE - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); - // /// - // /// let local_data = array.mut_local_data(); - // /// println!("PE{my_pe} data: {local_data:?}"); - // ///``` - // pub fn mut_local_data(&self) -> LocalLockMutLocalData<'_, T> { - // self.local_as_mut_slice() - // } - - // /// TODO: UPDATE - // /// Return the calling PE's local data as a [LocalLockMutLocalData], which allows safe immutable access to local elements. - // /// - // /// Calling this function will result in a local read lock being captured on the array - // /// - // /// While this call is safe, it may be more clear to use the [write_local_data()][LocalLockArray::write_local_data] function. 
- // /// - // /// # One-sided Operation - // /// Only returns local data on the calling PE - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); - // /// - // /// let local_data = array.block_on(array.mut_local_data()); - // /// println!("PE{my_pe} data: {local_data:?}"); - // ///``` - // pub async fn mut_local_data(&self) -> LocalLockMutLocalData<'_, T> { - // self.write_local_data().await - // } - #[doc(hidden)] pub unsafe fn __local_as_slice(&self) -> &[T] { self.array.local_as_mut_slice() @@ -831,10 +717,7 @@ impl LamellarArray for LocalLockArray { self.array.wait_all() // println!("done in wait all {:?}",std::time::SystemTime::now()); } - fn block_on(&self, f: F) -> F::Output - where - F: Future, - { + fn block_on(&self, f: F) -> F::Output { self.array.block_on(f) } fn pe_and_offset_for_global_index(&self, index: usize) -> Option<(usize, usize)> { @@ -917,7 +800,7 @@ impl ArrayPrint for LocalLockArray { #[doc(hidden)] pub struct LocalLockArrayReduceHandle { req: Box>, - _lock_guard: RwLockReadGuardArc>, + _lock_guard: RwLockReadGuardArc<()>, } #[async_trait] @@ -932,8 +815,9 @@ impl LamellarRequest for LocalLockArrayReduceHandle { } impl LamellarArrayReduce for LocalLockArray { - fn reduce(&self, op: &str) -> Pin>> { - let lock = self.array.block_on(self.lock.read()); + fn reduce(&self, op: &str) -> Pin + Send>> { + let lock: LocalRwDarc<()> = self.lock.clone(); + let lock = self.array.block_on(async move { lock.read().await }); Box::new(LocalLockArrayReduceHandle { req: self.array.reduce_data(op, self.clone().into()), _lock_guard: lock, @@ -944,20 +828,20 @@ impl LamellarArrayReduce for LocalLockArray { impl LamellarArrayArithmeticReduce for LocalLockArray { - fn sum(&self) -> Pin>> { + fn sum(&self) -> Pin + Send>> { self.reduce("sum") } - fn prod(&self) -> Pin>> { + fn prod(&self) -> Pin + Send>> { self.reduce("prod") } } impl LamellarArrayCompareReduce for LocalLockArray { - fn max(&self) -> Pin>> { + fn max(&self) -> Pin + Send>> { self.reduce("max") } - fn min(&self) -> Pin>> { + fn min(&self) -> Pin + Send>> { self.reduce("min") } } diff --git a/src/array/local_lock_atomic/iteration.rs b/src/array/local_lock_atomic/iteration.rs index c231e4e4..983c269e 100644 --- a/src/array/local_lock_atomic/iteration.rs +++ b/src/array/local_lock_atomic/iteration.rs @@ -22,7 +22,7 @@ use async_lock::{RwLockReadGuardArc, RwLockWriteGuardArc}; #[derive(Clone)] pub struct LocalLockDistIter<'a, T: Dist> { data: LocalLockArray, - lock: Arc>>, + lock: Arc>, cur_i: usize, end_i: usize, _marker: PhantomData<&'a T>, @@ -56,7 +56,7 @@ impl<'a, T: Dist> std::fmt::Debug for LocalLockDistIter<'a, T> { #[derive(Clone)] pub struct LocalLockLocalIter<'a, T: Dist> { data: LocalLockArray, - lock: Arc>>, + lock: Arc>, cur_i: usize, end_i: usize, _marker: PhantomData<&'a T>, @@ -183,7 +183,7 @@ impl IndexedLocalIterator for LocalLockLocalIter<'static, T> pub struct LocalLockDistIterMut<'a, T: Dist> { data: LocalLockArray, - lock: Arc>>, + lock: Arc>, cur_i: usize, end_i: usize, _marker: PhantomData<&'a T>, @@ -215,7 +215,7 @@ impl<'a, T: Dist> std::fmt::Debug for LocalLockDistIterMut<'a, T> { pub struct LocalLockLocalIterMut<'a, T: Dist> { data: LocalLockArray, - lock: Arc>>, + lock: Arc>, cur_i: usize, end_i: usize, _marker: PhantomData<&'a T>, @@ -353,7 +353,9 @@ impl LamellarArrayIterators for 
LocalLockArray { type OnesidedIter = OneSidedIter<'static, T, Self>; fn dist_iter(&self) -> Self::DistIter { - let lock = Arc::new(self.array.block_on(self.lock.read())); + // let the_array: LocalLockArray = self.clone(); + let lock: LocalRwDarc<()> = self.lock.clone(); + let lock = Arc::new(self.array.block_on(async move { lock.read().await })); self.barrier(); LocalLockDistIter { data: self.clone(), @@ -365,7 +367,8 @@ impl LamellarArrayIterators for LocalLockArray { } fn local_iter(&self) -> Self::LocalIter { - let lock = Arc::new(self.array.block_on(self.lock.read())); + let lock: LocalRwDarc<()> = self.lock.clone(); + let lock = Arc::new(self.array.block_on(async move { lock.read().await })); LocalLockLocalIter { data: self.clone(), lock: lock, @@ -393,7 +396,8 @@ impl LamellarArrayMutIterators for LocalLockArray { type LocalIter = LocalLockLocalIterMut<'static, T>; fn dist_iter_mut(&self) -> Self::DistIter { - let lock = Arc::new(self.array.block_on(self.lock.write())); + let lock: LocalRwDarc<()> = self.lock.clone(); + let lock = Arc::new(self.array.block_on(async move { lock.write().await })); self.barrier(); // println!("dist_iter thread {:?} got lock",std::thread::current().id()); LocalLockDistIterMut { @@ -407,7 +411,8 @@ impl LamellarArrayMutIterators for LocalLockArray { fn local_iter_mut(&self) -> Self::LocalIter { // println!("trying to get write lock for iter"); - let lock = Arc::new(self.array.block_on(self.lock.write())); + let lock: LocalRwDarc<()> = self.lock.clone(); + let lock = Arc::new(self.array.block_on(async move { lock.write().await })); // println!("got write lock for iter"); LocalLockLocalIterMut { data: self.clone(), diff --git a/src/array/native_atomic.rs b/src/array/native_atomic.rs index 7e0e046b..590f9b48 100644 --- a/src/array/native_atomic.rs +++ b/src/array/native_atomic.rs @@ -1134,10 +1134,7 @@ impl LamellarArray for NativeAtomicArray { self.array.wait_all() // println!("done in wait all {:?}",std::time::SystemTime::now()); } - fn block_on(&self, f: F) -> F::Output - where - F: Future, - { + fn block_on(&self, f: F) -> F::Output { self.array.block_on(f) } fn pe_and_offset_for_global_index(&self, index: usize) -> Option<(usize, usize)> { @@ -1207,7 +1204,7 @@ impl ArrayPrint for NativeAtomicArray { } impl LamellarArrayReduce for NativeAtomicArray { - fn reduce(&self, op: &str) -> Pin>> { + fn reduce(&self, op: &str) -> Pin + Send>> { self.array .reduce_data(op, self.clone().into()) .into_future() @@ -1216,20 +1213,20 @@ impl LamellarArrayReduce for NativeAtomicArray impl LamellarArrayArithmeticReduce for NativeAtomicArray { - fn sum(&self) -> Pin>> { + fn sum(&self) -> Pin + Send>> { self.reduce("sum") } - fn prod(&self) -> Pin>> { + fn prod(&self) -> Pin + Send>> { self.reduce("prod") } } impl LamellarArrayCompareReduce for NativeAtomicArray { - fn max(&self) -> Pin>> { + fn max(&self) -> Pin + Send>> { self.reduce("max") } - fn min(&self) -> Pin>> { + fn min(&self) -> Pin + Send>> { self.reduce("min") } } diff --git a/src/array/operations.rs b/src/array/operations.rs index d04041d1..104b37a8 100644 --- a/src/array/operations.rs +++ b/src/array/operations.rs @@ -6,7 +6,7 @@ use crate::array::local_lock_atomic::*; use crate::array::native_atomic::*; use crate::array::{AmDist, Dist, LamellarArrayRequest, LamellarEnv, LamellarWriteArray}; use crate::lamellar_request::LamellarRequest; -use crate::scheduler::{Scheduler, SchedulerQueue}; +use crate::scheduler::Scheduler; use crate::LamellarTeamRT; pub(crate) mod access; @@ -230,8 +230,8 @@ pub enum 
OpInputEnum<'a, T: Dist> { Vec(Vec), NativeAtomicLocalData(NativeAtomicLocalData), GenericAtomicLocalData(GenericAtomicLocalData), - LocalLockLocalData(LocalLockLocalData<'a, T>), - GlobalLockLocalData(GlobalLockLocalData<'a, T>), + LocalLockLocalData(LocalLockLocalData), + GlobalLockLocalData(GlobalLockLocalData), // Iter(Box + 'a>), // while it would be convienient to directly use the following, doing so @@ -305,52 +305,47 @@ impl<'a, T: Dist> OpInputEnum<'_, T> { // //#[tracing::instrument(skip_all)] pub(crate) fn into_vec_chunks(self, chunk_size: usize) -> Vec> { match self { - OpInputEnum::Val(v) => vec![vec![v]], + OpInputEnum::Val(v) =>vec![vec![v]], OpInputEnum::Slice(s) => s.chunks(chunk_size).map(|chunk| chunk.to_vec()).collect(), OpInputEnum::Vec(v) => v.chunks(chunk_size).map(|chunk| chunk.to_vec()).collect(), OpInputEnum::NativeAtomicLocalData(a) => { let mut data = Vec::with_capacity(chunk_size); - a.iter() - .enumerate() - .filter_map(move |(i, elem)| { - data.push(elem.load()); - if data.len() == chunk_size || i == a.len() - 1 { - let mut new_data = Vec::with_capacity(chunk_size); - std::mem::swap(&mut data, &mut new_data); - Some(new_data) - } else { - None - } - }) - .collect() + a.iter().enumerate().filter_map(move |(i, elem)| { + data.push(elem.load()); + if data.len() == chunk_size || i == a.len() - 1 { + let mut new_data = Vec::with_capacity(chunk_size); + std::mem::swap(&mut data, &mut new_data); + Some(new_data) + } else { + None + } + }).collect() } OpInputEnum::GenericAtomicLocalData(a) => { let mut data = Vec::with_capacity(chunk_size); - a.iter() - .enumerate() - .filter_map(move |(i, elem)| { - data.push(elem.load()); - if data.len() == chunk_size || i == a.len() - 1 { - let mut new_data = Vec::with_capacity(chunk_size); - std::mem::swap(&mut data, &mut new_data); - Some(new_data) - } else { - None - } - }) - .collect() + a.iter().enumerate().filter_map(move |(i, elem)| { + data.push(elem.load()); + if data.len() == chunk_size || i == a.len() - 1 { + let mut new_data = Vec::with_capacity(chunk_size); + std::mem::swap(&mut data, &mut new_data); + Some(new_data) + } else { + None + } + }).collect() } OpInputEnum::LocalLockLocalData(a) => { a.chunks(chunk_size).map(|chunk| chunk.to_vec()).collect() } OpInputEnum::GlobalLockLocalData(a) => { a.chunks(chunk_size).map(|chunk| chunk.to_vec()).collect() - } // OpInputEnum::MemoryRegion(mr) => *unsafe { mr.as_slice() } - // .expect("memregion not local") - // .first() - // .expect("memregion is empty"), + } + // OpInputEnum::MemoryRegion(mr) => *unsafe { mr.as_slice() } + // .expect("memregion not local") + // .first() + // .expect("memregion is empty"), } } @@ -687,8 +682,8 @@ impl<'a, T: Dist> OpInput<'a, T> for Vec { // } // } -impl<'a, T: Dist> OpInput<'a, T> for &'a LocalLockLocalData<'_, T> { - //#[tracing::instrument(skip_all)] +impl<'a, T: Dist> OpInput<'a, T> for &'a LocalLockLocalData { + // #[tracing::instrument(skip_all)] fn as_op_input(self) -> (Vec>, usize) { let len = self.len(); let mut iters = vec![]; @@ -727,8 +722,8 @@ impl<'a, T: Dist> OpInput<'a, T> for &'a LocalLockLocalData<'_, T> { } } -impl<'a, T: Dist> OpInput<'a, T> for &'a GlobalLockLocalData<'_, T> { - //#[tracing::instrument(skip_all)] +impl<'a, T: Dist> OpInput<'a, T> for &'a GlobalLockLocalData { + // #[tracing::instrument(skip_all)] fn as_op_input(self) -> (Vec>, usize) { let len = self.len(); let mut iters = vec![]; diff --git a/src/array/read_only.rs b/src/array/read_only.rs index 99e7486a..f11eec7b 100644 --- a/src/array/read_only.rs 
+++ b/src/array/read_only.rs @@ -468,7 +468,7 @@ impl From for ReadOnlyArray { } impl LamellarArrayReduce for ReadOnlyArray { - fn reduce(&self, op: &str) -> Pin>> { + fn reduce(&self, op: &str) -> Pin + Send>> { self.array .reduce_data(op, self.clone().into()) .into_future() @@ -477,20 +477,20 @@ impl LamellarArrayReduce for ReadOnlyArray { impl LamellarArrayArithmeticReduce for ReadOnlyArray { - fn sum(&self) -> Pin>> { + fn sum(&self) -> Pin + Send>> { self.reduce("sum") } - fn prod(&self) -> Pin>> { + fn prod(&self) -> Pin + Send>> { self.reduce("prod") } } impl LamellarArrayCompareReduce for ReadOnlyArray { - fn max(&self) -> Pin>> { + fn max(&self) -> Pin + Send>> { self.reduce("max") } - fn min(&self) -> Pin>> { + fn min(&self) -> Pin + Send>> { self.reduce("min") } } @@ -551,10 +551,7 @@ impl LamellarArray for ReadOnlyArray { self.array.wait_all() // println!("done in wait all {:?}",std::time::SystemTime::now()); } - fn block_on(&self, f: F) -> F::Output - where - F: Future, - { + fn block_on(&self, f: F) -> F::Output { self.array.block_on(f) } fn pe_and_offset_for_global_index(&self, index: usize) -> Option<(usize, usize)> { diff --git a/src/array/unsafe.rs b/src/array/unsafe.rs index 05c9422e..e32a3d36 100644 --- a/src/array/unsafe.rs +++ b/src/array/unsafe.rs @@ -12,7 +12,6 @@ use crate::darc::{Darc, DarcMode, WeakDarc}; use crate::lamellae::AllocationType; use crate::lamellar_team::{IntoLamellarTeam, LamellarTeamRT}; use crate::memregion::{Dist, MemoryRegion}; -use crate::scheduler::SchedulerQueue; use crate::LamellarTaskGroup; use core::marker::PhantomData; use std::ops::Bound; @@ -369,9 +368,10 @@ impl UnsafeArray { self.wait_all(); // println!("block on outstanding"); // self.inner.data.print(); + // let the_array: UnsafeArray = self.clone(); + let array_darc = self.inner.data.clone(); self.team_rt() - .block_on(self.inner.data.block_on_outstanding(mode, 0)); - // self.inner.data.print(); + .block_on(array_darc.block_on_outstanding(mode, 1)); //one for this instance of the array } #[doc(alias = "Collective")] @@ -811,10 +811,7 @@ impl LamellarArray for UnsafeArray { // println!("done in wait all {:?}",std::time::SystemTime::now()); } - fn block_on(&self, f: F) -> F::Output - where - F: Future, - { + fn block_on(&self, f: F) -> F::Output { self.inner.data.team.scheduler.block_on(f) } @@ -999,7 +996,7 @@ impl UnsafeArray { /// let sum = array.block_on(array.reduce("sum")); // equivalent to calling array.sum() /// //assert_eq!(array.len()*num_pes,sum); // may or may not fail ///``` - pub unsafe fn reduce(&self, op: &str) -> Pin>> { + pub unsafe fn reduce(&self, op: &str) -> Pin + Send>> { self.reduce_data(op, self.clone().into()).into_future() } @@ -1035,7 +1032,7 @@ impl UnsafeArray { /// let sum = array.block_on(unsafe{array.sum()}); //Safe in this instance as we have ensured no updates are currently happening /// // assert_eq!(array.len()*num_pes,sum);//this may or may not fail ///``` - pub unsafe fn sum(&self) -> Pin>> { + pub unsafe fn sum(&self) -> Pin + Send>> { self.reduce("sum") } @@ -1072,7 +1069,7 @@ impl UnsafeArray { /// let prod = array.block_on(array.prod()); /// assert_eq!((1..=array.len()).product::(),prod); ///``` - pub unsafe fn prod(&self) -> Pin>> { + pub unsafe fn prod(&self) -> Pin + Send>> { self.reduce("prod") } @@ -1103,7 +1100,7 @@ impl UnsafeArray { /// let max = array.block_on(max_req); /// assert_eq!((array.len()-1)*2,max); ///``` - pub unsafe fn max(&self) -> Pin>> { + pub unsafe fn max(&self) -> Pin + Send>> { self.reduce("max") } @@ -1134,7 
+1131,7 @@ impl UnsafeArray { /// let min = array.block_on(min_req); /// assert_eq!(0,min); ///``` - pub unsafe fn min(&self) -> Pin>> { + pub unsafe fn min(&self) -> Pin + Send>> { self.reduce("min") } } diff --git a/src/array/unsafe/operations.rs b/src/array/unsafe/operations.rs index 873caf9d..b71be633 100644 --- a/src/array/unsafe/operations.rs +++ b/src/array/unsafe/operations.rs @@ -2,7 +2,6 @@ use crate::active_messaging::LamellarArcAm; use crate::array::operations::*; use crate::array::r#unsafe::UnsafeArray; use crate::array::{AmDist, Dist, LamellarArray, LamellarByteArray, LamellarEnv}; -use crate::scheduler::SchedulerQueue; use futures::Future; use parking_lot::Mutex; use std::any::TypeId; @@ -394,13 +393,14 @@ impl UnsafeArray { self.inner.data.array_counters.add_send_req(1); self.inner.data.team.inc_counters(1); let index_vec = index.to_vec(); + let the_array: UnsafeArray = self.clone(); // println!("num_reqs {:?}",num_reqs); let the_array: UnsafeArray = self.clone(); self.inner .data .team .scheduler - .submit_immediate_task2(async move { + .submit_immediate_task(async move { let mut buffs = vec![Vec::with_capacity(num_per_batch * index_size.len()); num_pes]; let mut res_buffs = vec![Vec::with_capacity(num_per_batch); num_pes]; @@ -486,12 +486,12 @@ impl UnsafeArray { start_i += len; } + // We need this loop so that we ensure all the internal AMs have launched so calls like wait_all work properly + while cnt.load(Ordering::SeqCst) < num_reqs { + self.inner.data.team.scheduler.exec_task(); + } // println!("futures len {:?}",futures.lock().len()); Box::pin(async move { - while cnt.load(Ordering::SeqCst) < num_reqs { - // self.inner.data.team.scheduler.exec_task(); - async_std::task::yield_now().await; - } // println!("futures len {:?}",futures.lock().len()); futures::future::join_all(futures.lock().drain(..)).await }) @@ -526,6 +526,7 @@ impl UnsafeArray { let num_reqs = vals.len(); // println!("num_reqs {:?}",num_reqs); let mut start_i = 0; + let scheduler = self.inner.data.team.scheduler.clone(); for val in vals.drain(..) 
{ let cnt2 = cnt.clone(); let futures2 = futures.clone(); @@ -533,60 +534,54 @@ impl UnsafeArray { let len = val.len(); self.inner.data.array_counters.add_send_req(1); self.inner.data.team.inc_counters(1); - let val_chunks = val.into_vec_chunks(num_per_batch); let the_array: UnsafeArray = self.clone(); - self.inner - .data - .team - .scheduler - .submit_immediate_task2(async move { - // let mut buffs = vec![Vec::with_capacity(num_per_batch); num_pes]; - // let val_slice = val.as_slice(); - let mut inner_start_i = start_i; - let mut reqs: Vec)> + Send>>> = - Vec::new(); - // val.as_vec_chunks(num_per_batch) - val_chunks.into_iter().for_each(|val| { - let val_len = val.len(); - let am = MultiValSingleIndex::new_with_vec( - byte_array2.clone(), - op, - local_index, - val, - ) - .into_am::(ret); - let req = the_array - .inner - .data - .team - .exec_arc_am_pe::( - pe, - am, - Some(the_array.inner.data.array_counters.clone()), - ) - .into_future(); - // println!("start_i: {:?} inner_start_i {:?} val_len: {:?}",start_i,inner_start_i,val_len); - let res_buffer = - (inner_start_i..inner_start_i + val_len).collect::>(); - reqs.push(Box::pin(async move { (req.await, res_buffer) })); - inner_start_i += val_len; - }); - // println!("reqs len {:?}",reqs.len()); - futures2.lock().extend(reqs); - cnt2.fetch_add(1, Ordering::SeqCst); - the_array + let val_chunks = val.into_vec_chunks(num_per_batch); + scheduler.submit_immediate_task(async move { + let mut inner_start_i = start_i; + let mut reqs: Vec)> + Send>>> = + Vec::new(); + val_chunks.into_iter().for_each(|val| { + let val_len = val.len(); + let am = MultiValSingleIndex::new_with_vec( + byte_array2.clone(), + op, + local_index, + val, + ) + .into_am::(ret); + let req = the_array .inner .data - .array_counters - .outstanding_reqs - .fetch_sub(1, Ordering::SeqCst); - the_array.inner.data.team.dec_counters(1); + .team + .exec_arc_am_pe::( + pe, + am, + Some(the_array.inner.data.array_counters.clone()), + ) + .into_future(); + // println!("start_i: {:?} inner_start_i {:?} val_len: {:?}",start_i,inner_start_i,val_len); + let res_buffer = + (inner_start_i..inner_start_i + val_len).collect::>(); + reqs.push(Box::pin(async move { (req.await, res_buffer) })); + inner_start_i += val_len; }); + // println!("reqs len {:?}",reqs.len()); + futures2.lock().extend(reqs); + cnt2.fetch_add(1, Ordering::SeqCst); + the_array + .inner + .data + .array_counters + .outstanding_reqs + .fetch_sub(1, Ordering::SeqCst); + the_array.inner.data.team.dec_counters(1); + }); start_i += len; } + + // We need this loop so that we ensure all the internal AMs have launched so calls like wait_all work properly while cnt.load(Ordering::SeqCst) < num_reqs { self.inner.data.team.scheduler.exec_task(); - // async_std::task::yield_now().await; } // println!("futures len {:?}",futures.lock().len()); Box::pin(async move { @@ -639,7 +634,7 @@ impl UnsafeArray { .data .team .scheduler - .submit_immediate_task2(async move { + .submit_immediate_task(async move { // println!("in immediate task"); let mut buffs = vec![Vec::with_capacity(bytes_per_batch); num_pes]; let mut res_buffs = vec![Vec::with_capacity(num_per_batch); num_pes]; @@ -760,9 +755,9 @@ impl UnsafeArray { }); start_i += len; } + // We need this loop so that we ensure all the internal AMs have launched so calls like wait_all work properly while cnt.load(Ordering::SeqCst) < num_reqs { self.inner.data.team.scheduler.exec_task(); - // async_std::task::yield_now().await; } // println!("futures len {:?}", futures.lock().len()); 
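            // At this point every batching task submitted above has run (cnt has reached
            // num_reqs), so all of the internal AMs have been launched before control is
            // handed back; the async block below can then simply await the per-PE request
            // futures that those tasks collected in `futures`.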
Box::pin(async move { diff --git a/src/barrier.rs b/src/barrier.rs index 185ad304..1ee005fc 100644 --- a/src/barrier.rs +++ b/src/barrier.rs @@ -1,10 +1,7 @@ use crate::lamellae::{AllocationType, Lamellae, LamellaeRDMA}; use crate::lamellar_arch::LamellarArchRT; -use crate::scheduler::SchedulerQueue; -// use crate::lamellar_memregion::{SharedMemoryRegion,RegisteredMemoryRegion}; -use crate::memregion::MemoryRegion; //, RTMemoryRegionRDMA, RegisteredMemoryRegion}; +use crate::memregion::MemoryRegion; use crate::scheduler::Scheduler; -// use rand::prelude::SliceRandom; use std::sync::atomic::{AtomicU8, AtomicUsize, Ordering}; use std::sync::Arc; use std::time::Instant; @@ -17,7 +14,7 @@ pub(crate) struct Barrier { n: usize, // dissemination factor num_rounds: usize, pub(crate) arch: Arc, - pub(crate) _scheduler: Arc, + pub(crate) scheduler: Arc, lamellae: Arc, barrier_cnt: AtomicUsize, barrier_buf: Vec>, @@ -85,17 +82,17 @@ impl Barrier { }; let bar = Barrier { - my_pe: my_pe, - num_pes: num_pes, - n: n, - num_rounds: num_rounds, - arch: arch, - _scheduler: scheduler, - lamellae: lamellae, + my_pe, + num_pes, + n, + num_rounds, + arch, + scheduler, + lamellae, barrier_cnt: AtomicUsize::new(1), barrier_buf: buffs, - send_buf: send_buf, - panic: panic, + send_buf, + panic, }; // bar.print_bar(); bar @@ -274,7 +271,7 @@ impl Barrier { if std::thread::current().id() == *crate::MAIN_THREAD { self.barrier_internal(|| { // std::thread::yield_now(); - self._scheduler.exec_task(); + self.scheduler.exec_task(); }); } else { if let Ok(val) = std::env::var("LAMELLAR_BARRIER_WARNING") { @@ -293,7 +290,7 @@ impl Barrier { // we actually want to be able to process other tasks while the barrier is active pub(crate) fn tasking_barrier(&self) { self.barrier_internal(|| { - self._scheduler.exec_task(); + self.scheduler.exec_task(); }); } diff --git a/src/darc.rs b/src/darc.rs index 8ecbc5cb..1b7cb4b7 100644 --- a/src/darc.rs +++ b/src/darc.rs @@ -64,7 +64,6 @@ use crate::barrier::Barrier; use crate::lamellae::{AllocationType, Backend, LamellaeComm, LamellaeRDMA}; use crate::lamellar_team::{IntoLamellarTeam, LamellarTeamRT}; use crate::lamellar_world::LAMELLAES; -// use crate::scheduler::SchedulerQueue; use crate::{IdError, LamellarEnv, LamellarTeam}; #[doc(hidden)] @@ -137,8 +136,8 @@ pub struct DarcInner { drop: Option, valid: AtomicBool, } -unsafe impl Send for DarcInner {} -unsafe impl Sync for DarcInner {} +unsafe impl Send for DarcInner {} //we cant create DarcInners without going through the Darc interface which enforces Sync+Send +unsafe impl Sync for DarcInner {} //we cant create DarcInners without going through the Darc interface which enforces Sync+Send /// Distributed atomic reference counter /// @@ -192,8 +191,8 @@ pub struct Darc { inner: *mut DarcInner, src_pe: usize, } -unsafe impl Send for Darc {} -unsafe impl Sync for Darc {} +unsafe impl Send for Darc {} +unsafe impl Sync for Darc {} impl LamellarEnv for Darc { fn my_pe(&self) -> usize { @@ -956,15 +955,11 @@ impl Darc { Ok(d) } - pub(crate) async fn block_on_outstanding(&self, state: DarcMode, extra_cnt: usize) { - DarcInner::block_on_outstanding( - WrappedInner { - inner: NonNull::new(self.inner as *mut DarcInner).expect("invalid darc pointer"), - }, - state, - extra_cnt, - ) - .await; + pub(crate) async fn block_on_outstanding(self, state: DarcMode, extra_cnt: usize) { + let wrapped = WrappedInner { + inner: NonNull::new(self.inner as *mut DarcInner).expect("invalid darc pointer"), + }; + DarcInner::block_on_outstanding(wrapped, state, 
extra_cnt).await; } #[doc(alias = "Collective")] @@ -1000,9 +995,10 @@ impl Darc { inner.local_cnt.fetch_add(1, Ordering::SeqCst); //we add this here because to account for moving inner into d inner.total_local_cnt.fetch_add(1, Ordering::SeqCst); // println! {"[{:?}] darc[{:?}] into_localrw {:?} {:?} {:?}",std::thread::current().id(),self.inner().id,self.inner,self.inner().local_cnt.load(Ordering::SeqCst),self.inner().total_local_cnt.load(Ordering::SeqCst)}; - let item = unsafe { Box::from_raw(inner.item as *mut T) }; + let item = unsafe { *Box::from_raw(inner.item as *mut T) }; + let d = Darc { - inner: self.inner as *mut DarcInner>>>, + inner: self.inner as *mut DarcInner>>, src_pe: self.src_pe, }; d.inner_mut() diff --git a/src/darc/global_rw_darc.rs b/src/darc/global_rw_darc.rs index 2bda5a9b..cbb5cbaa 100644 --- a/src/darc/global_rw_darc.rs +++ b/src/darc/global_rw_darc.rs @@ -423,8 +423,8 @@ pub struct GlobalRwDarc { pub(crate) darc: Darc>, } -unsafe impl Send for GlobalRwDarc {} -unsafe impl Sync for GlobalRwDarc {} +unsafe impl Send for GlobalRwDarc {} //protected internally by rwlock +unsafe impl Sync for GlobalRwDarc {} //protected internally by rwlock impl LamellarEnv for GlobalRwDarc { fn my_pe(&self) -> usize { @@ -547,13 +547,15 @@ impl GlobalRwDarc { /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// - /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); - /// world.exec_am_all(DarcAm {counter: counter.clone()}); - /// let guard = world.block_on(counter.read()); - /// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); - /// drop(guard); //release the - /// world.wait_all(); // wait for my active message to return - /// world.barrier(); //at this point all updates will have been performed + /// world.clone().block_on(async move { + /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); + /// world.exec_am_all(DarcAm {counter: counter.clone()}); + /// let guard = counter.read().await; + /// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); + /// drop(guard); //release the + /// world.wait_all(); // wait for my active message to return + /// world.barrier(); //at this point all updates will have been performed + /// }); ///``` pub async fn read(&self) -> GlobalRwDarcReadGuard { // println!("async read"); @@ -619,13 +621,15 @@ impl GlobalRwDarc { /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// - /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); - /// world.exec_am_all(DarcAm {counter: counter.clone()}); - /// let mut guard = world.block_on(counter.write()); - /// *guard += my_pe; - /// drop(guard); //release the - /// world.wait_all(); // wait for my active message to return - /// world.barrier(); //at this point all updates will have been performed + /// world.clone().block_on(async move { + /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); + /// world.exec_am_all(DarcAm {counter: counter.clone()}); + /// let mut guard = counter.write().await; + /// *guard += my_pe; + /// drop(guard); //release the + /// world.wait_all(); // wait for my active message to return + /// world.barrier(); //at this point all updates will have been performed + /// }); ///``` pub async fn write(&self) -> GlobalRwDarcWriteGuard { // println!("async write"); @@ -688,13 +692,15 @@ impl GlobalRwDarc { /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// - /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); - /// 
world.exec_am_all(DarcAm {counter: counter.clone()}); - /// let mut guard = world.block_on(counter.collective_write()); - /// *guard += my_pe; - /// drop(guard); //release the lock - /// world.wait_all(); // wait for my active message to return - /// world.barrier(); //at this point all updates will have been performed + /// world.clone().block_on(async move { + /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); + /// world.exec_am_all(DarcAm {counter: counter.clone()}); + /// let mut guard = counter.collective_write().await; + /// *guard += my_pe; + /// drop(guard); //release the lock + /// world.wait_all(); // wait for my active message to return + /// world.barrier(); //at this point all updates will have been performed + /// }); ///``` pub async fn collective_write(&self) -> GlobalRwDarcCollectiveWriteGuard { // println!("async write"); @@ -723,182 +729,183 @@ impl GlobalRwDarc { } } - // #[doc(alias("One-sided", "onesided"))] - // /// Launches an active message to gather a global read lock associated with this GlobalRwDarc. - // /// - // /// The current THREAD will be blocked until the lock has been acquired. - // /// - // /// This function will not return while any writer currently has access to the lock, but there may be other readers - // /// - // /// Returns ared this specific instance of the read lock will only be held by the calling PE (until it is dropped) - // /// Other PEs may have separately aquired read locks as well. - // /// - // /// - // /// # Noten RAII guard which will drop the read access of the wrlock when dropped - // /// - // /// # One-sided Operation - // /// The calling PE is responsible for creating and transfering the active message which aquires the lock. - // /// Once aqui - // /// Do not use this function in an asynchronous context (i.e. a Lamellar Active message), instead use [GlobalRwDarc::async_read] - // /// - // /// # Examples - // ///``` - // /// use lamellar::darc::prelude::*; - // /// - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// - // /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); - // /// // do interesting work - // /// let guard = counter.read(); //blocks current thread until aquired - // /// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); - // ///``` - // pub fn read(&self) -> GlobalRwDarcReadGuard { - // // println!("read"); - // let inner = self.inner(); - // let team = inner.team(); - // let remote_rwlock_addr = team.lamellae.remote_addr( - // 0, - // inner as *const DarcInner> as *const () as usize, - // ); - // team.exec_am_pe_tg( - // 0, - // LockAm { - // rwlock_addr: remote_rwlock_addr, - // orig_pe: team.team_pe.expect("darcs cant exist on non team members"), - // lock_type: LockType::Read, - // }, - // Some(inner.am_counters()), - // ) - // .get(); - // GlobalRwDarcReadGuard { - // rwlock: self.darc.clone(), - // marker: PhantomData, - // local_cnt: Arc::new(AtomicUsize::new(1)), - // } - // } + #[doc(alias("One-sided", "onesided"))] + /// Launches an active message to gather a global read lock associated with this GlobalRwDarc. + /// + /// The current THREAD will be blocked until the lock has been acquired. + /// + /// This function will not return while any writer currently has access to the lock, but there may be other readers + /// + /// Returns ared this specific instance of the read lock will only be held by the calling PE (until it is dropped) + /// Other PEs may have separately aquired read locks as well. 
+ /// + /// + /// # Noten RAII guard which will drop the read access of the wrlock when dropped + /// + /// # One-sided Operation + /// The calling PE is responsible for creating and transfering the active message which aquires the lock. + /// Once aqui + /// Do not use this function in an asynchronous context (i.e. a Lamellar Active message), instead use [GlobalRwDarc::async_read] + /// + /// # Examples + ///``` + /// use lamellar::darc::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// + /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); + /// // do interesting work + /// let guard = counter.blocking_read(); //blocks current thread until aquired + /// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); + ///``` + pub fn blocking_read(&self) -> GlobalRwDarcReadGuard { + // println!("read"); - // #[doc(alias("One-sided", "onesided"))] - // /// Launches an active message to gather a global write lock associated with this GlobalRwDarc. - // /// - // /// The current THREAD will be blocked until the lock has been acquired. - // /// - // /// This function will not return while another writer or any readers currently have access to the lock - // /// - // /// Returns an RAII guard which will drop the write access of the wrlock when dropped - // /// - // /// # One-sided Operation - // /// The calling PE is responsible for creating and transfering the active message which aquires the lock. - // /// Once aquired the lock will only be held by the calling PE (until it is dropped) - // /// - // /// # Note - // /// Do not use this function in an asynchronous context (i.e. a Lamellar Active message), instead use [GlobalRwDarc::async_write] - // /// - // /// # Examples - // ///``` - // /// use lamellar::darc::prelude::*; - // /// - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// - // /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); - // /// // do interesting work - // /// let mut guard = counter.write(); //blocks current thread until aquired - // /// *guard += my_pe; - // ///``` - // pub fn write(&self) -> GlobalRwDarcWriteGuard { - // // println!("write"); - // let inner = self.inner(); - // let team = inner.team(); - // let remote_rwlock_addr = team.lamellae.remote_addr( - // 0, - // inner as *const DarcInner> as *const () as usize, - // ); - // team.exec_am_pe_tg( - // 0, - // LockAm { - // rwlock_addr: remote_rwlock_addr, - // orig_pe: team.team_pe.expect("darcs cant exist on non team members"), - // lock_type: LockType::Write, - // }, - // Some(inner.am_counters()), - // ) - // .get(); - // GlobalRwDarcWriteGuard { - // rwlock: self.darc.clone(), - // marker: PhantomData, - // } - // // inner.item().write(remote_rwlock_addr) - // } + let inner = self.inner(); + let team = inner.team(); + let remote_rwlock_addr = team.lamellae.remote_addr( + 0, + inner as *const DarcInner> as *const () as usize, + ); + team.exec_am_pe_tg( + 0, + LockAm { + rwlock_addr: remote_rwlock_addr, + orig_pe: team.team_pe.expect("darcs cant exist on non team members"), + lock_type: LockType::Read, + }, + Some(inner.am_counters()), + ) + .get(); + GlobalRwDarcReadGuard { + rwlock: self.darc.clone(), + marker: PhantomData, + local_cnt: Arc::new(AtomicUsize::new(1)), + } + } - // #[doc(alias("Collective"))] - // /// Launches an active message to gather the global collective write lock associated with this GlobalRwDarc. 
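The `GlobalRwDarc` lock API now follows the same convention: `blocking_*` methods for synchronous callers and awaitable methods for asynchronous ones (the doc examples clone `world` before `block_on` so the `async move` block can take ownership of a handle). A minimal sketch, assuming a `GlobalRwDarc<usize>` named `counter` and a `world` handle as in the doc examples:

```rust
// Main thread (no async context): the blocking variants issue the lock AM and wait.
let guard = counter.blocking_read();
println!("counter = {}", *guard);
drop(guard); // release the global read lock

let mut guard = counter.blocking_write();
*guard += 1;
drop(guard); // release the global write lock

// Inside an active message or a block_on closure: await the async variants instead.
world.clone().block_on(async move {
    let mut guard = counter.write().await;
    *guard += 1;
});
```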
- // /// - // /// The current task will be blocked until the lock has been acquired. - // /// - // /// This function will not return while another writer or any readers currently have access to the lock - // /// - // /// Returns an RAII guard which will drop the write access of the wrlock when dropped - // /// - // /// # Collective Operation - // /// All PEs associated with this GlobalRwDarc must enter the lock call otherwise deadlock may occur. - // /// - // /// # Examples - // /// - // ///``` - // /// use lamellar::darc::prelude::*; - // /// use lamellar::active_messaging::*; - // /// - // /// #[lamellar::AmData(Clone)] - // /// struct DarcAm { - // /// counter: GlobalRwDarc, //each pe has a local atomicusize - // /// } - // /// - // /// #[lamellar::am] - // /// impl LamellarAm for DarcAm { - // /// async fn exec(self) { - // /// let mut counter = self.counter.async_write().await; // await until we get the write lock - // /// *counter += 1; // although we have the global lock, we are still only modifying the data local to this PE - // /// } - // /// } - // /// //------------- - // /// - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// - // /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); - // /// world.exec_am_all(DarcAm {counter: counter.clone()}); - // /// let mut guard = world.block_on(counter.collective_write()); - // /// *guard += my_pe; - // /// drop(guard); //release the lock - // /// world.wait_all(); // wait for my active message to return - // /// world.barrier(); //at this point all updates will have been performed - // ///``` - // pub fn collective_write(&self) -> GlobalRwDarcCollectiveWriteGuard { - // // println!("async write"); - // let inner = self.inner(); - // let team = inner.team(); - // let remote_rwlock_addr = team.lamellae.remote_addr( - // 0, - // inner as *const DarcInner> as *const () as usize, - // ); - // let collective_cnt = inner.item().collective_cnt.fetch_add(1, Ordering::SeqCst); - // team.exec_am_pe_tg( - // 0, - // LockAm { - // rwlock_addr: remote_rwlock_addr, - // orig_pe: team.team_pe.expect("darcs cant exist on non team members"), - // lock_type: LockType::CollectiveWrite(collective_cnt), - // }, - // Some(inner.am_counters()), - // ) - // .get(); - // GlobalRwDarcCollectiveWriteGuard { - // rwlock: self.darc.clone(), - // collective_cnt: collective_cnt, - // marker: PhantomData, - // } - // } + #[doc(alias("One-sided", "onesided"))] + /// Launches an active message to gather a global write lock associated with this GlobalRwDarc. + /// + /// The current THREAD will be blocked until the lock has been acquired. + /// + /// This function will not return while another writer or any readers currently have access to the lock + /// + /// Returns an RAII guard which will drop the write access of the wrlock when dropped + /// + /// # One-sided Operation + /// The calling PE is responsible for creating and transfering the active message which aquires the lock. + /// Once aquired the lock will only be held by the calling PE (until it is dropped) + /// + /// # Note + /// Do not use this function in an asynchronous context (i.e. 
a Lamellar Active message), instead use [GlobalRwDarc::async_write] + /// + /// # Examples + ///``` + /// use lamellar::darc::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// + /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); + /// // do interesting work + /// let mut guard = counter.blocking_write(); //blocks current thread until aquired + /// *guard += my_pe; + ///``` + pub fn blocking_write(&self) -> GlobalRwDarcWriteGuard { + // println!("write"); + let inner = self.inner(); + let team = inner.team(); + let remote_rwlock_addr = team.lamellae.remote_addr( + 0, + inner as *const DarcInner> as *const () as usize, + ); + team.exec_am_pe_tg( + 0, + LockAm { + rwlock_addr: remote_rwlock_addr, + orig_pe: team.team_pe.expect("darcs cant exist on non team members"), + lock_type: LockType::Write, + }, + Some(inner.am_counters()), + ) + .get(); + GlobalRwDarcWriteGuard { + rwlock: self.darc.clone(), + marker: PhantomData, + } + // inner.item().write(remote_rwlock_addr) + } + + #[doc(alias("Collective"))] + /// Launches an active message to gather the global collective write lock associated with this GlobalRwDarc. + /// + /// The current task will be blocked until the lock has been acquired. + /// + /// This function will not return while another writer or any readers currently have access to the lock + /// + /// Returns an RAII guard which will drop the write access of the wrlock when dropped + /// + /// # Collective Operation + /// All PEs associated with this GlobalRwDarc must enter the lock call otherwise deadlock may occur. + /// + /// # Examples + /// + ///``` + /// use lamellar::darc::prelude::*; + /// use lamellar::active_messaging::*; + /// + /// #[lamellar::AmData(Clone)] + /// struct DarcAm { + /// counter: GlobalRwDarc, //each pe has a local atomicusize + /// } + /// + /// #[lamellar::am] + /// impl LamellarAm for DarcAm { + /// async fn exec(self) { + /// let mut counter = self.counter.async_write().await; // await until we get the write lock + /// *counter += 1; // although we have the global lock, we are still only modifying the data local to this PE + /// } + /// } + /// //------------- + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// + /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); + /// world.exec_am_all(DarcAm {counter: counter.clone()}); + /// let mut guard = counter.blocking_collective_write(); + /// *guard += my_pe; + /// drop(guard); //release the lock + /// world.wait_all(); // wait for my active message to return + /// world.barrier(); //at this point all updates will have been performed + ///``` + pub fn blocking_collective_write(&self) -> GlobalRwDarcCollectiveWriteGuard { + // println!("async write"); + let inner = self.inner(); + let team = inner.team(); + let remote_rwlock_addr = team.lamellae.remote_addr( + 0, + inner as *const DarcInner> as *const () as usize, + ); + let collective_cnt = inner.item().collective_cnt.fetch_add(1, Ordering::SeqCst); + team.exec_am_pe_tg( + 0, + LockAm { + rwlock_addr: remote_rwlock_addr, + orig_pe: team.team_pe.expect("darcs cant exist on non team members"), + lock_type: LockType::CollectiveWrite(collective_cnt), + }, + Some(inner.am_counters()), + ) + .get(); + GlobalRwDarcCollectiveWriteGuard { + rwlock: self.darc.clone(), + collective_cnt: collective_cnt, + marker: PhantomData, + } + } } impl GlobalRwDarc { @@ -1025,14 +1032,12 @@ impl GlobalRwDarc { inner.local_cnt.fetch_add(1, Ordering::SeqCst); //we 
add this here because to account for moving inner into d let item = unsafe { Box::from_raw(inner.item as *mut DistRwLock).into_inner() }; let d = Darc { - inner: self.darc.inner as *mut DarcInner>>>, + inner: self.darc.inner as *mut DarcInner>>, src_pe: self.darc.src_pe, // phantom: PhantomData, }; d.inner_mut() - .update_item(Box::into_raw(Box::new(Arc::new(RwLock::new(Box::new( - item, - )))))); + .update_item(Box::into_raw(Box::new(Arc::new(RwLock::new(item))))); LocalRwDarc { darc: d } } } diff --git a/src/darc/local_rw_darc.rs b/src/darc/local_rw_darc.rs index 26557efb..f6b4c9e3 100644 --- a/src/darc/local_rw_darc.rs +++ b/src/darc/local_rw_darc.rs @@ -14,7 +14,6 @@ use crate::darc::global_rw_darc::{DistRwLock, GlobalRwDarc}; use crate::darc::{Darc, DarcInner, DarcMode, WrappedInner, __NetworkDarc}; use crate::lamellae::LamellaeRDMA; use crate::lamellar_team::IntoLamellarTeam; -use crate::scheduler::SchedulerQueue; use crate::{IdError, LamellarEnv, LamellarTeam}; /// A local read-write `Darc` @@ -34,11 +33,11 @@ pub struct LocalRwDarc { serialize_with = "localrw_serialize2", deserialize_with = "localrw_from_ndarc2" )] - pub(crate) darc: Darc>>>, //we need to wrap WrLock in an Arc so we get access to ArcReadGuard and ArcWriteGuard + pub(crate) darc: Darc>>, //we need to wrap WrLock in an Arc so we get access to ArcReadGuard and ArcWriteGuard } -unsafe impl Send for LocalRwDarc {} -unsafe impl Sync for LocalRwDarc {} +unsafe impl Send for LocalRwDarc {} //we are protecting internally with an WrLock +unsafe impl Sync for LocalRwDarc {} //we are protecting internally with an WrLock impl LamellarEnv for LocalRwDarc { fn my_pe(&self) -> usize { @@ -84,7 +83,7 @@ impl crate::active_messaging::DarcSerde for LocalRwDarc { } impl LocalRwDarc { - fn inner(&self) -> &DarcInner>>> { + fn inner(&self) -> &DarcInner>> { self.darc.inner() } @@ -123,67 +122,10 @@ impl LocalRwDarc { self.inner() ); } +} - // #[doc(alias("One-sided", "onesided"))] - // /// Aquires a reader lock of this LocalRwDarc local to this PE. - // /// - // /// The current THREAD will be blocked until the lock has been acquired. 
- // /// - // /// This function will not return while any writer currentl has access to the lock - // /// - // /// Returns an RAII guard which will drop the read access of the wrlock when dropped - // /// - // /// # One-sided Operation - // /// The calling PE is only aware of its own local lock and does not require coordination with other PEs - // /// - // /// # Note - // /// the aquired lock is only with respect to this PE, the locks on the other PEs will be in their own states - // /// - // /// # Examples - // /// - // ///``` - // /// use lamellar::darc::prelude::*; - // /// use lamellar::active_messaging::prelude::*; - // /// #[lamellar::AmData(Clone)] - // /// struct DarcAm { - // /// counter: LocalRwDarc, //each pe has a local atomicusize - // /// } - // /// - // /// #[lamellar::am] - // /// impl LamellarAm for DarcAm { - // /// async fn exec(self) { - // /// let counter = self.counter.read(); //block until we get the write lock - // /// println!("the current counter value on pe {} = {}",lamellar::current_pe,counter); - // /// } - // /// } - // /// //------------- - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let counter = LocalRwDarc::new(&world, 0).unwrap(); - // /// world.exec_am_all(DarcAm {counter: counter.clone()}); - // /// let guard = counter.read(); - // /// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); - // ///``` - // pub fn read(&self) -> RwLockReadGuardArc> { - // // println!("trying to get read lock"); - // match self.darc.try_read_arc() { - // Some(guard) => { - // // println!("got read lock"); - // guard - // } - // None => { - // // println!("did not get read lock"); - // let _lock_fut = self.darc.read_arc(); - // self.darc.team().scheduler.block_on(async move { - // // println!("async trying to get read lock"); - // _lock_fut.await - // }) - // } - // } - // } - +impl LocalRwDarc { #[doc(alias("One-sided", "onesided"))] - /// TODO: UPDATE /// Aquires a reader lock of this LocalRwDarc local to this PE. /// /// The current THREAD will be blocked until the lock has been acquired. @@ -211,7 +153,7 @@ impl LocalRwDarc { /// #[lamellar::am] /// impl LamellarAm for DarcAm { /// async fn exec(self) { - /// let counter = self.counter.read().await; //block until we get the write lock + /// let counter = self.counter.read(); //block until we get the write lock /// println!("the current counter value on pe {} = {}",lamellar::current_pe,counter); /// } /// } @@ -220,76 +162,67 @@ impl LocalRwDarc { /// let my_pe = world.my_pe(); /// let counter = LocalRwDarc::new(&world, 0).unwrap(); /// world.exec_am_all(DarcAm {counter: counter.clone()}); - /// let guard = world.block_on(counter.read()); + /// let guard = counter.blocking_read(); /// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); ///``` - pub async fn read(&self) -> RwLockReadGuardArc> { + pub fn blocking_read(&self) -> RwLockReadGuardArc { + let self_clone: LocalRwDarc = self.clone(); + self.darc + .team() + .block_on(async move { self_clone.darc.read_arc().await }) + } + + #[doc(alias("One-sided", "onesided"))] + /// TODO: UPDATE + /// Aquires a reader lock of this LocalRwDarc local to this PE. + /// + /// The current THREAD will be blocked until the lock has been acquired. 
+ /// + /// This function will not return while any writer currentl has access to the lock + /// + /// Returns an RAII guard which will drop the read access of the wrlock when dropped + /// + /// # One-sided Operation + /// The calling PE is only aware of its own local lock and does not require coordination with other PEs + /// + /// # Note + /// the aquired lock is only with respect to this PE, the locks on the other PEs will be in their own states + /// + /// # Examples + /// + ///``` + /// use lamellar::darc::prelude::*; + /// use lamellar::active_messaging::prelude::*; + /// #[lamellar::AmData(Clone)] + /// struct DarcAm { + /// counter: LocalRwDarc, //each pe has a local atomicusize + /// } + /// + /// #[lamellar::am] + /// impl LamellarAm for DarcAm { + /// async fn exec(self) { + /// let counter = self.counter.read().await; //block until we get the write lock + /// println!("the current counter value on pe {} = {}",lamellar::current_pe,counter); + /// } + /// } + /// //------------- + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// world.clone().block_on(async move { + /// let counter = LocalRwDarc::new(&world, 0).unwrap(); + /// world.exec_am_all(DarcAm {counter: counter.clone()}); + /// let guard = counter.read().await; + /// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); + /// }); + ///``` + pub async fn read(&self) -> RwLockReadGuardArc { // println!("async trying to get read lock"); let lock = self.darc.read_arc().await; // println!("got async read lock"); lock } - // #[doc(alias("One-sided", "onesided"))] - // /// Aquires the writer lock of this LocalRwDarc local to this PE. - // /// - // /// The current THREAD will be blocked until the lock has been acquired. - // /// - // /// This function will not return while another writer or any readers currently have access to the lock - // /// - // /// Returns an RAII guard which will drop the write access of the wrlock when dropped - // /// - // /// # One-sided Operation - // /// The calling PE is only aware of its own local lock and does not require coordination with other PEs - // /// - // /// # Note - // /// the aquired lock is only with respect to this PE, the locks on the other PEs will be in their own states - // /// - // /// # Examples - // /// - // ///``` - // /// use lamellar::darc::prelude::*; - // /// use lamellar::active_messaging::prelude::*; - // /// #[lamellar::AmData(Clone)] - // /// struct DarcAm { - // /// counter: LocalRwDarc, //each pe has a local atomicusize - // /// } - // /// - // /// #[lamellar::am] - // /// impl LamellarAm for DarcAm { - // /// async fn exec(self) { - // /// let mut counter = self.counter.write(); //block until we get the write lock - // /// **counter += 1; - // /// } - // /// } - // /// //------------- - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let counter = LocalRwDarc::new(&world, 0).unwrap(); - // /// world.exec_am_all(DarcAm {counter: counter.clone()}); - // /// let mut guard = counter.write(); - // /// **guard += my_pe; - // ///``` - // pub fn write(&self) -> RwLockWriteGuardArc> { - // // println!("trying to get write lock"); - // match self.darc.try_write_arc() { - // Some(guard) => { - // // println!("got write lock"); - // guard - // } - // None => { - // // println!("did not get write lock"); - // let lock_fut = self.darc.write_arc(); - // self.darc.team().scheduler.block_on(async move { - // // println!("async trying to get write lock"); - // 
lock_fut.await - // }) - // } - // } - // } - #[doc(alias("One-sided", "onesided"))] - /// TODO: UPDATE /// Aquires the writer lock of this LocalRwDarc local to this PE. /// /// The current THREAD will be blocked until the lock has been acquired. @@ -317,7 +250,7 @@ impl LocalRwDarc { /// #[lamellar::am] /// impl LamellarAm for DarcAm { /// async fn exec(self) { - /// let mut counter = self.counter.write().await; //block until we get the write lock + /// let mut counter = self.counter.write(); //block until we get the write lock /// **counter += 1; /// } /// } @@ -326,10 +259,61 @@ impl LocalRwDarc { /// let my_pe = world.my_pe(); /// let counter = LocalRwDarc::new(&world, 0).unwrap(); /// world.exec_am_all(DarcAm {counter: counter.clone()}); - /// let mut guard = world.block_on(counter.write()); + /// let mut guard = counter.blocking_write(); /// **guard += my_pe; ///``` - pub async fn write(&self) -> RwLockWriteGuardArc> { + pub fn blocking_write(&self) -> RwLockWriteGuardArc { + // println!("trying to get write lock"); + let self_clone: LocalRwDarc = self.clone(); + self.darc + .team() + .block_on(async move { self_clone.darc.write_arc().await }) + } + + #[doc(alias("One-sided", "onesided"))] + /// + /// Aquires the writer lock of this LocalRwDarc local to this PE. + /// + /// The current THREAD will be blocked until the lock has been acquired. + /// + /// This function will not return while another writer or any readers currently have access to the lock + /// + /// Returns an RAII guard which will drop the write access of the wrlock when dropped + /// + /// # One-sided Operation + /// The calling PE is only aware of its own local lock and does not require coordination with other PEs + /// + /// # Note + /// the aquired lock is only with respect to this PE, the locks on the other PEs will be in their own states + /// + /// # Examples + /// + ///``` + /// use lamellar::darc::prelude::*; + /// use lamellar::active_messaging::prelude::*; + /// #[lamellar::AmData(Clone)] + /// struct DarcAm { + /// counter: LocalRwDarc, //each pe has a local atomicusize + /// } + /// + /// #[lamellar::am] + /// impl LamellarAm for DarcAm { + /// async fn exec(self) { + /// let mut counter = self.counter.write().await; //block until we get the write lock + /// **counter += 1; + /// } + /// } + /// //------------- + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// world.clone()block_on(async move{ + /// let counter = LocalRwDarc::new(&world, 0).unwrap(); + /// world.exec_am_all(DarcAm {counter: counter.clone()}); + /// let mut guard = counter.write(); + /// **guard += my_pe; + /// }) + ///``` + pub async fn write(&self) -> RwLockWriteGuardArc { // println!("async trying to get write lock"); let lock = self.darc.write_arc().await; // println!("got async write lock"); @@ -359,11 +343,7 @@ impl LocalRwDarc { /// ``` pub fn new>(team: U, item: T) -> Result, IdError> { Ok(LocalRwDarc { - darc: Darc::try_new( - team, - Arc::new(RwLock::new(Box::new(item))), - DarcMode::LocalRw, - )?, + darc: Darc::try_new(team, Arc::new(RwLock::new(item)), DarcMode::LocalRw)?, }) } @@ -378,12 +358,12 @@ impl LocalRwDarc { // } #[doc(alias = "Collective")] - /// Converts this LocalRwDarc into a regular [Darc] + /// Converts this LocalRwDarc into a [GlobalRwDarc] /// /// This is a blocking collective call amongst all PEs in the LocalRwDarc's team, only returning once every PE in the team has completed the call. 
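
As an aside, the split introduced above between the synchronous `blocking_*` methods and the awaitable `read`/`write` methods on `LocalRwDarc` can be summarized with a minimal sketch. This is illustrative only, assuming the API exactly as declared in this patch, and assuming the guards now deref directly to `T` (the lock stores `T` rather than `Box<T>` after this change).

```rust
use lamellar::darc::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();
    let my_pe = world.my_pe();
    let counter = LocalRwDarc::new(&world, 0usize).expect("PE in world team");

    // On a non-async path (e.g. the main thread), the blocking variants internally
    // run `block_on` on the team's executor before returning the guard.
    {
        let mut guard = counter.blocking_write();
        // assumption: single deref, since the stored item is now `T` instead of `Box<T>`
        *guard += my_pe;
    } // write lock released when the guard drops

    // Inside an async context (an active message body, or a future handed to
    // `block_on`), prefer the awaitable variants so a worker thread is not blocked.
    let counter2 = counter.clone();
    world.clone().block_on(async move {
        let guard = counter2.read().await;
        println!("counter on PE {my_pe} = {}", *guard);
    });
}
```
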
/// /// Furthermore, this call will block while any additional references outside of the one making this call exist on each PE. It is not possible for the - /// pointed to object to wrapped by both a Darc and a LocalRwDarc simultaneously (on any PE). + /// pointed to object to wrapped by both a GlobalRwDarc and a LocalRwDarc simultaneously (on any PE). /// /// # Collective Operation /// Requires all PEs associated with the `darc` to enter the call otherwise deadlock will occur (i.e. team barriers are being called internally) @@ -395,9 +375,9 @@ impl LocalRwDarc { /// let world = LamellarWorldBuilder::new().build(); /// /// let five = LocalRwDarc::new(&world,5).expect("PE in world team"); - /// let five_as_darc = five.into_darc(); + /// let five_as_globaldarc = five.into_globalrw(); /// ``` - pub fn into_darc(self) -> Darc { + pub fn into_globalrw(self) -> GlobalRwDarc { let inner = self.inner(); // println!("into_darc"); // self.print(); @@ -406,37 +386,40 @@ impl LocalRwDarc { inner: NonNull::new(self.darc.inner as *mut DarcInner) .expect("invalid darc pointer"), }, - DarcMode::Darc, + DarcMode::GlobalRw, 0, )); // println!("after block on outstanding"); inner.local_cnt.fetch_add(1, Ordering::SeqCst); //we add this here because to account for moving inner into d - // let item = unsafe { Box::from_raw(inner.item as *mut Arc>).into_inner() }; - let mut arc_item = - unsafe { (*Box::from_raw(inner.item as *mut Arc>>)).clone() }; - - let item: Box = loop { + let mut arc_item = unsafe { (*Box::from_raw(inner.item as *mut Arc>)).clone() }; + let item: T = loop { arc_item = match Arc::try_unwrap(arc_item) { Ok(item) => break item.into_inner(), Err(arc_item) => arc_item, }; }; let d = Darc { - inner: self.darc.inner as *mut DarcInner, + inner: self.darc.inner as *mut DarcInner>, src_pe: self.darc.src_pe, // phantom: PhantomData, }; - d.inner_mut().update_item(Box::into_raw(item)); - d + d.inner_mut() + .update_item(Box::into_raw(Box::new(DistRwLock::new( + item, + self.inner().team(), + )))); + GlobalRwDarc { darc: d } } +} +impl LocalRwDarc { #[doc(alias = "Collective")] - /// Converts this LocalRwDarc into a [GlobalRwDarc] + /// Converts this LocalRwDarc into a regular [Darc] /// /// This is a blocking collective call amongst all PEs in the LocalRwDarc's team, only returning once every PE in the team has completed the call. /// /// Furthermore, this call will block while any additional references outside of the one making this call exist on each PE. It is not possible for the - /// pointed to object to wrapped by both a GlobalRwDarc and a LocalRwDarc simultaneously (on any PE). + /// pointed to object to wrapped by both a Darc and a LocalRwDarc simultaneously (on any PE). /// /// # Collective Operation /// Requires all PEs associated with the `darc` to enter the call otherwise deadlock will occur (i.e. 
team barriers are being called internally) @@ -448,9 +431,9 @@ impl LocalRwDarc { /// let world = LamellarWorldBuilder::new().build(); /// /// let five = LocalRwDarc::new(&world,5).expect("PE in world team"); - /// let five_as_globaldarc = five.into_globalrw(); + /// let five_as_darc = five.into_darc(); /// ``` - pub fn into_globalrw(self) -> GlobalRwDarc { + pub fn into_darc(self) -> Darc { let inner = self.inner(); // println!("into_darc"); // self.print(); @@ -459,30 +442,27 @@ impl LocalRwDarc { inner: NonNull::new(self.darc.inner as *mut DarcInner) .expect("invalid darc pointer"), }, - DarcMode::GlobalRw, + DarcMode::Darc, 0, )); // println!("after block on outstanding"); inner.local_cnt.fetch_add(1, Ordering::SeqCst); //we add this here because to account for moving inner into d - let mut arc_item = - unsafe { (*Box::from_raw(inner.item as *mut Arc>>)).clone() }; - let item: Box = loop { + // let item = unsafe { Box::from_raw(inner.item as *mut Arc>).into_inner() }; + let mut arc_item = unsafe { (*Box::from_raw(inner.item as *mut Arc>)).clone() }; + + let item: T = loop { arc_item = match Arc::try_unwrap(arc_item) { Ok(item) => break item.into_inner(), Err(arc_item) => arc_item, }; }; let d = Darc { - inner: self.darc.inner as *mut DarcInner>, + inner: self.darc.inner as *mut DarcInner, src_pe: self.darc.src_pe, // phantom: PhantomData, }; - d.inner_mut() - .update_item(Box::into_raw(Box::new(DistRwLock::new( - *item, - self.inner().team(), - )))); - GlobalRwDarc { darc: d } + d.inner_mut().update_item(Box::into_raw(Box::new(item))); //the darc will free this approriately + d } } @@ -495,9 +475,17 @@ impl Clone for LocalRwDarc { } } -impl fmt::Display for LocalRwDarc { +impl fmt::Display for LocalRwDarc { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - fmt::Display::fmt(&**self.darc.team().scheduler.block_on(self.read()), f) + let lock: LocalRwDarc = self.clone(); + fmt::Display::fmt( + &self + .darc + .team() + .scheduler + .block_on(async move { lock.read().await }), + f, + ) } } @@ -525,10 +513,7 @@ impl fmt::Display for LocalRwDarc { // } #[doc(hidden)] -pub fn localrw_serialize2( - localrw: &Darc>>>, - s: S, -) -> Result +pub fn localrw_serialize2(localrw: &Darc>>, s: S) -> Result where S: Serializer, { @@ -539,9 +524,7 @@ where } #[doc(hidden)] -pub fn localrw_from_ndarc2<'de, D, T>( - deserializer: D, -) -> Result>>>, D::Error> +pub fn localrw_from_ndarc2<'de, D, T>(deserializer: D) -> Result>>, D::Error> where D: Deserializer<'de>, { @@ -555,8 +538,8 @@ where Ok(Darc::from(ndarc)) } -// impl From>>>> for __NetworkDarc { -// fn from(darc: Darc>>>) -> Self { +// impl From>>> for __NetworkDarc { +// fn from(darc: Darc>>) -> Self { // // println!("rwdarc to net darc"); // // darc.print(); // let team = &darc.inner().team(); @@ -570,8 +553,8 @@ where // } // } -// impl From<&Darc>>>> for __NetworkDarc { -// fn from(darc: &Darc>>>) -> Self { +// impl From<&Darc>>> for __NetworkDarc { +// fn from(darc: &Darc>>) -> Self { // // println!("rwdarc to net darc"); // // darc.print(); // let team = &darc.inner().team(); @@ -585,14 +568,14 @@ where // } // } -// impl From<__NetworkDarc> for Darc>>> { +// impl From<__NetworkDarc> for Darc>> { // fn from(ndarc: __NetworkDarc) -> Self { // // println!("rwdarc from net darc"); // if let Some(lamellae) = LAMELLAES.read().get(&ndarc.backend) { // let darc = Darc { // inner: lamellae.local_addr(ndarc.orig_world_pe, ndarc.inner_addr) -// as *mut DarcInner>>>, +// as *mut DarcInner>>, // src_pe: ndarc.orig_team_pe, // // phantom: 
PhantomData, // }; diff --git a/src/lamellae/command_queues.rs b/src/lamellae/command_queues.rs index 6a9b4404..56b2897f 100644 --- a/src/lamellae/command_queues.rs +++ b/src/lamellae/command_queues.rs @@ -2,7 +2,7 @@ use crate::lamellae::comm::*; use crate::lamellae::{ Des, Lamellae, LamellaeComm, LamellaeRDMA, SerializedData, SerializedDataOps, }; -use crate::scheduler::{Scheduler, SchedulerQueue}; +use crate::scheduler::Scheduler; use parking_lot::Mutex; @@ -1435,7 +1435,8 @@ impl CommandQueue { // "[{:?}] recv_data submitting work", // std::thread::current().id(), // ); - scheduler2.submit_work(work_data, lamellae.clone()); + scheduler2 + .submit_remote_am(work_data, lamellae.clone()); if cmd_cnt_clone.fetch_sub(1, Ordering::SeqCst) == 1 { cq.send_free(src, cmd_buf_cmd); diff --git a/src/lamellae/rofi_lamellae.rs b/src/lamellae/rofi_lamellae.rs index ca76dc34..37bbcb2f 100644 --- a/src/lamellae/rofi_lamellae.rs +++ b/src/lamellae/rofi_lamellae.rs @@ -6,7 +6,7 @@ use crate::lamellae::{ LamellaeRDMA, Ser, SerializeHeader, SerializedData, SerializedDataOps, SERIALIZE_HEADER_LEN, }; use crate::lamellar_arch::LamellarArchRT; -use crate::scheduler::{Scheduler, SchedulerQueue}; +use crate::scheduler::Scheduler; use std::sync::atomic::{AtomicU8, Ordering}; use std::sync::Arc; diff --git a/src/lamellae/shmem_lamellae.rs b/src/lamellae/shmem_lamellae.rs index 49e50716..b4008bcf 100644 --- a/src/lamellae/shmem_lamellae.rs +++ b/src/lamellae/shmem_lamellae.rs @@ -7,7 +7,7 @@ use crate::lamellae::{ LamellaeRDMA, Ser, SerializeHeader, SerializedData, SerializedDataOps, SERIALIZE_HEADER_LEN, }; use crate::lamellar_arch::LamellarArchRT; -use crate::scheduler::{Scheduler, SchedulerQueue}; +use crate::scheduler::Scheduler; use std::sync::atomic::{AtomicU8, Ordering}; use std::sync::Arc; diff --git a/src/lamellar_request.rs b/src/lamellar_request.rs index 601135f5..b49c5a0f 100755 --- a/src/lamellar_request.rs +++ b/src/lamellar_request.rs @@ -3,7 +3,7 @@ use crate::darc::Darc; use crate::lamellae::{Des, SerializedData}; use crate::lamellar_arch::LamellarArchRT; use crate::memregion::one_sided::MemRegionHandleInner; -use crate::scheduler::{Scheduler, SchedulerQueue}; +use crate::scheduler::Scheduler; use async_trait::async_trait; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use std::sync::Arc; diff --git a/src/lamellar_task_group.rs b/src/lamellar_task_group.rs index affdb00d..3df11061 100644 --- a/src/lamellar_task_group.rs +++ b/src/lamellar_task_group.rs @@ -4,7 +4,7 @@ use crate::lamellar_arch::LamellarArchRT; use crate::lamellar_request::*; use crate::lamellar_team::{IntoLamellarTeam, LamellarTeam, LamellarTeamRT}; use crate::memregion::one_sided::MemRegionHandleInner; -use crate::scheduler::{ReqId, Scheduler, SchedulerQueue}; +use crate::scheduler::{ReqId, Scheduler}; use crate::Darc; use crate::active_messaging::registered_active_message::{AmId, AMS_EXECS, AMS_IDS, AM_ID_START}; diff --git a/src/lamellar_team.rs b/src/lamellar_team.rs index 7e4c928a..b2e30533 100644 --- a/src/lamellar_team.rs +++ b/src/lamellar_team.rs @@ -9,7 +9,7 @@ use crate::memregion::{ one_sided::OneSidedMemoryRegion, shared::SharedMemoryRegion, Dist, LamellarMemoryRegion, MemoryRegion, RemoteMemoryRegion, }; -use crate::scheduler::{ReqId, Scheduler, SchedulerQueue}; +use crate::scheduler::{ReqId, Scheduler}; #[cfg(feature = "nightly")] use crate::utils::ser_closure; @@ -485,10 +485,7 @@ impl ActiveMessaging for Arc { self.team.barrier(); } - fn block_on(&self, f: F) -> F::Output - where - F: Future, - { + fn 
block_on(&self, f: F) -> F::Output { assert!(self.panic.load(Ordering::SeqCst) == 0); // trace_span!("block_on").in_scope(|| @@ -925,7 +922,7 @@ impl LamellarTeamRT { // what does it mean if we drop a parent team while a sub_team is valid? if let None = &self.parent { // println!("shutdown lamellae, going to shutdown scheduler"); - self.scheduler.shutdown_threads(); + self.scheduler.begin_shutdown(); self.put_dropped(); self.drop_barrier(); self.lamellae.shutdown(); @@ -1326,7 +1323,8 @@ impl LamellarTeamRT { pub(crate) fn block_on(&self, f: F) -> F::Output where - F: Future, + F: Future + Send + 'static, + F::Output: Send, { assert!(self.panic.load(Ordering::SeqCst) == 0); diff --git a/src/lamellar_world.rs b/src/lamellar_world.rs index 2f2f2b2b..97686819 100644 --- a/src/lamellar_world.rs +++ b/src/lamellar_world.rs @@ -6,7 +6,7 @@ use crate::lamellar_team::{LamellarTeam, LamellarTeamRT}; use crate::memregion::{ one_sided::OneSidedMemoryRegion, shared::SharedMemoryRegion, Dist, RemoteMemoryRegion, }; -use crate::scheduler::{create_scheduler, SchedulerQueue, SchedulerType}; +use crate::scheduler::{create_scheduler, ExecutorType}; // use log::trace; //use tracing::*; @@ -327,7 +327,7 @@ impl Drop for LamellarWorld { /// # Examples /// ///``` -/// use lamellar::{LamellarWorldBuilder,Backend,SchedulerType}; +/// use lamellar::{LamellarWorldBuilder,Backend,ExecutorType}; /// // can also use and of the module preludes /// // use lamellar::active_messaging::prelude::*; /// // use lamellar::array::prelude::*; @@ -336,14 +336,14 @@ impl Drop for LamellarWorld { /// /// let world = LamellarWorldBuilder::new() /// .with_lamellae(Backend::Local) -/// .with_scheduler(SchedulerType::WorkStealing) +/// .with_executor(ExecutorType::LamellarWorkStealing) /// .build(); ///``` #[derive(Debug)] pub struct LamellarWorldBuilder { primary_lamellae: Backend, // secondary_lamellae: HashSet, - scheduler: SchedulerType, + executor: ExecutorType, num_threads: usize, } @@ -359,7 +359,7 @@ impl LamellarWorldBuilder { /// # Examples /// ///``` - /// use lamellar::{LamellarWorldBuilder,Backend,SchedulerType}; + /// use lamellar::{LamellarWorldBuilder,Backend,ExecutorType}; /// // can also use and of the module preludes /// // use lamellar::active_messaging::prelude::*; /// // use lamellar::array::prelude::*; @@ -368,30 +368,35 @@ impl LamellarWorldBuilder { /// /// let world = LamellarWorldBuilder::new() /// .with_lamellae(Backend::Local) - /// .with_scheduler(SchedulerType::WorkStealing) + /// .with_executor(ExecutorType::LamellarWorkStealing) /// .build(); ///``` //#[tracing::instrument(skip_all)] pub fn new() -> LamellarWorldBuilder { // simple_logger::init().unwrap(); // trace!("New world builder"); - let scheduler = match std::env::var("LAMELLAR_SCHEDULER") { + let mut executor = match std::env::var("LAMELLAR_EXECUTOR") { Ok(val) => { - let scheduler = val.parse::().unwrap(); - if scheduler == 0 { - SchedulerType::WorkStealing + let executor = val.parse::().unwrap(); + if executor == 0 { + ExecutorType::LamellarWorkStealing } // else if scheduler == 1 { - // SchedulerType::NumaWorkStealing + // ExecutorType::NumaWorkStealing // } else if scheduler == 2 { - // SchedulerType::NumaWorkStealing2 + // ExecutorType::NumaWorkStealing2 // } else { - SchedulerType::WorkStealing + ExecutorType::LamellarWorkStealing } } - Err(_) => SchedulerType::WorkStealing, + Err(_) => ExecutorType::LamellarWorkStealing, }; + #[cfg(feature = "tokio-executor")] + { + executor = ExecutorType::Tokio; + } + let num_threads = match 
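
Because `block_on` now requires `F: Future + Send + 'static` (so the future can be handed to executors such as Tokio), callers can no longer borrow the world or team inside the future; the examples in this patch therefore clone the handle and move the clone into the block. A minimal sketch of that pattern, assuming the world handle picks up the same bounds:

```rust
use lamellar::LamellarWorldBuilder;

fn main() {
    let world = LamellarWorldBuilder::new().build();
    // The receiver `world.clone()` is evaluated first, then `world` itself is moved
    // into the `async move` block, satisfying the new `Send + 'static` bound.
    world.clone().block_on(async move {
        println!("PE {} of {}", world.my_pe(), world.num_pes());
    });
}
```
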
std::env::var("LAMELLAR_THREADS") { Ok(n) => { if let Ok(num_threads) = n.parse::() { @@ -411,7 +416,7 @@ impl LamellarWorldBuilder { LamellarWorldBuilder { primary_lamellae: Default::default(), // secondary_lamellae: HashSet::new(), - scheduler: scheduler, + executor: executor, num_threads: num_threads, } } @@ -444,24 +449,24 @@ impl LamellarWorldBuilder { // } #[doc(alias = "Collective")] - /// Specify the scheduler to use for this execution + /// Specify the executor to use for this execution /// /// # Collective Operation - /// While simply calling `with_scheduler` is not collective by itself (i.e. there is no internal barrier that would deadlock, + /// While simply calling `with_executor` is not collective by itself (i.e. there is no internal barrier that would deadlock, /// as the remote fabric is not initiated until after a call to `build`), it is necessary that the same /// parameters are used by all PEs that will exist in the world. /// /// # Examples /// ///``` - /// use lamellar::{LamellarWorldBuilder,SchedulerType}; + /// use lamellar::{LamellarWorldBuilder,ExecutorType}; /// /// let builder = LamellarWorldBuilder::new() - /// .with_scheduler(SchedulerType::WorkStealing); + /// .with_executor(ExecutorType::LamellarWorkStealing); ///``` - //#[tracing::instrument(skip_all)] - pub fn with_scheduler(mut self, sched: SchedulerType) -> LamellarWorldBuilder { - self.scheduler = sched; + // #[tracing::instrument(skip_all)] + pub fn with_executor(mut self, sched: ExecutorType) -> LamellarWorldBuilder { + self.executor = sched; self } @@ -475,7 +480,7 @@ impl LamellarWorldBuilder { /// # Examples /// ///``` - /// use lamellar::{LamellarWorldBuilder,SchedulerType}; + /// use lamellar::{LamellarWorldBuilder,ExecutorType}; /// /// let builder = LamellarWorldBuilder::new() /// .set_num_workers(10); @@ -495,11 +500,11 @@ impl LamellarWorldBuilder { /// # Examples /// ///``` - /// use lamellar::{LamellarWorldBuilder,Backend,SchedulerType}; + /// use lamellar::{LamellarWorldBuilder,Backend,ExecutorType}; /// /// let world = LamellarWorldBuilder::new() /// .with_lamellae(Backend::Local) - /// .with_scheduler(SchedulerType::WorkStealing) + /// .with_executor(ExecutorType::LamellarWorkStealing) /// .build(); ///``` //#[tracing::instrument(skip_all)] @@ -520,14 +525,15 @@ impl LamellarWorldBuilder { // println!("{:?}: init_fabric", timer.elapsed()); // timer = std::time::Instant::now(); + + // we delay building the scheduler until we know the number of PEs (which is used for message aggregation) + // this could be lazyily provided but this is easy enough to do here let panic = Arc::new(AtomicU8::new(0)); let sched_new = Arc::new(create_scheduler( - self.scheduler, + self.executor, num_pes, self.num_threads, panic.clone(), - my_pe, - // teams.clone(), )); // println!("{:?}: create_scheduler", timer.elapsed()); diff --git a/src/lib.rs b/src/lib.rs index bf5771ad..099e09e5 100755 --- a/src/lib.rs +++ b/src/lib.rs @@ -294,7 +294,7 @@ pub use crate::lamellar_team::LamellarTeam; #[doc(hidden)] pub use crate::lamellar_team::{ArcLamellarTeam, LamellarTeamRT}; pub use crate::lamellar_world::*; -pub use crate::scheduler::SchedulerType; +pub use crate::scheduler::ExecutorType; extern crate lamellar_impl; #[doc(hidden)] diff --git a/src/scheduler.rs b/src/scheduler.rs index c07ab9d1..97d85179 100755 --- a/src/scheduler.rs +++ b/src/scheduler.rs @@ -1,13 +1,33 @@ +use crate::active_messaging::batching::simple_batcher::SimpleBatcher; +use crate::active_messaging::batching::team_am_batcher::TeamAmBatcher; +use 
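
For reference, a minimal sketch of selecting the executor backend at startup, assuming the builder API as renamed in this patch. With the `tokio-executor` cargo feature enabled, `new()` defaults to `ExecutorType::Tokio` (taking precedence over `LAMELLAR_EXECUTOR`), but `with_executor` can still pick a backend explicitly.

```rust
use lamellar::{Backend, ExecutorType, LamellarWorldBuilder};

fn main() {
    let world = LamellarWorldBuilder::new()
        .with_lamellae(Backend::Local)
        // or `ExecutorType::Tokio` when built with the `tokio-executor` feature
        .with_executor(ExecutorType::LamellarWorkStealing)
        .set_num_workers(4)
        .build();

    println!("hello from PE {}", world.my_pe());
}
```
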
crate::active_messaging::batching::BatcherType; +use crate::active_messaging::registered_active_message::RegisteredActiveMessages; use crate::active_messaging::*; -use crate::lamellae::{Lamellae, SerializedData}; +use crate::lamellae::{Des, Lamellae, SerializedData}; use enum_dispatch::enum_dispatch; use futures::Future; -use std::sync::atomic::AtomicU8; +use std::sync::atomic::{AtomicU8, AtomicUsize, Ordering}; use std::sync::Arc; pub(crate) mod work_stealing; -use work_stealing::{WorkStealing, WorkStealingInner}; +use work_stealing::WorkStealing; + +#[cfg(feature = "tokio-executor")] +pub(crate) mod tokio; +#[cfg(feature = "tokio-executor")] +use tokio::TokioRt; + +// ACTIVE ENUM +// since atomic enums would be another dependecy + +#[repr(u8)] +#[derive(Copy, Clone, Debug, serde::Serialize, serde::Deserialize)] +pub(crate) enum SchedulerStatus { + Active, + Finished, + Panic, +} // pub(crate) mod numa_work_stealing; // use numa_work_stealing::{NumaWorkStealing, NumaWorkStealingInner}; @@ -31,115 +51,254 @@ pub(crate) struct ReqId { pub(crate) sub_id: usize, } -/// The available worker thread scheduling algorithms #[derive(Debug)] -pub enum SchedulerType { - /// The default (and currently only) scheduler, performs workstealing across all worker threads - WorkStealing, - // NumaWorkStealing, - // NumaWorkStealing2, +pub enum ExecutorType { + LamellarWorkStealing, + #[cfg(feature = "tokio-executor")] + Tokio, + // Dyn(impl LamellarExecutor), } -#[enum_dispatch(AmeSchedulerQueue)] -#[derive(Debug)] -pub(crate) enum AmeScheduler { - WorkStealingInner, - // NumaWorkStealingInner, - // NumaWorkStealing2Inner, -} #[enum_dispatch] -pub(crate) trait AmeSchedulerQueue { - fn submit_am( - &self, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - ame: Arc, - am: Am, - ); - fn submit_am_immediate( - &self, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - ame: Arc, - am: Am, - ); - fn submit_work( - &self, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - ame: Arc, - msg: SerializedData, - lamellae: Arc, - ); //serialized active message +pub(crate) trait LamellarExecutor { fn submit_task(&self, future: F) where - F: Future + Send + 'static; + F: Future + Send + 'static, + F::Output: Send; + fn submit_immediate_task(&self, future: F) where - F: Future + Send + 'static; - fn submit_immediate_task2(&self, future: F) - where - F: Future + Send + 'static; - fn exec_task(&self); + F: Future + Send + 'static, + F::Output: Send, + { + Self::submit_task(self, future) + } - fn block_on(&self, future: F) -> F::Output - where - F: Future; + fn exec_task(&self) { + std::thread::yield_now(); + } + + fn block_on(&self, future: F) -> F::Output; + fn set_max_workers(&mut self, num_workers: usize); + fn num_workers(&self) -> usize; fn shutdown(&self); - fn shutdown_threads(&self); fn force_shutdown(&self); - fn active(&self) -> bool; } -#[enum_dispatch(SchedulerQueue)] +#[enum_dispatch(LamellarExecutor)] #[derive(Debug)] -pub(crate) enum Scheduler { - WorkStealing(Arc), - // NumaWorkStealing, - // NumaWorkStealing2, +pub(crate) enum Executor { + WorkStealing(WorkStealing), + #[cfg(feature = "tokio-executor")] + Tokio(TokioRt), } -#[enum_dispatch] -pub(crate) trait SchedulerQueue { - fn submit_am(&self, am: Am); //serialized active message - fn submit_am_immediate(&self, am: Am); //serialized active message - fn submit_work(&self, msg: SerializedData, lamellae: Arc); //serialized active message - fn 
submit_task(&self, future: F) - where - F: Future + Send + 'static; - fn submit_immediate_task(&self, future: F) - where - F: Future + Send + 'static; - fn submit_immediate_task2(&self, future: F) - where - F: Future + Send + 'static; - fn submit_task_node(&self, future: F, node: usize) + +#[derive(Debug)] +pub(crate) struct Scheduler { + executor: Arc, + active_message_engine: RegisteredActiveMessages, //we can eventually abstract this around the ActiveMessageEngine trait but no need currently + num_ams: Arc, + max_ams: Arc, + num_tasks: Arc, + max_tasks: Arc, + am_stall_mark: Arc, + status: Arc, + panic: Arc, +} + +impl Scheduler { + pub(crate) fn new( + executor: Executor, + active_message_engine: RegisteredActiveMessages, + am_stall_mark: Arc, + status: Arc, + panic: Arc, + ) -> Self { + Self { + executor: Arc::new(executor), + active_message_engine, + num_ams: Arc::new(AtomicUsize::new(0)), + max_ams: Arc::new(AtomicUsize::new(0)), + num_tasks: Arc::new(AtomicUsize::new(0)), + max_tasks: Arc::new(AtomicUsize::new(0)), + am_stall_mark, + status, + panic, + } + } + pub(crate) fn submit_am(&self, am: Am) { + let num_ams = self.num_ams.clone(); + let max_ams = self.max_ams.clone(); + let am_stall_mark = self.am_stall_mark.fetch_add(1, Ordering::Relaxed); + let ame = self.active_message_engine.clone(); + let executor = self.executor.clone(); + let am_future = async move { + num_ams.fetch_add(1, Ordering::Relaxed); + max_ams.fetch_add(1, Ordering::Relaxed); + // println!("[{:?}] submit work exec req {:?} {:?} TaskId: {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task); + ame.process_msg(am, executor, am_stall_mark, false).await; + num_ams.fetch_sub(1, Ordering::Relaxed); + // println!("[{:?}] submit work done req {:?} {:?} TaskId: {:?} {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task,reqs); + }; + self.executor.submit_task(am_future); + } + + #[allow(dead_code)] + pub(crate) fn submit_am_immediate(&self, am: Am) { + let num_ams = self.num_ams.clone(); + let max_ams = self.max_ams.clone(); + let am_stall_mark = self.am_stall_mark.fetch_add(1, Ordering::Relaxed); + let ame = self.active_message_engine.clone(); + let executor = self.executor.clone(); + let am_future = async move { + num_ams.fetch_add(1, Ordering::Relaxed); + max_ams.fetch_add(1, Ordering::Relaxed); + // println!("[{:?}] submit work exec req {:?} {:?} TaskId: {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task); + ame.process_msg(am, executor, am_stall_mark, false).await; + num_ams.fetch_sub(1, Ordering::Relaxed); + // println!("[{:?}] submit work done req {:?} {:?} TaskId: {:?} {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task,reqs); + }; + self.executor.submit_immediate_task(am_future); + } + + pub(crate) fn submit_remote_am(&self, data: SerializedData, lamellae: Arc) { + let num_ams = self.num_ams.clone(); + let max_ams = self.max_ams.clone(); + let ame = self.active_message_engine.clone(); + let executor = self.executor.clone(); + let am_future = async move { + num_ams.fetch_add(1, Ordering::Relaxed); + max_ams.fetch_add(1, Ordering::Relaxed); + // println!("[{:?}] submit work exec req {:?} {:?} TaskId: {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task); + if let Some(header) = data.deserialize_header() { + let msg = 
header.msg; + ame.exec_msg(msg, data, lamellae, executor).await; + } else { + data.print(); + panic!("should i be here?"); + } + num_ams.fetch_sub(1, Ordering::Relaxed); + // println!("[{:?}] submit work done req {:?} {:?} TaskId: {:?} {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task,reqs); + }; + self.executor.submit_task(am_future); + } + + pub(crate) fn submit_task(&self, task: F) where - F: Future + Send + 'static; - fn exec_task(&self); - fn block_on(&self, future: F) -> F::Output + F: Future + Send + 'static, + { + let num_tasks = self.num_tasks.clone(); + let max_tasks = self.max_tasks.clone(); + let future = async move { + num_tasks.fetch_add(1, Ordering::Relaxed); + max_tasks.fetch_add(1, Ordering::Relaxed); + task.await; + num_tasks.fetch_sub(1, Ordering::Relaxed); + }; + self.executor.submit_task(future); + } + + pub(crate) fn submit_immediate_task(&self, task: F) where - F: Future; - fn shutdown(&self); - fn shutdown_threads(&self); - fn force_shutdown(&self); - fn active(&self) -> bool; - fn num_workers(&self) -> usize; + F: Future + Send + 'static, + { + let num_tasks = self.num_tasks.clone(); + let max_tasks = self.max_tasks.clone(); + let future = async move { + num_tasks.fetch_add(1, Ordering::Relaxed); + max_tasks.fetch_add(1, Ordering::Relaxed); + task.await; + num_tasks.fetch_sub(1, Ordering::Relaxed); + }; + self.executor.submit_immediate_task(future); + } + + pub(crate) fn exec_task(&self) { + if std::thread::current().id() == *crate::MAIN_THREAD { + self.executor.exec_task(); + } else { + std::thread::yield_now(); + } + } + + pub(crate) fn block_on(&self, task: F) -> F::Output { + self.executor.block_on(task) + } + + #[allow(dead_code)] + pub(crate) fn get_executor(&self) -> Arc { + self.executor.clone() + } + + pub(crate) fn active(&self) -> bool { + self.status.load(Ordering::SeqCst) == SchedulerStatus::Active as u8 + || self.num_tasks.load(Ordering::SeqCst) > 3 // the Lamellae Comm Task, Lamellae Alloc Task, Lamellar Error Task + } + pub(crate) fn num_workers(&self) -> usize { + self.executor.num_workers() + } + pub(crate) fn begin_shutdown(&self) { + self.status + .store(SchedulerStatus::Finished as u8, Ordering::SeqCst); + } + pub(crate) fn shutdown(&self) { + let mut timer = std::time::Instant::now(); + while self.panic.load(Ordering::SeqCst) == 0 && self.num_tasks.load(Ordering::Relaxed) > 3 + //TODO maybe this should be > 2 + { + //the Lamellae Comm Task, Lamellae Alloc Task, Lamellar Error Task + if timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { + println!( + "shurtdown timeout, tasks remaining: {:?} panic: {:?}", + self.num_tasks.load(Ordering::Relaxed), + self.panic.load(Ordering::SeqCst), + ); + timer = std::time::Instant::now(); + } + std::thread::yield_now() + } + self.executor.shutdown(); + } + pub(crate) fn force_shutdown(&self) { + self.status + .store(SchedulerStatus::Panic as u8, Ordering::SeqCst); + self.executor.force_shutdown(); + } } pub(crate) fn create_scheduler( - sched: SchedulerType, + executor: ExecutorType, num_pes: usize, num_workers: usize, panic: Arc, - my_pe: usize, - // teams: Arc>>>, ) -> Scheduler { - match sched { - SchedulerType::WorkStealing => Scheduler::WorkStealing(Arc::new( - work_stealing::WorkStealing::new(num_pes, num_workers, panic, my_pe), - )), // SchedulerType::NumaWorkStealing => { - // Scheduler::NumaWorkStealing(numa_work_stealing::NumaWorkStealing::new(num_pes)) - // } - // SchedulerType::NumaWorkStealing2 => { - // 
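
The counting wrapper that `Scheduler::submit_task` applies above follows a common pattern: wrap the submitted future so shared counters track in-flight and total tasks, which `shutdown` later uses to wait for the queue to drain. A stand-alone sketch of that pattern (the `instrument` helper is a hypothetical name for illustration, not part of this patch):

```rust
use std::future::Future;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;

// Wrap a future so `num_tasks` reflects how many tasks are currently in flight
// and `max_tasks` records the total ever submitted.
fn instrument<F>(
    task: F,
    num_tasks: Arc<AtomicUsize>,
    max_tasks: Arc<AtomicUsize>,
) -> impl Future<Output = ()>
where
    F: Future<Output = ()> + Send + 'static,
{
    async move {
        num_tasks.fetch_add(1, Ordering::Relaxed);
        max_tasks.fetch_add(1, Ordering::Relaxed);
        task.await;
        num_tasks.fetch_sub(1, Ordering::Relaxed);
    }
}
```
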
Scheduler::NumaWorkStealing2(numa_work_stealing2::NumaWorkStealing2::new(num_pes)) - // } - } + let am_stall_mark = Arc::new(AtomicUsize::new(0)); + let status = Arc::new(AtomicU8::new(SchedulerStatus::Active as u8)); + let executor = match executor { + ExecutorType::LamellarWorkStealing => { + WorkStealing::new(num_workers, status.clone(), panic.clone()).into() + } + #[cfg(feature = "tokio-executor")] + ExecutorType::Tokio => TokioRt::new(num_workers).into(), + }; + + let batcher = match std::env::var("LAMELLAR_BATCHER") { + Ok(n) => { + let n = n.parse::().unwrap(); + if n == 1 { + BatcherType::Simple(SimpleBatcher::new(num_pes, am_stall_mark.clone())) + } else { + BatcherType::TeamAm(TeamAmBatcher::new(num_pes, am_stall_mark.clone())) + } + } + Err(_) => BatcherType::TeamAm(TeamAmBatcher::new(num_pes, am_stall_mark.clone())), + }; + Scheduler::new( + executor, + RegisteredActiveMessages::new(batcher), + am_stall_mark, + status, + panic, + ) } diff --git a/src/scheduler/numa_work_stealing.rs b/src/scheduler/numa_work_stealing.rs index 7e94e6ce..c2f5a043 100644 --- a/src/scheduler/numa_work_stealing.rs +++ b/src/scheduler/numa_work_stealing.rs @@ -235,10 +235,7 @@ impl AmeSchedulerQueue for NumaWorkStealingInner { task.detach(); } - fn block_on(&self, future: F) -> F::Output - where - F: Future, - { + fn block_on(&self, future: F) -> F::Output { let work_inj = self.work_inj[self .local_work_inj .get_or(|| AtomicUsize::new(0)) @@ -503,7 +500,7 @@ impl NumaWorkStealingInner { #[derive(Debug)] pub(crate) struct NumaWorkStealing { - inner: Arc, + inner: &(impl SchedulerQueue + Sync + std::fmt::Debug), ame: Arc, } impl NumaWorkStealing { diff --git a/src/scheduler/numa_work_stealing2.rs b/src/scheduler/numa_work_stealing2.rs index 8f25b182..ec82c3ef 100644 --- a/src/scheduler/numa_work_stealing2.rs +++ b/src/scheduler/numa_work_stealing2.rs @@ -431,7 +431,7 @@ thread_local! 
{ #[derive(Debug)] pub(crate) struct NumaWorkStealing2 { - inners: Vec>, + inners: Vec<&(impl SchedulerQueue + Sync + std::fmt::Debug)>, ames: Vec>, node_mask: usize, } diff --git a/src/scheduler/tokio.rs b/src/scheduler/tokio.rs new file mode 100644 index 00000000..f9e14ac1 --- /dev/null +++ b/src/scheduler/tokio.rs @@ -0,0 +1,88 @@ +use crate::scheduler::{LamellarExecutor, SchedulerStatus}; + +use tokio::runtime::Runtime; + +use tracing::*; + +use async_task::{Builder, Runnable}; +use core_affinity::CoreId; +use crossbeam::deque::Worker; +use futures::Future; +use futures_lite::FutureExt; +use rand::prelude::*; +use std::panic; +use std::process; +use std::sync::atomic::{AtomicU8, AtomicUsize, Ordering}; +use std::sync::Arc; //, Weak}; +use std::thread; + +static TASK_ID: AtomicUsize = AtomicUsize::new(0); + +#[derive(Debug)] +pub(crate) struct TokioRt { + max_num_threads: usize, + rt: Runtime, +} + +impl LamellarExecutor for TokioRt { + fn submit_task(&self, task: F) + where + F: Future + Send + 'static, + F::Output: Send, + { + trace_span!("submit_task").in_scope(|| { + self.rt.spawn(async move { task.await }); + }); + } + + fn submit_immediate_task(&self, task: F) + where + F: Future + Send + 'static, + F::Output: Send, + { + trace_span!("submit_task").in_scope(|| { + self.rt.spawn(async move { task.await }); + }); + } + + fn block_on(&self, task: F) -> F::Output { + trace_span!("block_on").in_scope(|| self.rt.block_on(task)) + } + + #[tracing::instrument(skip_all)] + fn shutdown(&self) { + // i think we just let tokio do this on drop + } + + #[tracing::instrument(skip_all)] + fn force_shutdown(&self) { + // i think we just let tokio do this on drop + } + + #[tracing::instrument(skip_all)] + fn exec_task(&self) { + // I dont think tokio has a way to do this + } + + fn set_max_workers(&mut self, num_workers: usize) { + self.max_num_threads = num_workers; + } + + fn num_workers(&self) -> usize { + self.max_num_threads + } +} + +impl TokioRt { + pub(crate) fn new(num_workers: usize) -> TokioRt { + // println!("New TokioRT with {} workers", num_workers); + TokioRt { + max_num_threads: num_workers + 1, //LAMELLAR_THREADS = num_workers + 1, so for tokio runtime, we actually want num_workers + 1 worker threads as block_on will not do anywork on the main thread (i think)... 
+ rt: tokio::runtime::Builder::new_multi_thread() + .worker_threads(num_workers + 1) + .enable_all() + .build() + .unwrap(), + } + } +} diff --git a/src/scheduler/work_stealing.rs b/src/scheduler/work_stealing.rs index 698a3c9e..c61596d9 100644 --- a/src/scheduler/work_stealing.rs +++ b/src/scheduler/work_stealing.rs @@ -1,10 +1,4 @@ -use crate::active_messaging::{ActiveMessageEngine, ActiveMessageEngineType, Am}; -use crate::lamellae::{Des, Lamellae, SerializedData}; -use crate::scheduler::batching::simple_batcher::SimpleBatcher; -use crate::scheduler::batching::team_am_batcher::TeamAmBatcher; -use crate::scheduler::batching::BatcherType; -use crate::scheduler::registered_active_message::RegisteredActiveMessages; -use crate::scheduler::{AmeScheduler, AmeSchedulerQueue, SchedulerQueue}; +use crate::scheduler::{LamellarExecutor, SchedulerStatus}; //use tracing::*; @@ -13,28 +7,14 @@ use core_affinity::CoreId; use crossbeam::deque::Worker; use futures::Future; use futures_lite::FutureExt; -// use parking_lot::Mutex; use rand::prelude::*; -// use std::collections::BTreeMap; use std::panic; use std::process; use std::sync::atomic::{AtomicU8, AtomicUsize, Ordering}; use std::sync::Arc; //, Weak}; use std::thread; -// use std::time::Instant; -// use std::time::Instant; - -const ACTIVE: u8 = 0; -const FINISHED: u8 = 1; -const PANIC: u8 = 2; static TASK_ID: AtomicUsize = AtomicUsize::new(0); - -// static LAST_PRINTED_TASKS: AtomicUsize = AtomicUsize::new(0); - -// static OUTSTANDING_REQS: Mutex> = parking_lot::const_mutex(HashMap::new()); -// lazy_static!{ static ref OUTSTANDING_REQS: Mutex> = Mutex::new(BTreeMap::new()); } - #[derive(Debug)] pub(crate) struct WorkStealingThread { imm_inj: Arc>>, @@ -42,7 +22,7 @@ pub(crate) struct WorkStealingThread { work_stealers: Vec>>, work_q: Worker>, work_flag: Arc, - active: Arc, + status: Arc, panic: Arc, } @@ -51,41 +31,28 @@ impl WorkStealingThread { fn run( worker: WorkStealingThread, active_cnt: Arc, - num_tasks: Arc, - _max_tasks: Arc, + // num_tasks: Arc, id: CoreId, - _my_pe: usize, ) -> thread::JoinHandle<()> { let builder = thread::Builder::new().name("worker_thread".into()); builder .spawn(move || { // println!("TestSchdulerWorker thread running {:?} core: {:?}", std::thread::current().id(), id); - // let mut num_task_executed = 0; - // let _span = trace_span!("WorkStealingThread::run"); + let _span = trace_span!("WorkStealingThread::run"); core_affinity::set_for_current(id); active_cnt.fetch_add(1, Ordering::SeqCst); let mut rng = rand::thread_rng(); let t = rand::distributions::Uniform::from(0..worker.work_stealers.len()); let mut timer = std::time::Instant::now(); - // let mut cur_tasks = num_tasks.load(Ordering::SeqCst); while worker.panic.load(Ordering::SeqCst) == 0 - && (worker.active.load(Ordering::SeqCst) == ACTIVE - || !(worker.work_q.is_empty() - && worker.work_inj.is_empty() - && worker.imm_inj.is_empty()) - || num_tasks.load(Ordering::SeqCst) > 1) + && ( + worker.status.load(Ordering::SeqCst) == SchedulerStatus::Active as u8 + || !(worker.work_q.is_empty() + && worker.work_inj.is_empty() + && worker.imm_inj.is_empty()) + // || num_tasks.load(Ordering::SeqCst) > 1 + ) { - // let ot = Instant::now(); - // if cur_tasks != num_tasks.load(Ordering::SeqCst){ - // println!( - // "work_q size {:?} work inj size {:?} num_tasks {:?}", - // worker.work_q.len(), - // worker.work_inj.len(), - // num_tasks.load(Ordering::SeqCst) - // ); - // cur_tasks = num_tasks.load(Ordering::SeqCst); - - // } let omsg = if !worker.imm_inj.is_empty() { 
worker.imm_inj.steal().success() } else { @@ -110,50 +77,32 @@ impl WorkStealingThread { }; if let Some(runnable) = omsg { - if worker.active.load(Ordering::SeqCst) == FINISHED + if worker.status.load(Ordering::SeqCst) == SchedulerStatus::Finished as u8 && timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { println!("runnable {:?}", runnable); println!( - "work_q size {:?} work inj size {:?} num_tasks {:?}", + "work_q size {:?} work inj size {:?}", // num_tasks {:?}", worker.work_q.len(), worker.work_inj.len(), - num_tasks.load(Ordering::SeqCst) + // num_tasks.load(Ordering::SeqCst) ); timer = std::time::Instant::now(); } - // *OUTSTANDING_REQS.lock().entry(*runnable.metadata()).or_insert(0) += 1; - // if LAST_PRINTED_TASKS.load(Ordering::Relaxed) != num_tasks.load(Ordering::Relaxed) { - // LAST_PRINTED_TASKS.store(num_tasks.load(Ordering::Relaxed), Ordering::Relaxed); - // let work_stealers_lens = worker.work_stealers.iter().map(|x| x.len()).collect::>(); - // println!("[{:?}] (worker thread) Executing task {:?}, num_tasks: {:?} {:?} {:?} {work_stealers_lens:?} {:?}", std::thread::current().id(), runnable.metadata(),num_tasks.load(Ordering::Relaxed), worker.imm_inj.len(),worker.work_inj.len(), OUTSTANDING_REQS.lock()); - // } runnable.run(); } - if worker.active.load(Ordering::SeqCst) == FINISHED + if worker.status.load(Ordering::SeqCst) == SchedulerStatus::Finished as u8 && timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT && (worker.work_q.len() > 0 || worker.work_inj.len() > 0) { println!( - "work_q size {:?} work inj size {:?} num_tasks {:?}", + "work_q size {:?} work inj size {:?} ", // num_tasks {:?}", worker.work_q.len(), worker.work_inj.len(), - num_tasks.load(Ordering::SeqCst) + // num_tasks.load(Ordering::SeqCst) ); timer = std::time::Instant::now(); } - // if timer.elapsed().as_secs_f64() > 10.0 { - // println!( - // "[{:?}] work_q size {:?} work inj size {:?} num_tasks {:?} {:?} {:?}", - // std::thread::current().id(), - // worker.work_q.len(), - // worker.work_inj.len(), - // num_tasks.load(Ordering::SeqCst), - // worker.active.load(Ordering::SeqCst) == FINISHED, - // OUTSTANDING_REQS.lock() - // ); - // timer = std::time::Instant::now() - // } std::thread::yield_now(); } active_cnt.fetch_sub(1, Ordering::SeqCst); @@ -164,356 +113,98 @@ impl WorkStealingThread { } #[derive(Debug)] -pub(crate) struct WorkStealingInner { +pub(crate) struct WorkStealing { + max_num_threads: usize, threads: Vec>, imm_inj: Arc>>, work_inj: Arc>>, work_stealers: Vec>>, work_flag: Arc, - active: Arc, + status: Arc, active_cnt: Arc, - num_tasks: Arc, - max_tasks: Arc, - stall_mark: Arc, panic: Arc, } -impl AmeSchedulerQueue for WorkStealingInner { - //#[tracing::instrument(skip_all)] - fn submit_am( - //unserialized request - &self, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - ame: Arc, - am: Am, - ) { - // println!("[{:?}] submitting_req", std::thread::current().id()); - // println!("submit req {:?}",self.num_tasks.load(Ordering::Relaxed)+1); - let num_tasks = self.num_tasks.clone(); - let max_tasks = self.max_tasks.clone(); - let stall_mark = self.stall_mark.fetch_add(1, Ordering::Relaxed); - let future = move |_cur_task| async move { - num_tasks.fetch_add(1, Ordering::Relaxed); - max_tasks.fetch_add(1, Ordering::Relaxed); - // println!("[{:?}] submit am exec req {:?} {:?} TaskId: {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),_cur_task); - ame.process_msg(am, scheduler, stall_mark, 
false).await; - num_tasks.fetch_sub(1, Ordering::Relaxed); - // let mut reqs = OUTSTANDING_REQS.lock(); - // reqs.remove(cur_task); - // println!("[{:?}] submit am done {:?} {:?} TaskId: {:?} ", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),_cur_task); - }; - let work_inj = self.work_inj.clone(); - // let schedule = move |runnable| work_inj.push(runnable); - let schedule = move |runnable| work_inj.push(runnable); - // let (runnable, task) = unsafe { async_task::spawn(future, schedule) }; //safe as contents are sync+send, and no borrowed variables - let (runnable, task) = unsafe { - Builder::new() - .metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)) - .spawn_unchecked(future, schedule) - }; - // println!("[{:?}] submit am schedule task {:?} {:?} {:?}", std::thread::current().id(),runnable.metadata(),self.num_tasks.load(Ordering::Relaxed),self.max_tasks.load(Ordering::Relaxed)); - runnable.schedule(); - task.detach(); - } - - //#[tracing::instrument(skip_all)] - fn submit_am_immediate( - //unserialized request - &self, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - ame: Arc, - am: Am, - ) { - // println!("submitting_req"); - // println!("submit req {:?}",self.num_tasks.load(Ordering::Relaxed)+1); - let num_tasks = self.num_tasks.clone(); - let max_tasks = self.max_tasks.clone(); - let stall_mark = self.stall_mark.fetch_add(1, Ordering::Relaxed); - let future = move |_cur_task| async move { - num_tasks.fetch_add(1, Ordering::Relaxed); - max_tasks.fetch_add(1, Ordering::Relaxed); - // println!("[{:?}] submit am imm exec req {:?} {:?} TaskId: {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task); - ame.process_msg(am, scheduler, stall_mark, true).await; - num_tasks.fetch_sub(1, Ordering::Relaxed); - - // let mut reqs = OUTSTANDING_REQS.lock(); - // reqs.remove(cur_task); - // println!("[{:?}] submit am imm done {:?} {:?} TaskId: {:?} {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task, reqs); - }; - let work_inj = self.work_inj.clone(); - // let schedule = move |runnable| work_inj.push(runnable); - let schedule = move |runnable| work_inj.push(runnable); - // let (runnable, task) = unsafe { async_task::spawn(future, schedule) }; //safe as contents are sync+send, and no borrowed variables - let (runnable, task) = unsafe { - Builder::new() - .metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)) - .spawn_unchecked(future, schedule) - }; - // println!("[{:?}] submit am imm running task {:?} {:?} {:?}", std::thread::current().id(),runnable.metadata(),self.num_tasks.load(Ordering::Relaxed),self.max_tasks.load(Ordering::Relaxed)); - // *OUTSTANDING_REQS.lock().entry(*runnable.metadata()).or_insert(0) += 1; - runnable.run(); - task.detach(); - } - - //this is a serialized request - //#[tracing::instrument(skip_all)] - fn submit_work( - &self, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - ame: Arc, - data: SerializedData, - lamellae: Arc, - ) { - // let work_inj = self.work_inj.clone(); - // println!("submit work {:?}", self.num_tasks.load(Ordering::Relaxed)); - let num_tasks = self.num_tasks.clone(); - let max_tasks = self.max_tasks.clone(); - let future = move |_cur_task| async move { - num_tasks.fetch_add(1, Ordering::Relaxed); - max_tasks.fetch_add(1, Ordering::Relaxed); - // println!("[{:?}] submit work exec req {:?} {:?} TaskId: {:?}", 
std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task); - if let Some(header) = data.deserialize_header() { - let msg = header.msg; - ame.exec_msg(msg, data, lamellae, scheduler).await; - } else { - data.print(); - panic!("should i be here?"); - } - num_tasks.fetch_sub(1, Ordering::Relaxed); - // let mut reqs = OUTSTANDING_REQS.lock(); - // reqs.remove(cur_task); - // println!("[{:?}] submit work done req {:?} {:?} TaskId: {:?} {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task,reqs); - }; - let work_inj = self.work_inj.clone(); - // let schedule = move |runnable| work_inj.push(runnable); - let schedule = move |runnable| work_inj.push(runnable); - // let (runnable, task) = unsafe { async_task::spawn(future, schedule) }; //safe as contents are sync+send, and no borrowed variables - let (runnable, task) = unsafe { - Builder::new() - .metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)) - .spawn_unchecked(future, schedule) - }; - // println!("[{:?}] submit work schedule task {:?} {:?} {:?}", std::thread::current().id(),runnable.metadata(),self.num_tasks.load(Ordering::Relaxed),self.max_tasks.load(Ordering::Relaxed)); - - runnable.schedule(); - task.detach(); - } - - fn submit_task(&self, future: F) - where - F: Future + Send + 'static, - { - // trace_span!("submit_task").in_scope(|| { - let num_tasks = self.num_tasks.clone(); - let max_tasks = self.max_tasks.clone(); - let future2 = move |_cur_task: &_| async move { - num_tasks.fetch_add(1, Ordering::Relaxed); - max_tasks.fetch_add(1, Ordering::Relaxed); - // println!("[{:?}] submit task exec req {:?} {:?} TaskId: {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task); - future.await; - num_tasks.fetch_sub(1, Ordering::Relaxed); - // let mut reqs = OUTSTANDING_REQS.lock(); - // reqs.remove(cur_task); - // println!("[{:?}] submit task done {:?} {:?} TaskId: {:?} {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task,reqs); - }; - let work_inj = self.work_inj.clone(); - // let schedule = move |runnable| work_inj.push(runnable); - let schedule = move |runnable| work_inj.push(runnable); - // let (runnable, task) = unsafe { async_task::spawn(future2, schedule) }; //safe //safe as contents are sync+send... 
may need to do something to enforce lifetime bounds - let (runnable, task) = Builder::new() - .metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)) - .spawn(future2, schedule); - // println!("[{:?}] submit task schedule task {:?} {:?} {:?}", std::thread::current().id(),runnable.metadata(),self.num_tasks.load(Ordering::Relaxed),self.max_tasks.load(Ordering::Relaxed)); - - runnable.schedule(); - task.detach(); - // }); - } - - fn submit_immediate_task(&self, future: F) +impl LamellarExecutor for WorkStealing { + fn submit_task(&self, task: F) where - F: Future + Send + 'static, + F: Future + Send + 'static, + F::Output: Send, { - // trace_span!("submit_task").in_scope(|| { - let num_tasks = self.num_tasks.clone(); - let max_tasks = self.max_tasks.clone(); - let future2 = move |_cur_task: &_| async move { - // println!("exec task {:?}",num_tasks.load(Ordering::Relaxed)+1); - num_tasks.fetch_add(1, Ordering::Relaxed); - max_tasks.fetch_add(1, Ordering::Relaxed); - // println!("[{:?}] submit imm task exec req {:?} {:?} TaskId: {:?} ", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task); + trace_span!("submit_task").in_scope(|| { + let work_inj = self.work_inj.clone(); + let schedule = move |runnable| work_inj.push(runnable); + let (runnable, task) = Builder::new() + .metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)) + .spawn(move |_task_id| async move { task.await }, schedule); - future.await; - num_tasks.fetch_sub(1, Ordering::Relaxed); - // let mut reqs = OUTSTANDING_REQS.lock(); - // reqs.remove(cur_task); - // println!("[{:?}] submit imm task exec done {:?} {:?} TaskId: {:?} {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task,reqs); - }; - let work_inj = self.work_inj.clone(); - // let schedule = move |runnable| work_inj.push(runnable); - let schedule = move |runnable| work_inj.push(runnable); - // let (runnable, task) = unsafe { async_task::spawn(future2, schedule) }; //safe //safe as contents are sync+send... 
may need to do something to enforce lifetime bounds - let (runnable, task) = Builder::new() - .metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)) - .spawn(future2, schedule); - // println!("[{:?}] submit imm task schedule task {:?} {:?} {:?}", std::thread::current().id(),runnable.metadata(),self.num_tasks.load(Ordering::Relaxed),self.max_tasks.load(Ordering::Relaxed)); - // *OUTSTANDING_REQS.lock().entry(*runnable.metadata()).or_insert(0) += 1; - runnable.run(); //try to run immediately - task.detach(); - // }); + runnable.schedule(); + task.detach(); + }); } - fn submit_immediate_task2(&self, future: F) + fn submit_immediate_task(&self, task: F) where - F: Future + Send + 'static, + F: Future + Send + 'static, + F::Output: Send, { - // trace_span!("submit_task").in_scope(|| { - let num_tasks = self.num_tasks.clone(); - let max_tasks = self.max_tasks.clone(); - let future2 = move |_cur_task: &_| async move { - num_tasks.fetch_add(1, Ordering::Relaxed); - max_tasks.fetch_add(1, Ordering::Relaxed); - // println!("[{:?}] submit imm2 task exec req {:?} {:?} TaskId: {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task); - - future.await; - num_tasks.fetch_sub(1, Ordering::Relaxed); - // let mut reqs = OUTSTANDING_REQS.lock(); - // reqs.remove(cur_task); - // println!("[{:?}] submit imm2 task exec done {:?} {:?} TaskId: {:?} {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task, reqs); - }; - let imm_inj = self.imm_inj.clone(); - // let schedule = move |runnable| imm_inj.push(runnable); - let schedule = move |runnable| imm_inj.push(runnable); - // let (runnable, task) = unsafe { async_task::spawn(future2, schedule) }; //safe //safe as contents are sync+send... 
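// An illustrative, self-contained sketch of the spawn pattern the executor
// methods above rely on: wrap the future with `async_task`, hand it a
// `schedule` closure that pushes the resulting `Runnable` onto a shared
// `crossbeam` injector, then schedule it and keep (or detach) the `Task`
// handle. The helper names (`spawn_onto`, `run_one`) are placeholders for
// this example, not items from this crate.
use std::future::Future;
use std::sync::Arc;
use crossbeam::deque::{Injector, Steal};
use async_task::{Runnable, Task};

fn spawn_onto<F>(inj: Arc<Injector<Runnable>>, fut: F) -> Task<F::Output>
where
    F: Future + Send + 'static,
    F::Output: Send + 'static,
{
    // every wake-up re-queues the task on the shared injector
    let schedule = move |runnable| inj.push(runnable);
    let (runnable, task) = async_task::spawn(fut, schedule);
    runnable.schedule(); // make it visible to the worker threads
    task
}

fn run_one(inj: &Injector<Runnable>) {
    // a worker thread would call this in a loop (typically also stealing
    // from sibling queues, as sketched further below)
    if let Steal::Success(runnable) = inj.steal() {
        runnable.run();
    }
}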
may need to do something to enforce lifetime bounds - let (runnable, task) = Builder::new() - .metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)) - .spawn(future2, schedule); - // println!("[{:?}] submit imm2 task schedule task {:?} {:?} {:?}", std::thread::current().id(),runnable.metadata(),self.num_tasks.load(Ordering::Relaxed),self.max_tasks.load(Ordering::Relaxed)); + trace_span!("submit_task").in_scope(|| { + let imm_inj = self.imm_inj.clone(); + let schedule = move |runnable| imm_inj.push(runnable); + let (runnable, task) = Builder::new() + .metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)) + .spawn(move |_task_id| async move { task.await }, schedule); - runnable.schedule(); //try to run immediately - task.detach(); - // }); + runnable.schedule(); //try to run immediately + task.detach(); + }); } - fn block_on(&self, future: F) -> F::Output - where - F: Future, - { - // trace_span!("block_on").in_scope(|| { - // println!( - // "[{:?}] work stealing block on -- num tasks {:?} max tasks {:?} tasks executed {:?}", - // std::thread::current().id(), - // self.num_tasks.load(Ordering::Relaxed), - // self.max_tasks.load(Ordering::Relaxed), - // 0 - // ); - let num_tasks = self.num_tasks.clone(); - let max_tasks = self.max_tasks.clone(); - let future2 = move |_cur_task| async move { - num_tasks.fetch_add(1, Ordering::Relaxed); - max_tasks.fetch_add(1, Ordering::Relaxed); - // println!("[{:?}] block on task exec req {:?} {:?} TaskId: {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task); - - let res = future.await; - num_tasks.fetch_sub(1, Ordering::Relaxed); - // let mut reqs = OUTSTANDING_REQS.lock(); - // reqs.remove(cur_task); - // println!("[{:?}] block on task exec done {:?} {:?} TaskId: {:?} {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task, reqs); - res - }; - let work_inj = self.work_inj.clone(); - // let schedule = move |runnable| work_inj.push(runnable); - let schedule = move |runnable| work_inj.push(runnable); - - // let (runnable, mut task) = unsafe { async_task::spawn(future, schedule) }; //safe //safe as contents are sync+send... 
may need to do something to enforce lifetime bounds - let (runnable, mut task) = unsafe { - Builder::new() - .metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)) - .spawn_unchecked(future2, schedule) - }; - let waker = runnable.waker(); - // *OUTSTANDING_REQS.lock().entry(*runnable.metadata()).or_insert(0) += 1; - runnable.run(); //try to run immediately - // let mut s = std::time::Instant::now(); - // let mut cnt = 0; - while !task.is_finished() { - self.exec_task(); - // if s.elapsed().as_secs() > 10 { - // println!( - // "[{:?}] work stealing block on timeout -- num tasks {:?} max tasks {:?} tasks executed {:?} task id{:?} {:?}", - // std::thread::current().id(), - // self.num_tasks.load(Ordering::Relaxed), - // self.max_tasks.load(Ordering::Relaxed), - // cnt, - // task.metadata(), - // OUTSTANDING_REQS.lock(), - // ); - // s = std::time::Instant::now(); - // break; - // } - // cnt += 1; - // std::thread::yield_now(); - } - let cx = &mut async_std::task::Context::from_waker(&waker); - if let async_std::task::Poll::Ready(output) = task.poll(cx) { - // println!( - // "[{:?}] work stealing block on done -- num tasks {:?} max tasks {:?} tasks executed {:?} task id{:?}", - // std::thread::current().id(), - // self.num_tasks.load(Ordering::Relaxed), - // self.max_tasks.load(Ordering::Relaxed), - // cnt, - // task.metadata() - // ); - output - } else { - println!( - "[{:?}] work stealing block on failed -- num tasks {:?} max tasks {:?} task id{:?}", + fn block_on(&self, task: F) -> F::Output { + trace_span!("block_on").in_scope(|| { + let work_inj = self.work_inj.clone(); + let schedule = move |runnable| work_inj.push(runnable); + let (runnable, mut task) = unsafe { + Builder::new() + .metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)) + .spawn_unchecked(move |_task_id| async move { task.await }, schedule) + }; + let waker = runnable.waker(); + runnable.run(); //try to run immediately + while !task.is_finished() { + self.exec_task(); //try to execute another task while this one is not ready + } + let cx = &mut async_std::task::Context::from_waker(&waker); + if let async_std::task::Poll::Ready(output) = task.poll(cx) { + output + } else { + println!( + "[{:?}] work stealing block on failed -- task id{:?}", std::thread::current().id(), - self.num_tasks.load(Ordering::Relaxed), - self.max_tasks.load(Ordering::Relaxed), task.metadata() ); - panic!("task not ready"); - } - - // }) + panic!("task not ready"); + } + }) } //#[tracing::instrument(skip_all)] fn shutdown(&self) { - // println!("work stealing shuting down {:?}", self.active()); - self.active.store(FINISHED, Ordering::SeqCst); - // println!("work stealing shuting down {:?}",self.active()); - while self.panic.load(Ordering::SeqCst) == 0 - && (self.active_cnt.load(Ordering::Relaxed) > 0 //num active threads - || self.num_tasks.load(Ordering::Relaxed) > 2) + while self.panic.load(Ordering::SeqCst) == 0 && self.active_cnt.load(Ordering::Relaxed) > 0 { - //this should be the recvtask, and alloc_task + //num active threads + self.exec_task(); std::thread::yield_now() } - // println!( - // "work stealing shut down {:?} {:?} {:?}", - // self.active(), - // self.active_cnt.load(Ordering::Relaxed), - // self.active_cnt.load(Ordering::Relaxed) - // ); - } - - //#[tracing::instrument(skip_all)] - fn shutdown_threads(&self) { - self.active.store(FINISHED, Ordering::SeqCst); } //#[tracing::instrument(skip_all)] fn force_shutdown(&self) { - // println!("work stealing shuting down {:?}", self.active()); - self.active.store(PANIC, Ordering::SeqCst); - // 
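// A minimal sketch of the "help the executor while blocking" idea behind the
// new block_on: run the wrapped future once immediately, and while it has not
// produced a result keep executing other queued tasks on this thread instead
// of parking it. `block_on_helping` is a placeholder name, and the result is
// smuggled out through a Mutex purely to keep the example short; the real
// implementation above polls the task handle with its waker instead.
use std::future::Future;
use std::sync::{Arc, Mutex};
use crossbeam::deque::{Injector, Steal};
use async_task::Runnable;

fn block_on_helping<F>(inj: Arc<Injector<Runnable>>, fut: F) -> F::Output
where
    F: Future + Send + 'static,
    F::Output: Send + 'static,
{
    let result: Arc<Mutex<Option<F::Output>>> = Arc::new(Mutex::new(None));
    let slot = result.clone();
    let sched_inj = inj.clone();
    let schedule = move |runnable| sched_inj.push(runnable);
    let (runnable, task) = async_task::spawn(
        async move {
            let out = fut.await;
            *slot.lock().unwrap() = Some(out);
        },
        schedule,
    );
    task.detach();
    runnable.run(); // try to finish in one shot
    loop {
        if let Some(out) = result.lock().unwrap().take() {
            return out;
        }
        // not done yet: execute whatever else is queued rather than spinning idle
        match inj.steal() {
            Steal::Success(r) => {
                r.run();
            }
            _ => std::thread::yield_now(),
        }
    }
}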
println!("work stealing shuting down {:?}",self.active()); + // println!("work stealing shuting down {:?}", self.status()); + + // println!("work stealing shuting down {:?}",self.status()); let my_id = std::thread::current().id(); if self.threads.iter().any(|e| e.thread().id() == my_id) { - // while self.active_cnt.load(Ordering::Relaxed) > 1 {//num active threads -- wait for all but myself - // std::thread::yield_now() - // } self.active_cnt.fetch_sub(1, Ordering::SeqCst); // I paniced so I wont actually decrement } else { while self.active_cnt.load(Ordering::Relaxed) > 0 { @@ -524,7 +215,7 @@ impl AmeSchedulerQueue for WorkStealingInner { } // println!( // "work stealing shut down {:?} {:?} {:?}", - // self.active(), + // self.status(), // self.active_cnt.load(Ordering::Relaxed), // self.active_cnt.load(Ordering::Relaxed) // ); @@ -550,138 +241,44 @@ impl AmeSchedulerQueue for WorkStealingInner { } }; if let Some(runnable) = ret { - // if LAST_PRINTED_TASKS.load(Ordering::Relaxed) != self.num_tasks.load(Ordering::Relaxed) { - // LAST_PRINTED_TASKS.store(self.num_tasks.load(Ordering::Relaxed), Ordering::Relaxed); - // let work_stealers_lens = self.work_stealers.iter().map(|x| x.len()).collect::>(); - // // println!("[{:?}] (exec_task) Executing task {:?}, num_tasks: {:?} {:?} {:?} {work_stealers_lens:?} {:?}", std::thread::current().id(), runnable.metadata(),self.num_tasks.load(Ordering::Relaxed), self.imm_inj.len(),self.work_inj.len(), OUTSTANDING_REQS.lock()); - // } - // *OUTSTANDING_REQS.lock().entry(*runnable.metadata()).or_insert(0) += 1; runnable.run(); } } - //#[tracing::instrument(skip_all)] - fn active(&self) -> bool { - // println!("sched active {:?} {:?}",self.active.load(Ordering::SeqCst) , self.num_tasks.load(Ordering::SeqCst)); - self.active.load(Ordering::SeqCst) == ACTIVE || self.num_tasks.load(Ordering::SeqCst) > 3 - } -} - -impl SchedulerQueue for Arc { - fn submit_am( - //unserialized request - &self, - am: Am, - ) { - self.inner.submit_am(self.clone(), self.ame.clone(), am); - } - - fn submit_am_immediate( - //unserialized request - &self, - am: Am, - ) { - self.inner - .submit_am_immediate(self.clone(), self.ame.clone(), am); - } - - // fn submit_return(&self, src, pe) - - fn submit_work(&self, data: SerializedData, lamellae: Arc) { - self.inner - .submit_work(self.clone(), self.ame.clone(), data, lamellae); - } - - fn submit_task(&self, future: F) - where - F: Future + Send + 'static, - { - self.inner.submit_task(future); - } - - fn submit_immediate_task(&self, future: F) - where - F: Future + Send + 'static, - { - self.inner.submit_immediate_task(future); + fn set_max_workers(&mut self, num_workers: usize) { + self.max_num_threads = num_workers; } - fn submit_immediate_task2(&self, future: F) - where - F: Future + Send + 'static, - { - self.inner.submit_immediate_task2(future); - } - - fn exec_task(&self) { - self.inner.exec_task(); - std::thread::yield_now(); - } - - fn submit_task_node(&self, future: F, _node: usize) - where - F: Future + Send + 'static, - { - self.inner.submit_task(future); - } - - fn block_on(&self, future: F) -> F::Output - where - F: Future, - { - self.inner.block_on(future) - } - - fn shutdown(&self) { - self.inner.shutdown(); - } - - fn shutdown_threads(&self) { - self.inner.shutdown_threads(); - } - - fn force_shutdown(&self) { - self.inner.force_shutdown(); - } - fn active(&self) -> bool { - self.inner.active() - } fn num_workers(&self) -> usize { self.max_num_threads } } -impl WorkStealingInner { - //#[tracing::instrument(skip_all)] 
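// For reference, the canonical crossbeam-deque work-stealing lookup that a
// setup like the one constructed below (a global injector, one FIFO Worker per
// thread, and a shared list of Stealers) is built around: prefer the local
// queue, then batch-steal from the injector, then steal from a sibling. This
// is the pattern from the crossbeam documentation, not the crate's exact
// exec_task body.
use crossbeam::deque::{Injector, Stealer, Worker};
use std::iter;

fn find_task<T>(
    local: &Worker<T>,
    global: &Injector<T>,
    stealers: &[Stealer<T>],
) -> Option<T> {
    // fast path: work previously pushed to (or stolen into) our own queue
    local.pop().or_else(|| {
        iter::repeat_with(|| {
            // grab a batch from the global injector, keeping one task to run now
            global
                .steal_batch_and_pop(local)
                // otherwise try each sibling's queue in turn
                .or_else(|| stealers.iter().map(|s| s.steal()).collect())
        })
        // retry while every source reported a transient conflict
        .find(|s| !s.is_retry())
        .and_then(|s| s.success())
    })
}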
+impl WorkStealing { pub(crate) fn new( - stall_mark: Arc, num_workers: usize, + status: Arc, panic: Arc, - my_pe: usize, - ) -> WorkStealingInner { + ) -> WorkStealing { // println!("new work stealing queue"); - - let mut sched = WorkStealingInner { + let mut ws = WorkStealing { + max_num_threads: num_workers, threads: Vec::new(), imm_inj: Arc::new(crossbeam::deque::Injector::new()), work_inj: Arc::new(crossbeam::deque::Injector::new()), work_stealers: Vec::new(), work_flag: Arc::new(AtomicU8::new(0)), - active: Arc::new(AtomicU8::new(ACTIVE)), + status: status, active_cnt: Arc::new(AtomicUsize::new(0)), - num_tasks: Arc::new(AtomicUsize::new(0)), - max_tasks: Arc::new(AtomicUsize::new(0)), - stall_mark: stall_mark, panic: panic, }; - sched.init(num_workers, my_pe); - sched + ws.init(); + ws } - - //#[tracing::instrument(skip_all)] - fn init(&mut self, num_workers: usize, my_pe: usize) { + #[tracing::instrument(skip_all)] + fn init(&mut self) { let mut work_workers: std::vec::Vec>> = vec![]; - for _i in 0..num_workers { + for _i in 0..self.max_num_threads { let work_worker: crossbeam::deque::Worker> = crossbeam::deque::Worker::new_fifo(); self.work_stealers.push(work_worker.stealer()); @@ -701,7 +298,7 @@ impl WorkStealingInner { } }; // println!("core_ids: {:?}",core_ids); - for i in 0..num_workers { + for i in 0..self.max_num_threads { let work_worker = work_workers.pop().unwrap(); let worker = WorkStealingThread { imm_inj: self.imm_inj.clone(), @@ -709,17 +306,14 @@ impl WorkStealingInner { work_stealers: self.work_stealers.clone(), work_q: work_worker, work_flag: self.work_flag.clone(), - active: self.active.clone(), + status: self.status.clone(), panic: self.panic.clone(), - // num_tasks: self.num_tasks.clone(), }; self.threads.push(WorkStealingThread::run( worker, self.active_cnt.clone(), - self.num_tasks.clone(), - self.max_tasks.clone(), + // self.num_tasks.clone(), core_ids[i % core_ids.len()], - my_pe, )); } while self.active_cnt.load(Ordering::SeqCst) != self.threads.len() { @@ -728,54 +322,7 @@ impl WorkStealingInner { } } -#[derive(Debug)] -pub(crate) struct WorkStealing { - inner: Arc, - ame: Arc, - max_num_threads: usize, //including the main thread -} -impl WorkStealing { - //#[tracing::instrument(skip_all)] - pub(crate) fn new( - num_pes: usize, - num_workers: usize, - panic: Arc, - my_pe: usize, - // teams: Arc>>>, - ) -> WorkStealing { - // println!("new work stealing queue"); - let stall_mark = Arc::new(AtomicUsize::new(0)); - let inner = Arc::new(AmeScheduler::WorkStealingInner(WorkStealingInner::new( - stall_mark.clone(), - num_workers, - panic.clone(), - my_pe, - ))); - - let batcher = match std::env::var("LAMELLAR_BATCHER") { - Ok(n) => { - let n = n.parse::().unwrap(); - if n == 1 { - BatcherType::Simple(SimpleBatcher::new(num_pes, stall_mark.clone())) - } else { - BatcherType::TeamAm(TeamAmBatcher::new(num_pes, stall_mark.clone())) - } - } - Err(_) => BatcherType::TeamAm(TeamAmBatcher::new(num_pes, stall_mark.clone())), - }; - - let sched = WorkStealing { - inner: inner.clone(), - ame: Arc::new(ActiveMessageEngineType::RegisteredActiveMessages(Arc::new( - RegisteredActiveMessages::new(batcher), - ))), - max_num_threads: num_workers, - }; - sched - } -} - -impl Drop for WorkStealingInner { +impl Drop for WorkStealing { //when is this called with respect to world? 
//#[tracing::instrument(skip_all)] fn drop(&mut self) { diff --git a/tests/array/arithmetic_ops/add_test.rs b/tests/array/arithmetic_ops/add_test.rs index 4b62b69b..44ae69ab 100644 --- a/tests/array/arithmetic_ops/add_test.rs +++ b/tests/array/arithmetic_ops/add_test.rs @@ -436,7 +436,7 @@ macro_rules! input_test{ // array.add(input_array.clone(),1); // check_results!($array,array,num_pes,"LocalLockArray"); // LocalLockArray------------------------------ - let _ = array.batch_add(&world.block_on(input_array.read_local_data()),1); + array.batch_add(&input_array.blocking_read_local_data(),1); check_results!($array,array,num_pes,"&LocalLockArray"); println!("passed &LocalLockArray"); @@ -445,7 +445,7 @@ macro_rules! input_test{ // array.add(input_array.clone(),1); // check_results!($array,array,num_pes,"GlobalLockArray"); // GlobalLockArray------------------------------ - let _ = array.batch_add(&world.block_on(input_array.read_local_data()),1); + array.batch_add(&input_array.blocking_read_local_data(),1); check_results!($array,array,num_pes,"&GlobalLockArray"); println!("passed &GlobalLockArray"); } diff --git a/tests/array/arithmetic_ops/fetch_add_test.rs b/tests/array/arithmetic_ops/fetch_add_test.rs index 78e85b72..80ce4761 100644 --- a/tests/array/arithmetic_ops/fetch_add_test.rs +++ b/tests/array/arithmetic_ops/fetch_add_test.rs @@ -539,7 +539,7 @@ macro_rules! input_test{ // check_results!($array,array,num_pes,reqs,"LocalLockArray"); // LocalLockArray------------------------------ let mut reqs = vec![]; - reqs.push(array.batch_fetch_add(&world.block_on(input_array.read_local_data()),1)); + reqs.push(array.batch_fetch_add(&input_array.blocking_read_local_data(),1)); check_results!($array,array,num_pes,reqs,"&LocalLockArray"); // GlobalLockArray------------------------------ @@ -549,7 +549,7 @@ macro_rules! 
input_test{ // check_results!($array,array,num_pes,reqs,"GlobalLockArray"); // GlobalLockArray------------------------------ let mut reqs = vec![]; - reqs.push(array.batch_fetch_add(&world.block_on(input_array.read_local_data()),1)); + reqs.push(array.batch_fetch_add(&input_array.blocking_read_local_data(),1)); check_results!($array,array,num_pes,reqs,"&GlobalLockArray"); } } From 9ed259b9f45d93c56edd2431d0b25a7418b2fdf4 Mon Sep 17 00:00:00 2001 From: "ryan.friese@pnnl.gov" Date: Thu, 8 Feb 2024 10:48:02 -0800 Subject: [PATCH 009/116] refactoring internal block_on calls to async calls --- impl/src/array_ops.rs | 18 +- impl/src/array_reduce.rs | 2 +- src/array.rs | 68 +++++++ src/array/atomic.rs | 22 ++- src/array/generic_atomic.rs | 30 ++- src/array/generic_atomic/iteration.rs | 12 +- src/array/global_lock_atomic.rs | 24 ++- src/array/global_lock_atomic/iteration.rs | 12 +- src/array/iterator/distributed_iterator.rs | 182 +++++++++--------- .../distributed_iterator/consumer/collect.rs | 35 ++-- src/array/iterator/local_iterator.rs | 10 +- .../local_iterator/consumer/collect.rs | 24 ++- src/array/local_lock_atomic.rs | 24 ++- src/array/local_lock_atomic/iteration.rs | 12 +- src/array/local_only.rs | 11 ++ src/array/native_atomic.rs | 26 ++- src/array/native_atomic/iteration.rs | 12 +- src/array/read_only.rs | 22 ++- src/array/read_only/iteration.rs | 12 +- src/array/unsafe.rs | 168 +++++++++++++++- src/array/unsafe/iteration/distributed.rs | 10 +- src/array/unsafe/iteration/local.rs | 6 +- src/lamellar_task_group.rs | 21 ++ src/scheduler.rs | 10 +- src/scheduler/tokio.rs | 88 --------- 25 files changed, 586 insertions(+), 275 deletions(-) delete mode 100644 src/scheduler/tokio.rs diff --git a/impl/src/array_ops.rs b/impl/src/array_ops.rs index 56a67524..96bd93ec 100644 --- a/impl/src/array_ops.rs +++ b/impl/src/array_ops.rs @@ -897,7 +897,7 @@ fn create_buf_ops( #[allow(non_snake_case)] fn #dist_multi_val_multi_idx_am_buf_name(array: #lamellar::array::LamellarByteArray, op: #lamellar::array::ArrayOpCmd>, idx_vals: Vec, index_size: u8) -> Arc{ Arc::new(#multi_val_multi_idx_am_buf_name{ - data: array.into(), + data: Into::into(array), op: op.into(), idx_vals: idx_vals, index_size: index_size, @@ -964,7 +964,7 @@ fn create_buf_ops( let val_slice = unsafe {std::slice::from_raw_parts(val.as_ptr() as *const #typeident, std::mem::size_of::<#typeident>())}; let val = val_slice[0]; Arc::new(#single_val_multi_idx_am_buf_name{ - data: array.into(), + data: Into::into(array), op: op.into(), val: val, indices: indicies, @@ -1001,7 +1001,7 @@ fn create_buf_ops( #[allow(non_snake_case)] fn #dist_multi_val_single_idx_am_buf_name(array: #lamellar::array::LamellarByteArray, op: #lamellar::array::ArrayOpCmd>, vals: Vec, index: usize) -> Arc{ Arc::new(#multi_val_single_idx_am_buf_name{ - data: array.into(), + data: Into::into(array), op: op.into(), vals: vals, index: index, @@ -1070,7 +1070,7 @@ fn create_buf_ops( #[allow(non_snake_case)] fn #dist_multi_val_multi_idx_am_buf_result_name(array: #lamellar::array::LamellarByteArray, op: #lamellar::array::ArrayOpCmd>, idx_vals: Vec, index_size: u8) -> Arc{ Arc::new(#multi_val_multi_idx_am_buf_result_name{ - data: array.into(), + data: Into::into(array), op: op.into(), idx_vals: idx_vals, index_size: index_size, @@ -1139,7 +1139,7 @@ fn create_buf_ops( let val_slice = unsafe {std::slice::from_raw_parts(val.as_ptr() as *const #typeident, std::mem::size_of::<#typeident>())}; let val = val_slice[0]; Arc::new(#single_val_multi_idx_am_buf_result_name{ - data: 
array.into(), + data: Into::into(array), op: op.into(), val: val, indices: indicies, @@ -1178,7 +1178,7 @@ fn create_buf_ops( #[allow(non_snake_case)] fn #dist_multi_val_single_idx_am_buf_result_name(array: #lamellar::array::LamellarByteArray, op: #lamellar::array::ArrayOpCmd>, vals: Vec, index: usize) -> Arc{ Arc::new(#multi_val_single_idx_am_buf_result_name{ - data: array.into(), + data: Into::into(array), op: op.into(), vals: vals, index: index, @@ -1251,7 +1251,7 @@ fn create_buf_ops( #[allow(non_snake_case)] fn #dist_multi_val_multi_idx_am_buf_fetch_name(array: #lamellar::array::LamellarByteArray, op: #lamellar::array::ArrayOpCmd>, idx_vals: Vec,index_usize: u8) -> Arc{ Arc::new(#multi_val_multi_idx_am_buf_fetch_name{ - data: array.into(), + data: Into::into(array), op: op.into(), idx_vals: idx_vals, index_size: index_usize, @@ -1320,7 +1320,7 @@ fn create_buf_ops( let val_slice = unsafe {std::slice::from_raw_parts(val.as_ptr() as *const #typeident, std::mem::size_of::<#typeident>())}; let val = val_slice[0]; Arc::new(#single_val_multi_idx_am_buf_fetch_name{ - data: array.into(), + data: Into::into(array), op: op.into(), val: val, indices: indicies, @@ -1363,7 +1363,7 @@ fn create_buf_ops( #[allow(non_snake_case)] fn #dist_multi_val_single_idx_am_buf_fetch_name(array: #lamellar::array::LamellarByteArray, op: #lamellar::array::ArrayOpCmd>, vals: Vec, index: usize) -> Arc{ Arc::new(#multi_val_single_idx_am_buf_fetch_name{ - data: array.into(), + data: Into::into(array), op: op.into(), vals: vals, index: index, diff --git a/impl/src/array_reduce.rs b/impl/src/array_reduce.rs index d059f96f..ee7629e6 100644 --- a/impl/src/array_reduce.rs +++ b/impl/src/array_reduce.rs @@ -49,7 +49,7 @@ fn create_reduction( gen_match_stmts.extend(quote!{ #lamellar::array::LamellarByteArray::#array_type(inner) => std::sync::Arc::new(#reduction_name{ - data: unsafe {inner.clone().into()} , start_pe: 0, end_pe: num_pes-1}), + data: unsafe {Into::into(inner.clone())} , start_pe: 0, end_pe: num_pes-1}), }); let iter_chain = if array_type == "AtomicArray" diff --git a/src/array.rs b/src/array.rs index 0dbea2d8..41a58b79 100644 --- a/src/array.rs +++ b/src/array.rs @@ -503,12 +503,63 @@ impl TeamTryFrom<(&Vec, Distribution)> for Vec { } } +#[async_trait] +/// Provides the same abstraction as the `From` trait in the standard language, but with a `team` parameter so that lamellar memory regions can be allocated +/// and to be used within an async context +pub(crate) trait AsyncInto: Sized { + async fn async_into(self) -> T; +} + +#[async_trait] +/// Provides the same abstraction as the `From` trait in the standard language, but with a `team` parameter so that lamellar memory regions can be allocated +/// and to be used within an async context +pub(crate) trait AsyncFrom: Sized { + async fn async_from(val: T) -> Self; +} + +// AsyncFrom implies AsyncInto +#[async_trait] +impl AsyncInto for T +where + T: Send, + U: AsyncFrom, +{ + /// Calls `U::from(self).await`. + /// + /// That is, this conversion is whatever the implementation of + /// [AsyncFrom]<T> for U chooses to do. + #[inline] + async fn async_into(self) -> U { + U::async_from(self).await + } +} + +// AsyncFrom (and thus Into) is reflexive +// #[async_trait] +// impl AsyncFrom for T +// where +// T: Send, +// { +// /// Returns the argument unchanged. 
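// A standalone illustration of the AsyncFrom/AsyncInto pairing introduced
// above: implementing the `AsyncFrom` side is enough, because the blanket impl
// supplies `async_into()` for free, exactly like the std `From`/`Into` pair.
// The trait copies and toy types below are local to this example (the crate's
// own versions are pub(crate)).
use async_trait::async_trait;

#[async_trait]
trait ExampleAsyncFrom<T>: Sized {
    async fn async_from(val: T) -> Self;
}

#[async_trait]
trait ExampleAsyncInto<T>: Sized {
    async fn async_into(self) -> T;
}

#[async_trait]
impl<T: Send, U: ExampleAsyncFrom<T>> ExampleAsyncInto<U> for T {
    async fn async_into(self) -> U {
        U::async_from(self).await
    }
}

struct Raw(Vec<u8>);
struct Checked(Vec<u8>);

#[async_trait]
impl ExampleAsyncFrom<Raw> for Checked {
    async fn async_from(raw: Raw) -> Self {
        // a real conversion might await outstanding work here, the way the
        // array conversions in this patch await `await_on_outstanding`
        Checked(raw.0)
    }
}

async fn demo() -> Checked {
    Raw(vec![1, 2, 3]).async_into().await
}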
+// #[inline(always)] +// async fn async_from(t: T) -> T { +// t +// } +// } + /// Provides the same abstraction as the `From` trait in the standard language, but with a `team` parameter so that lamellar memory regions can be allocated pub trait TeamFrom { /// Converts to this type from the input type fn team_from(val: T, team: &Pin>) -> Self; } +#[async_trait] +/// Provides the same abstraction as the `From` trait in the standard language, but with a `team` parameter so that lamellar memory regions can be allocated +/// and to be used within an async context +pub trait AsyncTeamFrom: TeamFrom { + async fn team_from(val: T, team: &Pin>) -> Self; +} + /// Provides the same abstraction as the `TryFrom` trait in the standard language, but with a `team` parameter so that lamellar memory regions can be allocated pub trait TeamTryFrom { /// Trys to convert to this type from the input type @@ -522,6 +573,13 @@ pub trait TeamInto { fn team_into(self, team: &Pin>) -> T; } +/// Provides the same abstraction as the `Into` trait in the standard language, but with a `team` parameter so that lamellar memory regions can be allocated to be used within an async context +#[async_trait] +pub trait AsyncTeamInto { + /// converts this type into the (usually inferred) input type + async fn team_into(self, team: &Pin>) -> T; +} + /// Provides the same abstraction as the `TryInto` trait in the standard language, but with a `team` parameter so that lamellar memory regions can be allocated pub trait TeamTryInto: Sized { @@ -538,6 +596,16 @@ where } } +#[async_trait] +impl AsyncTeamInto for T +where + U: AsyncTeamFrom, +{ + async fn team_into(self, team: &Pin>) -> U { + >::team_from(self, team).await + } +} + impl TeamTryInto for T where U: TeamTryFrom, diff --git a/src/array/atomic.rs b/src/array/atomic.rs index 896876b0..3d1f863f 100644 --- a/src/array/atomic.rs +++ b/src/array/atomic.rs @@ -1079,11 +1079,19 @@ impl TeamFrom<(Vec, Distribution)> for AtomicArray { fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { let (vals, distribution) = input; let input = (&vals, distribution); - let array: UnsafeArray = input.team_into(team); + let array: UnsafeArray = TeamInto::team_into(input, team); array.into() } } +#[async_trait] +impl AsyncTeamFrom<(Vec, Distribution)> for AtomicArray { + async fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { + let array: UnsafeArray = AsyncTeamInto::team_into(input, team).await; + array.async_into().await + } +} + impl From> for AtomicArray { fn from(array: UnsafeArray) -> Self { // println!("Converting from UnsafeArray to AtomicArray"); @@ -1095,6 +1103,18 @@ impl From> for AtomicArray { } } +#[async_trait] +impl AsyncFrom> for AtomicArray { + async fn async_from(array: UnsafeArray) -> Self { + // println!("Converting from UnsafeArray to AtomicArray"); + if NATIVE_ATOMICS.contains(&TypeId::of::()) { + NativeAtomicArray::async_from(array).await.into() + } else { + GenericAtomicArray::async_from(array).await.into() + } + } +} + // impl From> for AtomicArray { // fn from(array: LocalOnlyArray) -> Self { // // println!("Converting from LocalOnlyArray to AtomicArray"); diff --git a/src/array/generic_atomic.rs b/src/array/generic_atomic.rs index 382059a4..e051719b 100644 --- a/src/array/generic_atomic.rs +++ b/src/array/generic_atomic.rs @@ -580,11 +580,19 @@ impl TeamFrom<(Vec, Distribution)> for GenericAtomicArray fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { let (vals, distribution) = input; let input = (&vals, distribution); - let array: 
UnsafeArray = input.team_into(team); + let array: UnsafeArray = TeamInto::team_into(input, team); array.into() } } +#[async_trait] +impl AsyncTeamFrom<(Vec, Distribution)> for GenericAtomicArray { + async fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { + let array: UnsafeArray = AsyncTeamInto::team_into(input, team).await; + array.async_into().await + } +} + impl From> for GenericAtomicArray { fn from(array: UnsafeArray) -> Self { // println!("generic from unsafe array"); @@ -602,6 +610,26 @@ impl From> for GenericAtomicArray { } } +#[async_trait] +impl AsyncFrom> for GenericAtomicArray { + async fn async_from(array: UnsafeArray) -> Self { + // println!("generic from unsafe array"); + array + .await_on_outstanding(DarcMode::GenericAtomicArray) + .await; + let mut vec = vec![]; + for _i in 0..array.num_elems_local() { + vec.push(Mutex::new(())); + } + let locks = Darc::new(array.team_rt(), vec).unwrap(); + + GenericAtomicArray { + locks: locks, + array: array, + } + } +} + impl From> for GenericAtomicByteArray { fn from(array: GenericAtomicArray) -> Self { GenericAtomicByteArray { diff --git a/src/array/generic_atomic/iteration.rs b/src/array/generic_atomic/iteration.rs index 925f32fe..20563370 100644 --- a/src/array/generic_atomic/iteration.rs +++ b/src/array/generic_atomic/iteration.rs @@ -293,7 +293,7 @@ impl DistIteratorLauncher for GenericAtomicArray { where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect(&self.array, iter, d) } @@ -307,7 +307,7 @@ impl DistIteratorLauncher for GenericAtomicArray { where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) } @@ -320,7 +320,7 @@ impl DistIteratorLauncher for GenericAtomicArray { I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect_async(&self.array, iter, d) } @@ -335,7 +335,7 @@ impl DistIteratorLauncher for GenericAtomicArray { I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect_async_with_schedule(&self.array, sched, iter, d) } @@ -481,7 +481,7 @@ impl LocalIteratorLauncher for GenericAtomicArray { where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { LocalIteratorLauncher::collect(&self.array, iter, d) } @@ -495,7 +495,7 @@ impl LocalIteratorLauncher for GenericAtomicArray { where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { LocalIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) } diff --git a/src/array/global_lock_atomic.rs b/src/array/global_lock_atomic.rs index 
6b9ff9ef..48bf357b 100644 --- a/src/array/global_lock_atomic.rs +++ b/src/array/global_lock_atomic.rs @@ -664,11 +664,19 @@ impl TeamFrom<(Vec, Distribution)> for GlobalLockArray fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { let (vals, distribution) = input; let input = (&vals, distribution); - let array: UnsafeArray = input.team_into(team); + let array: UnsafeArray = TeamInto::team_into(input, team); array.into() } } +#[async_trait] +impl AsyncTeamFrom<(Vec, Distribution)> for GlobalLockArray { + async fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { + let array: UnsafeArray = AsyncTeamInto::team_into(input, team).await; + array.async_into().await + } +} + impl From> for GlobalLockArray { fn from(array: UnsafeArray) -> Self { // println!("GlobalLock from unsafe"); @@ -682,6 +690,20 @@ impl From> for GlobalLockArray { } } +#[async_trait] +impl AsyncFrom> for GlobalLockArray { + async fn async_from(array: UnsafeArray) -> Self { + // println!("GlobalLock from unsafe"); + array.await_on_outstanding(DarcMode::GlobalLockArray).await; + let lock = GlobalRwDarc::new(array.team_rt(), ()).unwrap(); + + GlobalLockArray { + lock: lock, + array: array, + } + } +} + // impl From> for GlobalLockArray { // fn from(array: LocalOnlyArray) -> Self { // // println!("GlobalLock from localonly"); diff --git a/src/array/global_lock_atomic/iteration.rs b/src/array/global_lock_atomic/iteration.rs index 0aeafafd..b6adbd47 100644 --- a/src/array/global_lock_atomic/iteration.rs +++ b/src/array/global_lock_atomic/iteration.rs @@ -501,7 +501,7 @@ impl DistIteratorLauncher for GlobalLockArray { where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect(&self.array, iter, d) } @@ -515,7 +515,7 @@ impl DistIteratorLauncher for GlobalLockArray { where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) } @@ -528,7 +528,7 @@ impl DistIteratorLauncher for GlobalLockArray { I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect_async(&self.array, iter, d) } @@ -543,7 +543,7 @@ impl DistIteratorLauncher for GlobalLockArray { I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect_async_with_schedule(&self.array, sched, iter, d) } @@ -689,7 +689,7 @@ impl LocalIteratorLauncher for GlobalLockArray { where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { LocalIteratorLauncher::collect(&self.array, iter, d) } @@ -703,7 +703,7 @@ impl LocalIteratorLauncher for GlobalLockArray { where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, 
Distribution)> + SyncSend + Clone + 'static, { LocalIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) } diff --git a/src/array/iterator/distributed_iterator.rs b/src/array/iterator/distributed_iterator.rs index 56be3e22..90ec1cf4 100644 --- a/src/array/iterator/distributed_iterator.rs +++ b/src/array/iterator/distributed_iterator.rs @@ -39,8 +39,8 @@ pub(crate) use consumer::*; use crate::array::iterator::one_sided_iterator::OneSidedIterator; use crate::array::iterator::{private::*, IterRequest, Schedule}; use crate::array::{ - operations::ArrayOps, AtomicArray, Distribution, GenericAtomicArray, LamellarArray, - LamellarArrayPut, NativeAtomicArray, TeamFrom, UnsafeArray, + operations::ArrayOps, AsyncTeamFrom, AtomicArray, Distribution, GenericAtomicArray, + LamellarArray, LamellarArrayPut, NativeAtomicArray, TeamFrom, UnsafeArray, }; use crate::lamellar_request::LamellarRequest; use crate::memregion::Dist; @@ -55,10 +55,10 @@ use std::marker::PhantomData; use std::pin::Pin; use std::sync::Arc; -#[doc(hidden)] -pub struct DistIterForEachHandle { - pub(crate) reqs: Vec>>, -} +// #[doc(hidden)] +// pub struct DistIterForEachHandle { +// pub(crate) reqs: Vec>>, +// } // impl Drop for DistIterForEachHandle { // fn drop(&mut self) { @@ -66,87 +66,87 @@ pub struct DistIterForEachHandle { // } // } -#[doc(hidden)] -#[async_trait] -impl IterRequest for DistIterForEachHandle { - type Output = (); - async fn into_future(mut self: Box) -> Self::Output { - for req in self.reqs.drain(..) { - req.into_future().await; - } - } - fn wait(mut self: Box) -> Self::Output { - for req in self.reqs.drain(..) { - req.get(); - } - } -} - -#[doc(hidden)] -pub struct DistIterCollectHandle> + SyncSend> { - pub(crate) reqs: Vec>>>, - pub(crate) distribution: Distribution, - pub(crate) team: Pin>, - pub(crate) _phantom: PhantomData, -} +// #[doc(hidden)] +// #[async_trait] +// impl IterRequest for DistIterForEachHandle { +// type Output = (); +// async fn into_future(mut self: Box) -> Self::Output { +// for req in self.reqs.drain(..) { +// req.into_future().await; +// } +// } +// fn wait(mut self: Box) -> Self::Output { +// for req in self.reqs.drain(..) { +// req.get(); +// } +// } +// } -impl> + SyncSend> DistIterCollectHandle { - fn create_array(&self, local_vals: &Vec) -> A { - self.team.tasking_barrier(); - let local_sizes = - UnsafeArray::::new(self.team.clone(), self.team.num_pes, Distribution::Block); - unsafe { - local_sizes.local_as_mut_slice()[0] = local_vals.len(); - } - local_sizes.barrier(); - // local_sizes.print(); - let mut size = 0; - let mut my_start = 0; - let my_pe = self.team.team_pe.expect("pe not part of team"); - // local_sizes.print(); - unsafe { - local_sizes - .onesided_iter() - .into_iter() - .enumerate() - .for_each(|(i, local_size)| { - size += local_size; - if i < my_pe { - my_start += local_size; - } - }); - } - // println!("my_start {} size {}", my_start, size); - let array = UnsafeArray::::new(self.team.clone(), size, self.distribution); //implcit barrier +// #[doc(hidden)] +// pub struct DistIterCollectHandle> + SyncSend> { +// pub(crate) reqs: Vec>>>, +// pub(crate) distribution: Distribution, +// pub(crate) team: Pin>, +// pub(crate) _phantom: PhantomData, +// } - // safe because only a single reference to array on each PE - // we calculate my_start so that each pes local vals are guaranteed to not overwrite another pes values. 
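// The offset arithmetic behind that guarantee, in isolation: the total length
// is the sum of every PE's local contribution, and a PE's starting index is
// the sum of the contributions of all lower-ranked PEs (an exclusive prefix
// sum). `local_sizes` here is a plain slice standing in for the one-sided
// iteration over the per-PE length array above.
fn my_start_and_total(local_sizes: &[usize], my_pe: usize) -> (usize, usize) {
    let total: usize = local_sizes.iter().sum();
    let my_start: usize = local_sizes[..my_pe].iter().sum();
    (my_start, total)
}
// e.g. with local lengths [3, 5, 2] and my_pe = 2, this PE writes its 2 values
// starting at index 8 of a 10-element result array.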
- let _ = unsafe { array.put(my_start, local_vals) }; - array.into() - } -} -#[async_trait] -impl> + SyncSend> IterRequest - for DistIterCollectHandle -{ - type Output = A; - async fn into_future(mut self: Box) -> Self::Output { - let mut local_vals = vec![]; - for req in self.reqs.drain(0..) { - let v = req.into_future().await; - local_vals.extend(v); - } - self.create_array(&local_vals) - } - fn wait(mut self: Box) -> Self::Output { - let mut local_vals = vec![]; - for req in self.reqs.drain(0..) { - let v = req.get(); - local_vals.extend(v); - } - self.create_array(&local_vals) - } -} +// impl> + SyncSend> DistIterCollectHandle { +// fn create_array(&self, local_vals: &Vec) -> A { +// self.team.tasking_barrier(); +// let local_sizes = +// UnsafeArray::::new(self.team.clone(), self.team.num_pes, Distribution::Block); +// unsafe { +// local_sizes.local_as_mut_slice()[0] = local_vals.len(); +// } +// local_sizes.barrier(); +// // local_sizes.print(); +// let mut size = 0; +// let mut my_start = 0; +// let my_pe = self.team.team_pe.expect("pe not part of team"); +// // local_sizes.print(); +// unsafe { +// local_sizes +// .onesided_iter() +// .into_iter() +// .enumerate() +// .for_each(|(i, local_size)| { +// size += local_size; +// if i < my_pe { +// my_start += local_size; +// } +// }); +// } +// // println!("my_start {} size {}", my_start, size); +// let array = UnsafeArray::::new(self.team.clone(), size, self.distribution); //implcit barrier + +// // safe because only a single reference to array on each PE +// // we calculate my_start so that each pes local vals are guaranteed to not overwrite another pes values. +// unsafe { array.put(my_start, local_vals) }; +// array.into() +// } +// } +// #[async_trait] +// impl> + SyncSend> IterRequest +// for DistIterCollectHandle +// { +// type Output = A; +// async fn into_future(mut self: Box) -> Self::Output { +// let mut local_vals = vec![]; +// for req in self.reqs.drain(0..) { +// let v = req.into_future().await; +// local_vals.extend(v); +// } +// self.create_array(&local_vals) +// } +// fn wait(mut self: Box) -> Self::Output { +// let mut local_vals = vec![]; +// for req in self.reqs.drain(0..) 
{ +// let v = req.get(); +// local_vals.extend(v); +// } +// self.create_array(&local_vals) +// } +// } #[doc(hidden)] #[enum_dispatch] @@ -212,7 +212,7 @@ pub trait DistIteratorLauncher { where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static; + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static; fn collect_with_schedule( &self, @@ -223,7 +223,7 @@ pub trait DistIteratorLauncher { where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static; + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static; fn collect_async( &self, @@ -234,7 +234,7 @@ pub trait DistIteratorLauncher { I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static; + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static; fn collect_async_with_schedule( &self, @@ -246,7 +246,7 @@ pub trait DistIteratorLauncher { I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static; + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static; fn count(&self, iter: &I) -> Pin + Send>> where @@ -670,7 +670,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { where // &'static Self: DistributedIterator + 'static, Self::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { self.array().collect(self, d) } @@ -716,7 +716,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { // &'static Self: DistributedIterator + 'static, T: Dist + ArrayOps, Self::Item: Future + Send + 'static, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { self.array().collect_async(self, d) } diff --git a/src/array/iterator/distributed_iterator/consumer/collect.rs b/src/array/iterator/distributed_iterator/consumer/collect.rs index d16e1826..b38d6e15 100644 --- a/src/array/iterator/distributed_iterator/consumer/collect.rs +++ b/src/array/iterator/distributed_iterator/consumer/collect.rs @@ -3,7 +3,7 @@ use crate::array::iterator::consumer::*; use crate::array::iterator::distributed_iterator::{DistributedIterator, Monotonic}; use crate::array::iterator::{private::*, IterRequest}; use crate::array::operations::ArrayOps; -use crate::array::{Distribution, TeamFrom, TeamInto}; +use crate::array::{AsyncTeamFrom, AsyncTeamInto, Distribution, TeamFrom, TeamInto}; use crate::lamellar_request::LamellarRequest; use crate::lamellar_team::LamellarTeamRT; use crate::memregion::Dist; @@ -35,7 +35,7 @@ impl IterConsumer for Collect where I: DistributedIterator, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { type AmOutput = Vec<(usize, I::Item)>; type Output = A; @@ -95,7 +95,7 @@ where I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { type AmOutput = Vec<(usize, B)>; type Output = A; @@ -138,7 +138,7 @@ where I: DistributedIterator + Clone, 
I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { fn clone(&self) -> Self { CollectAsync { @@ -152,7 +152,7 @@ where #[doc(hidden)] pub struct DistIterCollectHandle< T: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend, > { pub(crate) reqs: Vec>>>, pub(crate) distribution: Distribution, @@ -160,16 +160,23 @@ pub struct DistIterCollectHandle< pub(crate) _phantom: PhantomData, } -impl TeamFrom<(&'a Vec, Distribution)> + SyncSend> +impl, Distribution)> + SyncSend> DistIterCollectHandle { - fn create_array(&self, local_vals: &Vec) -> A { + async fn async_create_array(&self, local_vals: Vec) -> A { let input = (local_vals, self.distribution); - input.team_into(&self.team) + let array: A = AsyncTeamInto::team_into(input, &self.team).await; + array + } + + fn create_array(&self, local_vals: Vec) -> A { + let input = (local_vals, self.distribution); + let array: A = TeamInto::team_into(input, &self.team); + array } } #[async_trait] -impl TeamFrom<(&'a Vec, Distribution)> + SyncSend> IterRequest +impl, Distribution)> + SyncSend> IterRequest for DistIterCollectHandle { type Output = A; @@ -181,7 +188,7 @@ impl TeamFrom<(&'a Vec, Distribution)> + SyncS } temp_vals.sort_by(|a, b| a.0.cmp(&b.0)); let local_vals = temp_vals.into_iter().map(|v| v.1).collect::>(); - self.create_array(&local_vals) + self.async_create_array(local_vals).await } fn wait(mut self: Box) -> Self::Output { // let mut num_local_vals = 0; @@ -192,7 +199,7 @@ impl TeamFrom<(&'a Vec, Distribution)> + SyncS } temp_vals.sort_by(|a, b| a.0.cmp(&b.0)); let local_vals = temp_vals.into_iter().map(|v| v.1).collect(); - self.create_array(&local_vals) + self.create_array(local_vals) } } @@ -207,7 +214,7 @@ impl LamellarAm for CollectAm where I: DistributedIterator, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { async fn exec(&self) -> Vec { let iter = self.schedule.init_iter(self.iter.iter_clone(Sealed)); @@ -221,7 +228,7 @@ where I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { pub(crate) iter: CollectAsync, pub(crate) schedule: IterSchedule, @@ -233,7 +240,7 @@ where I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { async fn exec(&self) -> Vec<(usize, B)> { let mut iter = self.schedule.init_iter(self.iter.iter_clone(Sealed)); diff --git a/src/array/iterator/local_iterator.rs b/src/array/iterator/local_iterator.rs index 47762323..df51405e 100644 --- a/src/array/iterator/local_iterator.rs +++ b/src/array/iterator/local_iterator.rs @@ -35,7 +35,7 @@ use zip::*; pub(crate) use consumer::*; use crate::array::iterator::{private::*, Schedule}; -use crate::array::{operations::ArrayOps, AtomicArray, Distribution, LamellarArray, TeamFrom}; +use crate::array::{operations::ArrayOps, AsyncTeamFrom, AtomicArray, Distribution, LamellarArray}; use crate::memregion::Dist; use crate::LamellarTeamRT; @@ -125,7 +125,7 @@ pub trait LocalIteratorLauncher { where 
I: LocalIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static; + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static; fn collect_with_schedule( &self, @@ -136,7 +136,7 @@ pub trait LocalIteratorLauncher { where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static; + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static; // fn collect_async( // &self, @@ -579,7 +579,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { where // &'static Self: LocalIterator + 'static, Self::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { self.array().collect(self, d) } @@ -607,7 +607,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { where // &'static Self: LocalIterator + 'static, Self::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { self.array().collect_with_schedule(sched, self, d) } diff --git a/src/array/iterator/local_iterator/consumer/collect.rs b/src/array/iterator/local_iterator/consumer/collect.rs index 1484b69b..4dea5332 100644 --- a/src/array/iterator/local_iterator/consumer/collect.rs +++ b/src/array/iterator/local_iterator/consumer/collect.rs @@ -3,7 +3,7 @@ use crate::array::iterator::consumer::*; use crate::array::iterator::local_iterator::{LocalIterator, Monotonic}; use crate::array::iterator::{private::*, IterRequest}; use crate::array::operations::ArrayOps; -use crate::array::{Distribution, TeamFrom, TeamInto}; +use crate::array::{AsyncTeamFrom, AsyncTeamInto, Distribution, TeamFrom, TeamInto}; use crate::lamellar_request::LamellarRequest; use crate::lamellar_team::LamellarTeamRT; use crate::memregion::Dist; @@ -34,7 +34,7 @@ impl IterConsumer for Collect where I: LocalIterator, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { type AmOutput = Vec<(usize, I::Item)>; type Output = A; @@ -124,7 +124,7 @@ where #[doc(hidden)] pub struct LocalIterCollectHandle< T: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend, > { pub(crate) reqs: Vec>>>, pub(crate) distribution: Distribution, @@ -132,16 +132,20 @@ pub struct LocalIterCollectHandle< pub(crate) _phantom: PhantomData, } -impl TeamFrom<(&'a Vec, Distribution)> + SyncSend> +impl, Distribution)> + SyncSend> LocalIterCollectHandle { - fn create_array(&self, local_vals: &Vec) -> A { + async fn async_create_array(&self, local_vals: Vec) -> A { let input = (local_vals, self.distribution); - input.team_into(&self.team) + AsyncTeamInto::team_into(input, &self.team).await + } + fn create_array(&self, local_vals: Vec) -> A { + let input = (local_vals, self.distribution); + TeamInto::team_into(input, &self.team) } } #[async_trait] -impl TeamFrom<(&'a Vec, Distribution)> + SyncSend> IterRequest +impl, Distribution)> + SyncSend> IterRequest for LocalIterCollectHandle { type Output = A; @@ -153,7 +157,7 @@ impl TeamFrom<(&'a Vec, Distribution)> + SyncS } temp_vals.sort_by(|a, b| a.0.cmp(&b.0)); let local_vals = temp_vals.into_iter().map(|v| v.1).collect(); - self.create_array(&local_vals) + 
self.async_create_array(local_vals).await } fn wait(mut self: Box) -> Self::Output { // let mut num_local_vals = 0; @@ -164,7 +168,7 @@ impl TeamFrom<(&'a Vec, Distribution)> + SyncS } temp_vals.sort_by(|a, b| a.0.cmp(&b.0)); let local_vals = temp_vals.into_iter().map(|v| v.1).collect(); - self.create_array(&local_vals) + self.create_array(local_vals) } } @@ -188,7 +192,7 @@ impl LamellarAm for CollectAm where I: LocalIterator, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { async fn exec(&self) -> Vec { let iter = self.schedule.init_iter(self.iter.iter_clone(Sealed)); diff --git a/src/array/local_lock_atomic.rs b/src/array/local_lock_atomic.rs index d52141ed..f08cc308 100644 --- a/src/array/local_lock_atomic.rs +++ b/src/array/local_lock_atomic.rs @@ -579,11 +579,19 @@ impl TeamFrom<(Vec, Distribution)> for LocalLockArray fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { let (vals, distribution) = input; let input = (&vals, distribution); - let array: UnsafeArray = input.team_into(team); + let array: UnsafeArray = TeamInto::team_into(input, team); array.into() } } +#[async_trait] +impl AsyncTeamFrom<(Vec, Distribution)> for LocalLockArray { + async fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { + let array: UnsafeArray = AsyncTeamInto::team_into(input, team).await; + array.async_into().await + } +} + impl From> for LocalLockArray { fn from(array: UnsafeArray) -> Self { // println!("locallock from unsafe"); @@ -597,6 +605,20 @@ impl From> for LocalLockArray { } } +#[async_trait] +impl AsyncFrom> for LocalLockArray { + async fn async_from(array: UnsafeArray) -> Self { + // println!("locallock from unsafe"); + array.await_on_outstanding(DarcMode::LocalLockArray).await; + let lock = LocalRwDarc::new(array.team_rt(), ()).unwrap(); + + LocalLockArray { + lock: lock, + array: array, + } + } +} + // impl From> for LocalLockArray { // fn from(array: LocalOnlyArray) -> Self { // // println!("locallock from localonly"); diff --git a/src/array/local_lock_atomic/iteration.rs b/src/array/local_lock_atomic/iteration.rs index 983c269e..ae792da6 100644 --- a/src/array/local_lock_atomic/iteration.rs +++ b/src/array/local_lock_atomic/iteration.rs @@ -505,7 +505,7 @@ impl DistIteratorLauncher for LocalLockArray { where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect(&self.array, iter, d) } @@ -519,7 +519,7 @@ impl DistIteratorLauncher for LocalLockArray { where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) } @@ -532,7 +532,7 @@ impl DistIteratorLauncher for LocalLockArray { I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect_async(&self.array, iter, d) } @@ -547,7 +547,7 @@ impl DistIteratorLauncher for LocalLockArray { I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, 
Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect_async_with_schedule(&self.array, sched, iter, d) } @@ -693,7 +693,7 @@ impl LocalIteratorLauncher for LocalLockArray { where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { LocalIteratorLauncher::collect(&self.array, iter, d) } @@ -707,7 +707,7 @@ impl LocalIteratorLauncher for LocalLockArray { where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { LocalIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) } diff --git a/src/array/local_only.rs b/src/array/local_only.rs index 5b931ad2..258d7856 100644 --- a/src/array/local_only.rs +++ b/src/array/local_only.rs @@ -99,6 +99,17 @@ impl From> for LocalOnlyArray { } } +#[async_trait] +impl AsyncFrom> for LocalOnlyArray { + async fn async_from(array: UnsafeArray) -> Self { + array.await_on_outstanding(DarcMode::LocalOnlyArray).await; + LocalOnlyArray { + array: array, + _unsync: PhantomData, + } + } +} + impl From> for LocalOnlyArray { fn from(array: ReadOnlyArray) -> Self { unsafe { array.into_inner().into() } diff --git a/src/array/native_atomic.rs b/src/array/native_atomic.rs index 590f9b48..9fc0e785 100644 --- a/src/array/native_atomic.rs +++ b/src/array/native_atomic.rs @@ -995,11 +995,19 @@ impl TeamFrom<(Vec, Distribution)> for NativeAtomicArray< fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { let (vals, distribution) = input; let input = (&vals, distribution); - let array: UnsafeArray = input.team_into(team); + let array: UnsafeArray = TeamInto::team_into(input, team); array.into() } } +#[async_trait] +impl AsyncTeamFrom<(Vec, Distribution)> for NativeAtomicArray { + async fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { + let array: UnsafeArray = AsyncTeamInto::team_into(input, team).await; + array.async_into().await + } +} + #[doc(hidden)] impl From> for NativeAtomicArray { fn from(array: UnsafeArray) -> Self { @@ -1013,6 +1021,22 @@ impl From> for NativeAtomicArray { } } +#[doc(hidden)] +#[async_trait] +impl AsyncFrom> for NativeAtomicArray { + async fn async_from(array: UnsafeArray) -> Self { + // println!("native from unsafe"); + array + .await_on_outstanding(DarcMode::NativeAtomicArray) + .await; + + NativeAtomicArray { + array: array, + orig_t: NativeAtomicType::from::(), + } + } +} + #[doc(hidden)] impl From> for NativeAtomicByteArray { fn from(array: NativeAtomicArray) -> Self { diff --git a/src/array/native_atomic/iteration.rs b/src/array/native_atomic/iteration.rs index 7c9fb394..e1231bde 100644 --- a/src/array/native_atomic/iteration.rs +++ b/src/array/native_atomic/iteration.rs @@ -295,7 +295,7 @@ impl DistIteratorLauncher for NativeAtomicArray { where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect(&self.array, iter, d) } @@ -309,7 +309,7 @@ impl DistIteratorLauncher for NativeAtomicArray { where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: 
AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) } @@ -322,7 +322,7 @@ impl DistIteratorLauncher for NativeAtomicArray { I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect_async(&self.array, iter, d) } @@ -337,7 +337,7 @@ impl DistIteratorLauncher for NativeAtomicArray { I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect_async_with_schedule(&self.array, sched, iter, d) } @@ -483,7 +483,7 @@ impl LocalIteratorLauncher for NativeAtomicArray { where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { LocalIteratorLauncher::collect(&self.array, iter, d) } @@ -497,7 +497,7 @@ impl LocalIteratorLauncher for NativeAtomicArray { where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { LocalIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) } diff --git a/src/array/read_only.rs b/src/array/read_only.rs index f11eec7b..51002516 100644 --- a/src/array/read_only.rs +++ b/src/array/read_only.rs @@ -386,14 +386,22 @@ impl TeamFrom<(Vec, Distribution)> for ReadOnlyArray { fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { let (vals, distribution) = input; let input = (&vals, distribution); - let array: UnsafeArray = input.team_into(team); + let array: UnsafeArray = TeamInto::team_into(input, team); array.into() } } +#[async_trait] +impl AsyncTeamFrom<(Vec, Distribution)> for ReadOnlyArray { + async fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { + let array: UnsafeArray = AsyncTeamInto::team_into(input, team).await; + array.async_into().await + } +} + impl TeamFrom<(&Vec, Distribution)> for ReadOnlyArray { fn team_from(input: (&Vec, Distribution), team: &Pin>) -> Self { - let array: UnsafeArray = input.team_into(team); + let array: UnsafeArray = TeamInto::team_into(input, team); array.into() } } @@ -407,6 +415,16 @@ impl From> for ReadOnlyArray { } } +#[async_trait] +impl AsyncFrom> for ReadOnlyArray { + async fn async_from(array: UnsafeArray) -> Self { + // println!("readonly from UnsafeArray"); + array.await_on_outstanding(DarcMode::ReadOnlyArray).await; + + ReadOnlyArray { array: array } + } +} + // impl From> for ReadOnlyArray { // fn from(array: LocalOnlyArray) -> Self { // // println!("readonly from LocalOnlyArray"); diff --git a/src/array/read_only/iteration.rs b/src/array/read_only/iteration.rs index e6f68976..af59f35a 100644 --- a/src/array/read_only/iteration.rs +++ b/src/array/read_only/iteration.rs @@ -116,7 +116,7 @@ impl DistIteratorLauncher for ReadOnlyArray { where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect(&self.array, iter, d) } @@ -130,7 +130,7 @@ impl 
DistIteratorLauncher for ReadOnlyArray { where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) } @@ -143,7 +143,7 @@ impl DistIteratorLauncher for ReadOnlyArray { I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect_async(&self.array, iter, d) } @@ -158,7 +158,7 @@ impl DistIteratorLauncher for ReadOnlyArray { I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { DistIteratorLauncher::collect_async_with_schedule(&self.array, sched, iter, d) } @@ -304,7 +304,7 @@ impl LocalIteratorLauncher for ReadOnlyArray { where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { LocalIteratorLauncher::collect(&self.array, iter, d) } @@ -318,7 +318,7 @@ impl LocalIteratorLauncher for ReadOnlyArray { where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { LocalIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) } diff --git a/src/array/unsafe.rs b/src/array/unsafe.rs index e32a3d36..f690a9fa 100644 --- a/src/array/unsafe.rs +++ b/src/array/unsafe.rs @@ -203,6 +203,76 @@ impl UnsafeArray { // println!("after buffered ops"); // array.inner.data.print(); } + + async fn async_new>( + team: U, + array_size: usize, + distribution: Distribution, + ) -> UnsafeArray { + let team = team.into().team.clone(); + team.async_barrier().await; + let task_group = LamellarTaskGroup::new(team.clone()); + let my_pe = team.team_pe_id().unwrap(); + let num_pes = team.num_pes(); + let full_array_size = std::cmp::max(array_size, num_pes); + + let elem_per_pe = full_array_size as f64 / num_pes as f64; + let per_pe_size = (full_array_size as f64 / num_pes as f64).ceil() as usize; //we do ceil to ensure enough space an each pe + // println!("new unsafe array {:?} {:?} {:?}", elem_per_pe, num_elems_local, per_pe_size); + let rmr = MemoryRegion::new( + per_pe_size * std::mem::size_of::(), + team.lamellae.clone(), + AllocationType::Global, + ); + unsafe { + for elem in rmr.as_mut_slice().expect("data should exist on pe") { + *elem = 0; + } + } + + let data = Darc::try_new_with_drop( + team.clone(), + UnsafeArrayData { + mem_region: rmr, + array_counters: Arc::new(AMCounters::new()), + team: team.clone(), + task_group: Arc::new(task_group), + my_pe: my_pe, + num_pes: num_pes, + req_cnt: Arc::new(AtomicUsize::new(0)), + }, + crate::darc::DarcMode::UnsafeArray, + None, + ) + .expect("trying to create array on non team member"); + let array = UnsafeArray { + inner: UnsafeArrayInner { + data: data, + distribution: distribution.clone(), + // wait: wait, + orig_elem_per_pe: elem_per_pe, + elem_size: std::mem::size_of::(), + offset: 0, //relative to size of T + size: full_array_size, //relative to size of T + }, + phantom: 
PhantomData, + }; + // println!("new unsafe"); + // unsafe {println!("size {:?} bytes {:?}",array.inner.size, array.inner.data.mem_region.as_mut_slice().unwrap().len())}; + // println!("elem per pe {:?}", elem_per_pe); + // for i in 0..num_pes{ + // println!("pe: {:?} {:?}",i,array.inner.num_elems_pe(i)); + // } + // array.inner.data.print(); + if full_array_size != array_size { + println!("WARNING: Array size {array_size} is less than number of pes {full_array_size}, each PE will not contain data"); + array.sub_array(0..array_size) + } else { + array + } + // println!("after buffered ops"); + // array.inner.data.print(); + } } impl UnsafeArray { #[doc(alias("One-sided", "onesided"))] @@ -364,6 +434,47 @@ impl UnsafeArray { self.inner.data.team.clone() } + pub(crate) async fn await_all(&self) { + let mut temp_now = Instant::now(); + // let mut first = true; + while self + .inner + .data + .array_counters + .outstanding_reqs + .load(Ordering::SeqCst) + > 0 + || self.inner.data.req_cnt.load(Ordering::SeqCst) > 0 + { + // std::thread::yield_now(); + // self.inner.data.team.flush(); + // self.inner.data.team.scheduler.exec_task(); //mmight as well do useful work while we wait + async_std::task::yield_now().await; + if temp_now.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { + //|| first{ + println!( + "in array await_all mype: {:?} cnt: {:?} {:?} {:?}", + self.inner.data.team.world_pe, + self.inner + .data + .array_counters + .send_req_cnt + .load(Ordering::SeqCst), + self.inner + .data + .array_counters + .outstanding_reqs + .load(Ordering::SeqCst), + self.inner.data.req_cnt.load(Ordering::SeqCst) + ); + temp_now = Instant::now(); + // first = false; + } + } + self.inner.data.task_group.await_all().await; + // println!("done in wait all {:?}",std::time::SystemTime::now()); + } + pub(crate) fn block_on_outstanding(&self, mode: DarcMode) { self.wait_all(); // println!("block on outstanding"); @@ -374,6 +485,15 @@ impl UnsafeArray { .block_on(array_darc.block_on_outstanding(mode, 1)); //one for this instance of the array } + pub(crate) async fn await_on_outstanding(&self, mode: DarcMode) { + self.await_all().await; + // println!("block on outstanding"); + // self.inner.data.print(); + // let the_array: UnsafeArray = self.clone(); + let array_darc = self.inner.data.clone(); + array_darc.block_on_outstanding(mode, 1).await; + } + #[doc(alias = "Collective")] /// Convert this UnsafeArray into a (safe) [ReadOnlyArray][crate::array::ReadOnlyArray] /// @@ -573,7 +693,44 @@ impl TeamFrom<(Vec, Distribution)> for UnsafeArray { fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { let (vals, distribution) = input; let input = (&vals, distribution); - input.team_into(team) + TeamInto::team_into(input, team) + } +} + +#[async_trait] +impl AsyncTeamFrom<(Vec, Distribution)> for UnsafeArray { + async fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { + let (local_vals, distribution) = input; + // println!("local_vals len: {:?}", local_vals.len()); + team.async_barrier().await; + let local_sizes = + UnsafeArray::::async_new(team.clone(), team.num_pes, Distribution::Block).await; + unsafe { + local_sizes.local_as_mut_slice()[0] = local_vals.len(); + } + team.async_barrier().await; + // local_sizes.barrier(); + let mut size = 0; + let mut my_start = 0; + let my_pe = team.team_pe.expect("pe not part of team"); + unsafe { + local_sizes + .buffered_onesided_iter(team.num_pes) + .into_iter() + .enumerate() + .for_each(|(i, local_size)| { + size += local_size; + if i < my_pe { + my_start 
+= local_size; + } + }); + } + let array = UnsafeArray::::async_new(team.clone(), size, distribution).await; + if local_vals.len() > 0 { + unsafe { array.put(my_start, local_vals).await }; + } + team.async_barrier().await; + array } } @@ -614,8 +771,6 @@ impl TeamFrom<(&Vec, Distribution)> for UnsafeArray { impl From> for UnsafeArray { fn from(array: AtomicArray) -> Self { - // println!("unsafe from atomic"); - // array.into_unsafe() match array { AtomicArray::NativeAtomicArray(array) => UnsafeArray::::from(array), AtomicArray::GenericAtomicArray(array) => UnsafeArray::::from(array), @@ -625,8 +780,6 @@ impl From> for UnsafeArray { impl From> for UnsafeArray { fn from(array: NativeAtomicArray) -> Self { - // println!("unsafe from native atomic"); - // let array = array.into_data(); array.array.block_on_outstanding(DarcMode::UnsafeArray); array.array } @@ -634,8 +787,6 @@ impl From> for UnsafeArray { impl From> for UnsafeArray { fn from(array: GenericAtomicArray) -> Self { - // println!("unsafe from generic atomic"); - // let array = array.into_data(); array.array.block_on_outstanding(DarcMode::UnsafeArray); array.array } @@ -643,7 +794,6 @@ impl From> for UnsafeArray { impl From> for UnsafeArray { fn from(array: LocalLockArray) -> Self { - // println!("unsafe from local lock atomic"); array.array.block_on_outstanding(DarcMode::UnsafeArray); array.array } @@ -651,7 +801,6 @@ impl From> for UnsafeArray { impl From> for UnsafeArray { fn from(array: GlobalLockArray) -> Self { - // println!("unsafe from global lock atomic"); array.array.block_on_outstanding(DarcMode::UnsafeArray); array.array } @@ -659,7 +808,6 @@ impl From> for UnsafeArray { impl From> for UnsafeArray { fn from(array: ReadOnlyArray) -> Self { - // println!("unsafe from read only"); array.array.block_on_outstanding(DarcMode::UnsafeArray); array.array } diff --git a/src/array/unsafe/iteration/distributed.rs b/src/array/unsafe/iteration/distributed.rs index fc351d04..749071bd 100644 --- a/src/array/unsafe/iteration/distributed.rs +++ b/src/array/unsafe/iteration/distributed.rs @@ -2,7 +2,7 @@ use crate::active_messaging::SyncSend; use crate::array::iterator::distributed_iterator::*; use crate::array::iterator::private::*; use crate::array::r#unsafe::UnsafeArray; -use crate::array::{ArrayOps, Distribution, LamellarArray, TeamFrom}; +use crate::array::{ArrayOps, AsyncTeamFrom, AsyncTeamInto, Distribution, LamellarArray, TeamFrom}; use crate::array::iterator::Schedule; use crate::lamellar_team::LamellarTeamRT; @@ -142,7 +142,7 @@ impl DistIteratorLauncher for UnsafeArray { where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { self.collect_with_schedule(Schedule::Static, iter, d) } @@ -156,7 +156,7 @@ impl DistIteratorLauncher for UnsafeArray { where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { let collect = Collect { iter: iter.iter_clone(Sealed).monotonic(), @@ -181,7 +181,7 @@ impl DistIteratorLauncher for UnsafeArray { I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { self.collect_async_with_schedule(Schedule::Static, iter, d) } @@ 
-196,7 +196,7 @@ impl DistIteratorLauncher for UnsafeArray { I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { let collect = CollectAsync { iter: iter.iter_clone(Sealed).monotonic(), diff --git a/src/array/unsafe/iteration/local.rs b/src/array/unsafe/iteration/local.rs index 267c7901..bc56dd3d 100644 --- a/src/array/unsafe/iteration/local.rs +++ b/src/array/unsafe/iteration/local.rs @@ -2,7 +2,7 @@ use crate::active_messaging::SyncSend; use crate::array::iterator::local_iterator::*; use crate::array::iterator::private::*; use crate::array::r#unsafe::UnsafeArray; -use crate::array::{ArrayOps, Distribution, TeamFrom}; +use crate::array::{ArrayOps, AsyncTeamFrom, Distribution}; use crate::array::iterator::Schedule; use crate::lamellar_team::LamellarTeamRT; @@ -163,7 +163,7 @@ impl LocalIteratorLauncher for UnsafeArray { where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { self.collect_with_schedule(Schedule::Static, iter, d) } @@ -177,7 +177,7 @@ impl LocalIteratorLauncher for UnsafeArray { where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, - A: for<'a> TeamFrom<(&'a Vec, Distribution)> + SyncSend + Clone + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { let collect = Collect { iter: iter.iter_clone(Sealed).monotonic(), diff --git a/src/lamellar_task_group.rs b/src/lamellar_task_group.rs index 3df11061..dd6bf6f3 100644 --- a/src/lamellar_task_group.rs +++ b/src/lamellar_task_group.rs @@ -557,6 +557,27 @@ impl LamellarTaskGroup { } } + pub(crate) async fn await_all(&self) { + let mut temp_now = Instant::now(); + while self.counters.outstanding_reqs.load(Ordering::SeqCst) > 0 { + // self.team.flush(); + // self.team.scheduler.exec_task(); + async_std::task::yield_now().await; + if temp_now.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { + println!( + "in task group wait_all mype: {:?} cnt: {:?} {:?}", + self.team.world_pe, + self.team.team_counters.send_req_cnt.load(Ordering::SeqCst), + self.team + .team_counters + .outstanding_reqs + .load(Ordering::SeqCst), + ); + temp_now = Instant::now(); + } + } + } + pub(crate) fn exec_am_all_inner( &self, am: F, diff --git a/src/scheduler.rs b/src/scheduler.rs index 97d85179..0ba82e24 100755 --- a/src/scheduler.rs +++ b/src/scheduler.rs @@ -14,9 +14,9 @@ pub(crate) mod work_stealing; use work_stealing::WorkStealing; #[cfg(feature = "tokio-executor")] -pub(crate) mod tokio; +pub(crate) mod tokio_executor; #[cfg(feature = "tokio-executor")] -use tokio::TokioRt; +use tokio_executor::TokioRt; // ACTIVE ENUM // since atomic enums would be another dependecy @@ -223,6 +223,12 @@ impl Scheduler { } pub(crate) fn block_on(&self, task: F) -> F::Output { + if std::thread::current().id() != *crate::MAIN_THREAD { + println!( + "trying to call block on within a worker thread {:?}", + std::backtrace::Backtrace::capture() + ) + } self.executor.block_on(task) } diff --git a/src/scheduler/tokio.rs b/src/scheduler/tokio.rs deleted file mode 100644 index f9e14ac1..00000000 --- a/src/scheduler/tokio.rs +++ /dev/null @@ -1,88 +0,0 @@ -use crate::scheduler::{LamellarExecutor, SchedulerStatus}; - -use tokio::runtime::Runtime; - -use tracing::*; - -use async_task::{Builder, Runnable}; -use 
core_affinity::CoreId; -use crossbeam::deque::Worker; -use futures::Future; -use futures_lite::FutureExt; -use rand::prelude::*; -use std::panic; -use std::process; -use std::sync::atomic::{AtomicU8, AtomicUsize, Ordering}; -use std::sync::Arc; //, Weak}; -use std::thread; - -static TASK_ID: AtomicUsize = AtomicUsize::new(0); - -#[derive(Debug)] -pub(crate) struct TokioRt { - max_num_threads: usize, - rt: Runtime, -} - -impl LamellarExecutor for TokioRt { - fn submit_task(&self, task: F) - where - F: Future + Send + 'static, - F::Output: Send, - { - trace_span!("submit_task").in_scope(|| { - self.rt.spawn(async move { task.await }); - }); - } - - fn submit_immediate_task(&self, task: F) - where - F: Future + Send + 'static, - F::Output: Send, - { - trace_span!("submit_task").in_scope(|| { - self.rt.spawn(async move { task.await }); - }); - } - - fn block_on(&self, task: F) -> F::Output { - trace_span!("block_on").in_scope(|| self.rt.block_on(task)) - } - - #[tracing::instrument(skip_all)] - fn shutdown(&self) { - // i think we just let tokio do this on drop - } - - #[tracing::instrument(skip_all)] - fn force_shutdown(&self) { - // i think we just let tokio do this on drop - } - - #[tracing::instrument(skip_all)] - fn exec_task(&self) { - // I dont think tokio has a way to do this - } - - fn set_max_workers(&mut self, num_workers: usize) { - self.max_num_threads = num_workers; - } - - fn num_workers(&self) -> usize { - self.max_num_threads - } -} - -impl TokioRt { - pub(crate) fn new(num_workers: usize) -> TokioRt { - // println!("New TokioRT with {} workers", num_workers); - TokioRt { - max_num_threads: num_workers + 1, //LAMELLAR_THREADS = num_workers + 1, so for tokio runtime, we actually want num_workers + 1 worker threads as block_on will not do anywork on the main thread (i think)... 
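
An aside for readers following the tokio.rs listing here: the essence of the Tokio backend is a thin wrapper around a multi-threaded tokio runtime, built with one extra worker thread because (as the comment in the listing suggests) block_on does not execute runtime work on the calling thread. Below is a trimmed-down, standalone sketch of that pattern; the Executor trait and the TokioBackend type are illustrative stand-ins, not the crate's actual LamellarExecutor API.

    use std::future::Future;
    use tokio::runtime::Runtime;

    // Minimal stand-in for the executor abstraction; illustrative only.
    trait Executor {
        fn submit<F>(&self, task: F)
        where
            F: Future + Send + 'static,
            F::Output: Send + 'static;
        fn block_on<F: Future>(&self, task: F) -> F::Output;
    }

    struct TokioBackend {
        rt: Runtime,
    }

    impl TokioBackend {
        fn new(num_workers: usize) -> Self {
            let rt = tokio::runtime::Builder::new_multi_thread()
                // one extra thread: block_on does no runtime work on the caller's thread
                .worker_threads(num_workers + 1)
                .enable_all()
                .build()
                .expect("failed to build tokio runtime");
            TokioBackend { rt }
        }
    }

    impl Executor for TokioBackend {
        fn submit<F>(&self, task: F)
        where
            F: Future + Send + 'static,
            F::Output: Send + 'static,
        {
            // Fire-and-forget: dropping the JoinHandle detaches the task.
            self.rt.spawn(task);
        }

        fn block_on<F: Future>(&self, task: F) -> F::Output {
            self.rt.block_on(task)
        }
    }

    fn main() {
        let exec = TokioBackend::new(4);
        exec.submit(async { println!("ran on a tokio worker thread") });
        assert_eq!(exec.block_on(async { 21 * 2 }), 42);
    }

The real trait in the diff additionally carries submit_immediate_task, shutdown/force_shutdown, exec_task, and the worker-count accessors (set_max_workers, num_workers).
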
- rt: tokio::runtime::Builder::new_multi_thread() - .worker_threads(num_workers + 1) - .enable_all() - .build() - .unwrap(), - } - } -} From 41016fe78c0ef38b8c48fe2ee38dcaaa0a166e47 Mon Sep 17 00:00:00 2001 From: "ryan.friese@pnnl.gov" Date: Thu, 8 Feb 2024 10:49:15 -0800 Subject: [PATCH 010/116] renaming tokio.rs -> tokio_executor.rs --- src/scheduler/tokio_executor.rs | 88 +++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 src/scheduler/tokio_executor.rs diff --git a/src/scheduler/tokio_executor.rs b/src/scheduler/tokio_executor.rs new file mode 100644 index 00000000..f9e14ac1 --- /dev/null +++ b/src/scheduler/tokio_executor.rs @@ -0,0 +1,88 @@ +use crate::scheduler::{LamellarExecutor, SchedulerStatus}; + +use tokio::runtime::Runtime; + +use tracing::*; + +use async_task::{Builder, Runnable}; +use core_affinity::CoreId; +use crossbeam::deque::Worker; +use futures::Future; +use futures_lite::FutureExt; +use rand::prelude::*; +use std::panic; +use std::process; +use std::sync::atomic::{AtomicU8, AtomicUsize, Ordering}; +use std::sync::Arc; //, Weak}; +use std::thread; + +static TASK_ID: AtomicUsize = AtomicUsize::new(0); + +#[derive(Debug)] +pub(crate) struct TokioRt { + max_num_threads: usize, + rt: Runtime, +} + +impl LamellarExecutor for TokioRt { + fn submit_task(&self, task: F) + where + F: Future + Send + 'static, + F::Output: Send, + { + trace_span!("submit_task").in_scope(|| { + self.rt.spawn(async move { task.await }); + }); + } + + fn submit_immediate_task(&self, task: F) + where + F: Future + Send + 'static, + F::Output: Send, + { + trace_span!("submit_task").in_scope(|| { + self.rt.spawn(async move { task.await }); + }); + } + + fn block_on(&self, task: F) -> F::Output { + trace_span!("block_on").in_scope(|| self.rt.block_on(task)) + } + + #[tracing::instrument(skip_all)] + fn shutdown(&self) { + // i think we just let tokio do this on drop + } + + #[tracing::instrument(skip_all)] + fn force_shutdown(&self) { + // i think we just let tokio do this on drop + } + + #[tracing::instrument(skip_all)] + fn exec_task(&self) { + // I dont think tokio has a way to do this + } + + fn set_max_workers(&mut self, num_workers: usize) { + self.max_num_threads = num_workers; + } + + fn num_workers(&self) -> usize { + self.max_num_threads + } +} + +impl TokioRt { + pub(crate) fn new(num_workers: usize) -> TokioRt { + // println!("New TokioRT with {} workers", num_workers); + TokioRt { + max_num_threads: num_workers + 1, //LAMELLAR_THREADS = num_workers + 1, so for tokio runtime, we actually want num_workers + 1 worker threads as block_on will not do anywork on the main thread (i think)... + rt: tokio::runtime::Builder::new_multi_thread() + .worker_threads(num_workers + 1) + .enable_all() + .build() + .unwrap(), + } + } +} From 3111d81bcd86c36cf2d0d0b337d5130a9f2e85f6 Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Fri, 26 Jan 2024 22:05:55 -0800 Subject: [PATCH 011/116] refactoring to suppport different executor backends + tokio backend --- src/array/operations.rs | 46 ++++++++++----------- src/scheduler/tokio.rs | 88 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 111 insertions(+), 23 deletions(-) create mode 100644 src/scheduler/tokio.rs diff --git a/src/array/operations.rs b/src/array/operations.rs index 104b37a8..a16f7bf1 100644 --- a/src/array/operations.rs +++ b/src/array/operations.rs @@ -244,29 +244,29 @@ pub enum OpInputEnum<'a, T: Dist> { // AtomicArray(AtomicArray), } -impl<'a, T: Dist> OpInputEnum<'_, T> { - //#[tracing::instrument(skip_all)] - // pub(crate) fn iter(&self) -> Box + '_> { - // match self { - // OpInputEnum::Val(v) => Box::new(std::iter::repeat(v).map(|elem| *elem)), - // OpInputEnum::Slice(s) => Box::new(s.iter().map(|elem| *elem)), - // OpInputEnum::Vec(v) => Box::new(v.iter().map(|elem| *elem)), - // OpInputEnum::NativeAtomicLocalData(a) => Box::new(a.iter().map(|elem| elem.load())), - // OpInputEnum::GenericAtomicLocalData(a) => Box::new(a.iter().map(|elem| elem.load())), - // OpInputEnum::LocalLockLocalData(a) => Box::new(a.iter().map(|elem| *elem)), - // OpInputEnum::GlobalLockLocalData(a) => Box::new(a.iter().map(|elem| *elem)), - // // OpInputEnum::MemoryRegion(mr) => Box::new( - // // unsafe { mr.as_slice() } - // // .expect("memregion not local") - // // .iter() - // // .map(|elem| *elem), - // // ), - // // OpInputEnum::UnsafeArray(a) => Box::new(unsafe{a.local_data()}.iter().map(|elem| *elem)), - // // OpInputEnum::ReadOnlyArray(a) => Box::new(a.local_data().iter().map(|elem| *elem)), - // // OpInputEnum::AtomicArray(a) => Box::new(a.local_data().iter().map(|elem| elem.load())), - // } - // } - //#[tracing::instrument(skip_all)] +impl<'a, T: Dist> OpInputEnum<'a, T> { + #[tracing::instrument(skip_all)] + pub(crate) fn iter(&self) -> Box + '_> { + match self { + OpInputEnum::Val(v) => Box::new(std::iter::repeat(v).map(|elem| *elem)), + OpInputEnum::Slice(s) => Box::new(s.iter().map(|elem| *elem)), + OpInputEnum::Vec(v) => Box::new(v.iter().map(|elem| *elem)), + OpInputEnum::NativeAtomicLocalData(a) => Box::new(a.iter().map(|elem| elem.load())), + OpInputEnum::GenericAtomicLocalData(a) => Box::new(a.iter().map(|elem| elem.load())), + OpInputEnum::LocalLockLocalData(a) => Box::new(a.iter().map(|elem| *elem)), + OpInputEnum::GlobalLockLocalData(a) => Box::new(a.iter().map(|elem| *elem)), + // OpInputEnum::MemoryRegion(mr) => Box::new( + // unsafe { mr.as_slice() } + // .expect("memregion not local") + // .iter() + // .map(|elem| *elem), + // ), + // OpInputEnum::UnsafeArray(a) => Box::new(unsafe{a.local_data()}.iter().map(|elem| *elem)), + // OpInputEnum::ReadOnlyArray(a) => Box::new(a.local_data().iter().map(|elem| *elem)), + // OpInputEnum::AtomicArray(a) => Box::new(a.local_data().iter().map(|elem| elem.load())), + } + } + #[tracing::instrument(skip_all)] pub(crate) fn len(&self) -> usize { match self { OpInputEnum::Val(_) => 1, diff --git a/src/scheduler/tokio.rs b/src/scheduler/tokio.rs new file mode 100644 index 00000000..f9e14ac1 --- /dev/null +++ b/src/scheduler/tokio.rs @@ -0,0 +1,88 @@ +use crate::scheduler::{LamellarExecutor, SchedulerStatus}; + +use tokio::runtime::Runtime; + +use tracing::*; + +use async_task::{Builder, Runnable}; +use core_affinity::CoreId; +use crossbeam::deque::Worker; +use futures::Future; +use futures_lite::FutureExt; +use rand::prelude::*; +use std::panic; +use std::process; +use 
std::sync::atomic::{AtomicU8, AtomicUsize, Ordering}; +use std::sync::Arc; //, Weak}; +use std::thread; + +static TASK_ID: AtomicUsize = AtomicUsize::new(0); + +#[derive(Debug)] +pub(crate) struct TokioRt { + max_num_threads: usize, + rt: Runtime, +} + +impl LamellarExecutor for TokioRt { + fn submit_task(&self, task: F) + where + F: Future + Send + 'static, + F::Output: Send, + { + trace_span!("submit_task").in_scope(|| { + self.rt.spawn(async move { task.await }); + }); + } + + fn submit_immediate_task(&self, task: F) + where + F: Future + Send + 'static, + F::Output: Send, + { + trace_span!("submit_task").in_scope(|| { + self.rt.spawn(async move { task.await }); + }); + } + + fn block_on(&self, task: F) -> F::Output { + trace_span!("block_on").in_scope(|| self.rt.block_on(task)) + } + + #[tracing::instrument(skip_all)] + fn shutdown(&self) { + // i think we just let tokio do this on drop + } + + #[tracing::instrument(skip_all)] + fn force_shutdown(&self) { + // i think we just let tokio do this on drop + } + + #[tracing::instrument(skip_all)] + fn exec_task(&self) { + // I dont think tokio has a way to do this + } + + fn set_max_workers(&mut self, num_workers: usize) { + self.max_num_threads = num_workers; + } + + fn num_workers(&self) -> usize { + self.max_num_threads + } +} + +impl TokioRt { + pub(crate) fn new(num_workers: usize) -> TokioRt { + // println!("New TokioRT with {} workers", num_workers); + TokioRt { + max_num_threads: num_workers + 1, //LAMELLAR_THREADS = num_workers + 1, so for tokio runtime, we actually want num_workers + 1 worker threads as block_on will not do anywork on the main thread (i think)... + rt: tokio::runtime::Builder::new_multi_thread() + .worker_threads(num_workers + 1) + .enable_all() + .build() + .unwrap(), + } + } +} From 93529d5e96b7d5a8e210f4146d179a18d287d177 Mon Sep 17 00:00:00 2001 From: "ryan.friese@pnnl.gov" Date: Thu, 8 Feb 2024 10:48:02 -0800 Subject: [PATCH 012/116] refactoring internal block_on calls to async calls --- src/array/unsafe.rs | 70 +++++++++++++++++++++++++++++++++ src/scheduler/tokio.rs | 88 ------------------------------------------ 2 files changed, 70 insertions(+), 88 deletions(-) delete mode 100644 src/scheduler/tokio.rs diff --git a/src/array/unsafe.rs b/src/array/unsafe.rs index f690a9fa..e510b9ff 100644 --- a/src/array/unsafe.rs +++ b/src/array/unsafe.rs @@ -273,6 +273,76 @@ impl UnsafeArray { // println!("after buffered ops"); // array.inner.data.print(); } + + async fn async_new>( + team: U, + array_size: usize, + distribution: Distribution, + ) -> UnsafeArray { + let team = team.into().team.clone(); + team.async_barrier().await; + let task_group = LamellarTaskGroup::new(team.clone()); + let my_pe = team.team_pe_id().unwrap(); + let num_pes = team.num_pes(); + let full_array_size = std::cmp::max(array_size, num_pes); + + let elem_per_pe = full_array_size as f64 / num_pes as f64; + let per_pe_size = (full_array_size as f64 / num_pes as f64).ceil() as usize; //we do ceil to ensure enough space an each pe + // println!("new unsafe array {:?} {:?} {:?}", elem_per_pe, num_elems_local, per_pe_size); + let rmr = MemoryRegion::new( + per_pe_size * std::mem::size_of::(), + team.lamellae.clone(), + AllocationType::Global, + ); + unsafe { + for elem in rmr.as_mut_slice().expect("data should exist on pe") { + *elem = 0; + } + } + + let data = Darc::try_new_with_drop( + team.clone(), + UnsafeArrayData { + mem_region: rmr, + array_counters: Arc::new(AMCounters::new()), + team: team.clone(), + task_group: Arc::new(task_group), + 
my_pe: my_pe, + num_pes: num_pes, + req_cnt: Arc::new(AtomicUsize::new(0)), + }, + crate::darc::DarcMode::UnsafeArray, + None, + ) + .expect("trying to create array on non team member"); + let array = UnsafeArray { + inner: UnsafeArrayInner { + data: data, + distribution: distribution.clone(), + // wait: wait, + orig_elem_per_pe: elem_per_pe, + elem_size: std::mem::size_of::(), + offset: 0, //relative to size of T + size: full_array_size, //relative to size of T + }, + phantom: PhantomData, + }; + // println!("new unsafe"); + // unsafe {println!("size {:?} bytes {:?}",array.inner.size, array.inner.data.mem_region.as_mut_slice().unwrap().len())}; + // println!("elem per pe {:?}", elem_per_pe); + // for i in 0..num_pes{ + // println!("pe: {:?} {:?}",i,array.inner.num_elems_pe(i)); + // } + // array.inner.data.print(); + if full_array_size != array_size { + println!("WARNING: Array size {array_size} is less than number of pes {full_array_size}, each PE will not contain data"); + array.sub_array(0..array_size) + } else { + array + } + // println!("after buffered ops"); + // array.inner.data.print(); + } } impl UnsafeArray { #[doc(alias("One-sided", "onesided"))] diff --git a/src/scheduler/tokio.rs b/src/scheduler/tokio.rs deleted file mode 100644 index f9e14ac1..00000000 --- a/src/scheduler/tokio.rs +++ /dev/null @@ -1,88 +0,0 @@ -use crate::scheduler::{LamellarExecutor, SchedulerStatus}; - -use tokio::runtime::Runtime; - -use tracing::*; - -use async_task::{Builder, Runnable}; -use core_affinity::CoreId; -use crossbeam::deque::Worker; -use futures::Future; -use futures_lite::FutureExt; -use rand::prelude::*; -use std::panic; -use std::process; -use std::sync::atomic::{AtomicU8, AtomicUsize, Ordering}; -use std::sync::Arc; //, Weak}; -use std::thread; - -static TASK_ID: AtomicUsize = AtomicUsize::new(0); - -#[derive(Debug)] -pub(crate) struct TokioRt { - max_num_threads: usize, - rt: Runtime, -} - -impl LamellarExecutor for TokioRt { - fn submit_task(&self, task: F) - where - F: Future + Send + 'static, - F::Output: Send, - { - trace_span!("submit_task").in_scope(|| { - self.rt.spawn(async move { task.await }); - }); - } - - fn submit_immediate_task(&self, task: F) - where - F: Future + Send + 'static, - F::Output: Send, - { - trace_span!("submit_task").in_scope(|| { - self.rt.spawn(async move { task.await }); - }); - } - - fn block_on(&self, task: F) -> F::Output { - trace_span!("block_on").in_scope(|| self.rt.block_on(task)) - } - - #[tracing::instrument(skip_all)] - fn shutdown(&self) { - // i think we just let tokio do this on drop - } - - #[tracing::instrument(skip_all)] - fn force_shutdown(&self) { - // i think we just let tokio do this on drop - } - - #[tracing::instrument(skip_all)] - fn exec_task(&self) { - // I dont think tokio has a way to do this - } - - fn set_max_workers(&mut self, num_workers: usize) { - self.max_num_threads = num_workers; - } - - fn num_workers(&self) -> usize { - self.max_num_threads - } -} - -impl TokioRt { - pub(crate) fn new(num_workers: usize) -> TokioRt { - // println!("New TokioRT with {} workers", num_workers); - TokioRt { - max_num_threads: num_workers + 1, //LAMELLAR_THREADS = num_workers + 1, so for tokio runtime, we actually want num_workers + 1 worker threads as block_on will not do anywork on the main thread (i think)... - rt: tokio::runtime::Builder::new_multi_thread() - .worker_threads(num_workers + 1) - .enable_all() - .build() - .unwrap(), - } - } -} From 43d44235165fb522e66434d480d4fe7964fd8d2e Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Thu, 15 Feb 2024 09:54:15 -0800 Subject: [PATCH 013/116] all examples compile, not all complete --- .../local_lock_atomic_array_put_bw.rs | 1 - src/active_messaging.rs | 8 +- .../batching/simple_batcher.rs | 18 +- .../batching/team_am_batcher.rs | 89 ++- .../registered_active_message.rs | 107 ++-- src/array.rs | 21 + src/array/iterator/distributed_iterator.rs | 4 +- .../distributed_iterator/consumer/collect.rs | 2 +- .../local_iterator/consumer/collect.rs | 2 +- src/array/unsafe.rs | 70 --- src/array/unsafe/iteration/distributed.rs | 2 +- src/array/unsafe/operations.rs | 1 - src/darc.rs | 40 -- src/lamellar_request.rs | 18 +- src/lamellar_world.rs | 35 +- src/scheduler.rs | 68 ++- src/scheduler/numa_work_stealing.rs | 552 ----------------- src/scheduler/numa_work_stealing2.rs | 569 ------------------ src/scheduler/tokio_executor.rs | 14 +- tests/array/arithmetic_ops/add_test.rs | 4 +- 20 files changed, 261 insertions(+), 1364 deletions(-) delete mode 100644 src/scheduler/numa_work_stealing.rs delete mode 100644 src/scheduler/numa_work_stealing2.rs diff --git a/examples/bandwidths/local_lock_atomic_array_put_bw.rs b/examples/bandwidths/local_lock_atomic_array_put_bw.rs index d75402eb..fe4861f9 100644 --- a/examples/bandwidths/local_lock_atomic_array_put_bw.rs +++ b/examples/bandwidths/local_lock_atomic_array_put_bw.rs @@ -67,7 +67,6 @@ fn main() { let cur_t = timer.elapsed().as_secs_f64(); if my_pe == num_pes - 1 { for j in (0..2_u64.pow(exp) as usize).step_by(num_bytes as usize) { - let array_clone = array.clone(); let local_data = array.blocking_read_local_data(); while *(&local_data[(j + num_bytes as usize) - 1]) == 255 as u8 { println!( diff --git a/src/active_messaging.rs b/src/active_messaging.rs index afc8ab6c..0559dc29 100644 --- a/src/active_messaging.rs +++ b/src/active_messaging.rs @@ -1177,13 +1177,7 @@ pub trait ActiveMessaging { #[async_trait] pub(crate) trait ActiveMessageEngine { - async fn process_msg( - self, - am: Am, - scheduler: Arc, - stall_mark: usize, - immediate: bool, - ); + async fn process_msg(self, am: Am, stall_mark: usize, immediate: bool); async fn exec_msg( self, diff --git a/src/active_messaging/batching/simple_batcher.rs b/src/active_messaging/batching/simple_batcher.rs index c63c3fa6..fbea7277 100644 --- a/src/active_messaging/batching/simple_batcher.rs +++ b/src/active_messaging/batching/simple_batcher.rs @@ -86,7 +86,7 @@ impl Batcher for SimpleBatcher { if size == 0 { //first data in batch, schedule a transfer task let batch_id = batch.batch_id.load(Ordering::SeqCst); - // println!("remote batch_id {batch_id} created {dst:?}"); + // println!("remote batch_id {batch_id} created "); let cur_stall_mark = self.stall_mark.clone(); // println!( // "[{:?}] add_remote_am_to_batch submit task", @@ -104,7 +104,7 @@ impl Batcher for SimpleBatcher { SimpleBatcher::create_tx_task(batch).await; } } else if size >= MAX_BATCH_SIZE { - // println!("remote size: {:?} {dst:?}",size); + // println!("remote size: {:?} ", size); // println!( // "[{:?}] add_remote_am_to_batch submit imm task", // std::thread::current().id() @@ -236,7 +236,7 @@ impl Batcher for SimpleBatcher { if size == 0 { //first data in batch, schedule a transfer task let batch_id = batch.batch_id.load(Ordering::SeqCst); - // println!("unit batch_id {batch_id} created {dst:?}"); + // println!("unit batch_id {batch_id} created "); let cur_stall_mark = self.stall_mark.clone(); // println!( // "[{:?}] add_unit_am_to_batch submit task", @@ -254,7 +254,7 @@ impl Batcher for SimpleBatcher { 
SimpleBatcher::create_tx_task(batch).await; } } else if size >= MAX_BATCH_SIZE { - // println!("unit size: {:?} {dst:?}",size); + // println!("unit size: {:?} ", size); // println!( // "[{:?}] add_unit_am_to_batch submit imm task", // std::thread::current().id() @@ -288,7 +288,9 @@ impl Batcher for SimpleBatcher { } Cmd::Data => ame.exec_data_am(&msg, data, &mut i, &ser_data).await, Cmd::Unit => ame.exec_unit_am(&msg, data, &mut i).await, - Cmd::BatchedMsg => panic!("should not recieve a batched msg within a batched msg"), + Cmd::BatchedMsg => { + panic!("should not recieve a batched msg within a Simple Batcher batched msg") + } } } return_ams @@ -312,6 +314,7 @@ impl SimpleBatcher { //#[tracing::instrument(skip_all)] async fn create_tx_task(batch: SimpleBatcherInner) { // println!("[{:?}] create_tx_task", std::thread::current().id()); + async_std::task::yield_now().await; // force this to renter the task queue so other requests can hopefully come in before sending the batch let (buf, size) = batch.swap(); if size > 0 { @@ -533,7 +536,10 @@ impl SimpleBatcher { team: team.team.clone(), team_addr: team.team.remote_ptr_addr, }; - // println!("[{:?}] exec_am submit task", std::thread::current().id()); + // println!( + // "[{:?}] simple batcher exec_am submit task", + // std::thread::current().id() + // ); let am = match am .exec( team.team.world_pe, diff --git a/src/active_messaging/batching/team_am_batcher.rs b/src/active_messaging/batching/team_am_batcher.rs index 67bb7b09..43af51ed 100644 --- a/src/active_messaging/batching/team_am_batcher.rs +++ b/src/active_messaging/batching/team_am_batcher.rs @@ -88,8 +88,9 @@ impl TeamAmBatcherInner { .or_insert_with(|| HashMap::new()); if team_batch.len() == 0 { temp_size += *TEAM_HEADER_LEN; - //println!( - // "adding team header {} {} {}", + // println!( + // "[{:?}] adding team header {} {} {}", + // std::thread::current().id(), // temp_size, // *TEAM_HEADER_LEN, // self.size.load(Ordering::SeqCst) @@ -98,8 +99,9 @@ impl TeamAmBatcherInner { let am_batch = team_batch.entry(id).or_insert_with(|| Vec::new()); if am_batch.len() == 0 { temp_size += *BATCHED_AM_HEADER_LEN; - //println!( - // "adding batched header {} {} {}", + // println!( + // "[{:?}] adding batched header {} {} {}", + // std::thread::current().id(), // temp_size, // *BATCHED_AM_HEADER_LEN, // self.size.load(Ordering::SeqCst) @@ -107,8 +109,9 @@ impl TeamAmBatcherInner { } am_batch.push((req_data, am, size)); temp_size += size + *REQ_ID_LEN; - //println!( - // "adding req_id + size header {} {} {} {}", + // println!( + // "[{:?}] adding req_id + size header {} {} {} {}", + // std::thread::current().id(), // temp_size, // *REQ_ID_LEN, // size, @@ -184,6 +187,7 @@ impl Batcher for TeamAmBatcher { am_size: usize, mut stall_mark: usize, ) { + // println!("[{:?}] add_remote_am_to_batch", std::thread::current().id()); let batch = match req_data.dst { Some(dst) => self.batched_ams[dst].clone(), None => self.batched_ams.last().unwrap().clone(), @@ -195,7 +199,10 @@ impl Batcher for TeamAmBatcher { if size == 0 { //first data in batch, schedule a transfer task let batch_id = batch.batch_id.load(Ordering::SeqCst); - // println!("remote batch_id {batch_id} created"); + // println!( + // "[{:?}] remote batch_id {batch_id} created", + // std::thread::current().id() + // ); let cur_stall_mark = self.stall_mark.clone(); while stall_mark != cur_stall_mark.load(Ordering::SeqCst) && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE @@ -216,7 +223,11 @@ impl Batcher for TeamAmBatcher { } } else if size 
>= MAX_BATCH_SIZE { //batch is full, transfer now - // println!("remote size: {:?}",size); + // println!( + // "[{:?}] remote size: {:?}", + // std::thread::current().id(), + // size + // ); TeamAmBatcher::create_tx_task( batch, req_data.lamellae.clone(), @@ -236,6 +247,7 @@ impl Batcher for TeamAmBatcher { am_size: usize, mut stall_mark: usize, ) { + // println!("[{:?}] add_return_am_to_batch", std::thread::current().id(),); let batch = match req_data.dst { Some(dst) => self.batched_ams[dst].clone(), None => self.batched_ams.last().unwrap().clone(), @@ -247,7 +259,10 @@ impl Batcher for TeamAmBatcher { if size == 0 { //first data in batch, schedule a transfer task let batch_id = batch.batch_id.load(Ordering::SeqCst); - // println!("return batch_id {batch_id} created"); + // println!( + // "[{:?}] return batch_id {batch_id} created", + // std::thread::current().id() + // ); let cur_stall_mark = self.stall_mark.clone(); while stall_mark != cur_stall_mark.load(Ordering::SeqCst) && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE @@ -268,7 +283,11 @@ impl Batcher for TeamAmBatcher { } } else if size >= MAX_BATCH_SIZE { //batch is full, transfer now - // println!("return size: {:?}",size); + // println!( + // "[{:?}] return size: {:?}", + // std::thread::current().id(), + // size + // ); TeamAmBatcher::create_tx_task( batch, @@ -288,6 +307,7 @@ impl Batcher for TeamAmBatcher { data_size: usize, mut stall_mark: usize, ) { + // println!("[{:?}] add_data_am_to_batch", std::thread::current().id(),); let batch = match req_data.dst { Some(dst) => self.batched_ams[dst].clone(), None => self.batched_ams.last().unwrap().clone(), @@ -306,7 +326,10 @@ impl Batcher for TeamAmBatcher { if size == 0 { //first data in batch, schedule a transfer task let batch_id = batch.batch_id.load(Ordering::SeqCst); - // println!("data batch_id {batch_id} created"); + // println!( + // "[{:?}] data batch_id {batch_id} created", + // std::thread::current().id() + // ); let cur_stall_mark = self.stall_mark.clone(); while stall_mark != cur_stall_mark.load(Ordering::SeqCst) && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE @@ -327,7 +350,7 @@ impl Batcher for TeamAmBatcher { } } else if size >= MAX_BATCH_SIZE { //batch is full, transfer now - // println!("data size: {:?}",size); + // println!("[{:?}] data size: {:?}", std::thread::current().id(), size); TeamAmBatcher::create_tx_task( batch, req_data.lamellae.clone(), @@ -340,6 +363,7 @@ impl Batcher for TeamAmBatcher { // #[tracing::instrument(skip_all)] async fn add_unit_am_to_batch(&self, req_data: ReqMetaData, mut stall_mark: usize) { + // println!("[{:?}] add_unit_am_to_batch", std::thread::current().id(),); let batch = match req_data.dst { Some(dst) => self.batched_ams[dst].clone(), None => self.batched_ams.last().unwrap().clone(), @@ -351,7 +375,10 @@ impl Batcher for TeamAmBatcher { if size == 0 { //first data in batch, schedule a transfer task let batch_id = batch.batch_id.load(Ordering::SeqCst); - // println!("unit batch_id {batch_id} created"); + // println!( + // "[{:?}] unit batch_id {batch_id} created", + // std::thread::current().id() + // ); let cur_stall_mark = self.stall_mark.clone(); while stall_mark != cur_stall_mark.load(Ordering::SeqCst) && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE @@ -372,7 +399,7 @@ impl Batcher for TeamAmBatcher { } } else if size >= MAX_BATCH_SIZE { //batch is full, transfer now - // println!("unit size: {:?}",size); + // println!("[{:?}] unit size: {:?}", std::thread::current().id(), size); 
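
Stepping back from the diff for a moment: SimpleBatcher and TeamAmBatcher follow the same policy, visible in the hunks above and below. The first message added to an empty batch schedules a deferred flush task (which yields so additional requests can accumulate before the send), while a batch that has reached MAX_BATCH_SIZE is flushed immediately. A minimal standalone sketch of that policy follows; Batch, add_to_batch, flush, and the MAX_BATCH_SIZE value here are illustrative names, not the crate's actual types.

    use std::sync::atomic::{AtomicUsize, Ordering};
    use std::sync::{Arc, Mutex};

    const MAX_BATCH_SIZE: usize = 8; // illustrative threshold

    #[derive(Default)]
    struct Batch {
        size: AtomicUsize,
        msgs: Mutex<Vec<String>>,
    }

    impl Batch {
        // Returns the batch size observed before this message was accounted for,
        // mirroring the fetch_add-style bookkeeping in the diff.
        fn add(&self, msg: String) -> usize {
            self.msgs.lock().unwrap().push(msg);
            self.size.fetch_add(1, Ordering::SeqCst)
        }

        fn flush(&self) -> Vec<String> {
            let drained: Vec<String> = self.msgs.lock().unwrap().drain(..).collect();
            self.size.store(0, Ordering::SeqCst);
            drained
        }
    }

    async fn add_to_batch(batch: Arc<Batch>, msg: String) {
        let prev_size = batch.add(msg);
        if prev_size == 0 {
            // First entry: schedule the transfer, yielding once so other
            // producers get a chance to append before the batch is sent.
            tokio::spawn(async move {
                tokio::task::yield_now().await;
                let msgs = batch.flush();
                println!("deferred flush of {} messages", msgs.len());
            });
        } else if prev_size >= MAX_BATCH_SIZE {
            // Batch is full: flush right away.
            let msgs = batch.flush();
            println!("immediate flush of {} messages", msgs.len());
        }
    }

    #[tokio::main]
    async fn main() {
        let batch = Arc::new(Batch::default());
        for i in 0..20 {
            add_to_batch(batch.clone(), format!("msg {i}")).await;
        }
    }

The stall-mark loop in the real code adds one refinement on top of this: the deferred flush keeps yielding while new requests are still arriving, up to the size limit.
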
TeamAmBatcher::create_tx_task( batch, req_data.lamellae.clone(), @@ -391,15 +418,18 @@ impl Batcher for TeamAmBatcher { lamellae: Arc, ame: &RegisteredActiveMessages, ) -> Vec { + // println!("[{:?}] exec_batched_msg", std::thread::current().id()); let data = ser_data.data_as_bytes(); let mut i = 0; // println!("i: {:?} dl {:?} cl {:?}", i, data.len(), *CMD_LEN); + let mut return_ams = Vec::new(); while i < data.len() { // println!("\ti: {:?} dl {:?} cl {:?}", i, data.len(), *CMD_LEN); let batch: BatchHeader = crate::deserialize(&data[i..i + *BATCH_HEADER_LEN], false).unwrap(); // println!("batch {:?} i: {} len: {}", batch, i, data.len()); i += *BATCH_HEADER_LEN; + // println!("[{:?}] cmd {:?}", std::thread::current().id(), batch.cmd); match batch.cmd { Cmd::Am | Cmd::ReturnAm => { panic!("should not encounter individual am cmds in TeamAmBatcher") @@ -407,12 +437,15 @@ impl Batcher for TeamAmBatcher { Cmd::Data => ame.exec_data_am(&msg, data, &mut i, &ser_data).await, Cmd::Unit => ame.exec_unit_am(&msg, data, &mut i).await, Cmd::BatchedMsg => { - self.exec_batched_am(&msg, batch.cnt, data, &mut i, &lamellae, &ame) - .await; + return_ams.append( + &mut self + .exec_batched_am(&msg, batch.cnt, data, &mut i, &lamellae, &ame) + .await, + ); } } } - Vec::new() + return_ams } } @@ -436,6 +469,8 @@ impl TeamAmBatcher { arch: Arc, my_pe: usize, ) { + // println!("[{:?}] create_tx_task", std::thread::current().id()); + async_std::task::yield_now().await; // force this to renter the task queue so other requests can hopefully come in before sending the batch let (am_batch, return_am_batch, non_am_batch, mut size) = batch.swap(); if size > 0 { if am_batch.len() > 0 { @@ -448,7 +483,11 @@ impl TeamAmBatcher { let data_buf = TeamAmBatcher::create_data_buf(header, size, &lamellae).await; let data_slice = data_buf.data_as_bytes(); - //println!("total batch size: {}", size); + // println!( + // "[{:?}] total batch size: {}", + // std::thread::current().id(), + // size + // ); let mut i = 0; TeamAmBatcher::serialize_am_batch(am_batch, data_slice, &mut i, Cmd::Am); TeamAmBatcher::serialize_am_batch(return_am_batch, data_slice, &mut i, Cmd::ReturnAm); @@ -683,6 +722,7 @@ impl TeamAmBatcher { ame: &RegisteredActiveMessages, ) -> Vec { let mut return_ams = Vec::new(); + // println!("exec_batched_am batch_cnt: {}", batch_cnt); for _team in 0..batch_cnt { let team_header: TeamHeader = crate::deserialize(&data[*i..*i + *TEAM_HEADER_LEN], false).unwrap(); @@ -697,7 +737,11 @@ impl TeamAmBatcher { // println!("batched am header: {:?}", batched_am_header); *i += *BATCHED_AM_HEADER_LEN; for _am in 0..batched_am_header.am_cnt { - // println!("am cmd: {:?}", batched_am_header.cmd); + // println!( + // "[{:?}] am cmd: {:?}", + // std::thread::current().id(), + // batched_am_header.cmd + // ); match batched_am_header.cmd { Cmd::Am => return_ams.push( self.exec_am( @@ -729,6 +773,11 @@ impl TeamAmBatcher { } } } + // println!( + // "[{:?}] return_ams: {:?}", + // std::thread::current().id(), + // return_ams + // ); return_ams } @@ -747,6 +796,7 @@ impl TeamAmBatcher { *i += *REQ_ID_LEN; let am = AMS_EXECS.get(&am_id).unwrap()(&data[*i..], team.team.team_pe); *i += am.serialized_size(); + // println!("Team Batcher exec am"); let req_data = ReqMetaData { src: team.team.world_pe, @@ -791,6 +841,7 @@ impl TeamAmBatcher { world: Arc, team: Arc, ) { + // println!("[{:?}] exec_return_am", std::thread::current().id()); let req_id = crate::deserialize(&data[*i..*i + *REQ_ID_LEN], false).unwrap(); *i += *REQ_ID_LEN; let am = 
AMS_EXECS.get(&am_id).unwrap()(&data[*i..], team.team.team_pe); diff --git a/src/active_messaging/registered_active_message.rs b/src/active_messaging/registered_active_message.rs index e64b8874..fcc017e3 100644 --- a/src/active_messaging/registered_active_message.rs +++ b/src/active_messaging/registered_active_message.rs @@ -96,16 +96,10 @@ pub(crate) struct UnitHeader { } #[async_trait] -impl ActiveMessageEngine for Arc { - //#[tracing::instrument(skip_all)] - async fn process_msg( - self, - am: Am, - executor: Arc, - stall_mark: usize, - immediate: bool, - ) { - // println!("[{:?}] {am:?}", std::thread::current().id()); +impl ActiveMessageEngine for RegisteredActiveMessages { + // #[tracing::instrument(skip_all)] + async fn process_msg(self, am: Am, stall_mark: usize, immediate: bool) { + // println!("[{:?}] process_msg {am:?}", std::thread::current().id()); match am { Am::All(req_data, am) => { // println!("{:?}",am.get_id()); @@ -127,6 +121,12 @@ impl ActiveMessageEngine for Arc { ) .await; } else { + // println!( + // "[{:?}] {:?} all {:?}", + // std::thread::current().id(), + // am_id, + // am_size + // ); self.send_am(req_data.clone(), am.clone(), am_id, am_size, Cmd::Am) .await; } @@ -134,8 +134,7 @@ impl ActiveMessageEngine for Arc { let world = LamellarTeam::new(None, req_data.world.clone(), true); let team = LamellarTeam::new(Some(world.clone()), req_data.team.clone(), true); if req_data.team.arch.team_pe(req_data.src).is_ok() { - self.clone() - .exec_local_am(req_data, am.as_local(), world, team) + self.exec_local_am(req_data, am.as_local(), world, team) .await; } } @@ -143,8 +142,7 @@ impl ActiveMessageEngine for Arc { if req_data.dst == Some(req_data.src) { let world = LamellarTeam::new(None, req_data.world.clone(), true); let team = LamellarTeam::new(Some(world.clone()), req_data.team.clone(), true); - self.clone() - .exec_local_am(req_data, am.as_local(), world, team) + self.exec_local_am(req_data, am.as_local(), world, team) .await; } else { let am_id = *(AMS_IDS.get(&am.get_id()).unwrap()); @@ -154,6 +152,12 @@ impl ActiveMessageEngine for Arc { .add_remote_am_to_batch(req_data, am, am_id, am_size, stall_mark) .await; } else { + // println!( + // "[{:?}] {:?} pe {:?}", + // std::thread::current().id(), + // am_id, + // am_size + // ); self.send_am(req_data, am, am_id, am_size, Cmd::Am).await; } } @@ -161,7 +165,7 @@ impl ActiveMessageEngine for Arc { Am::Local(req_data, am) => { let world = LamellarTeam::new(None, req_data.world.clone(), true); let team = LamellarTeam::new(Some(world.clone()), req_data.team.clone(), true); - self.clone().exec_local_am(req_data, am, world, team).await; + self.exec_local_am(req_data, am, world, team).await; } Am::Return(req_data, am) => { // println!("Am::Return"); @@ -172,6 +176,12 @@ impl ActiveMessageEngine for Arc { .add_return_am_to_batch(req_data, am, am_id, am_size, stall_mark) .await; } else { + // println!( + // "[{:?}] {:?} return {:?}", + // std::thread::current().id(), + // am_id, + // am_size + // ); self.send_am(req_data, am, am_id, am_size, Cmd::ReturnAm) .await; } @@ -184,6 +194,7 @@ impl ActiveMessageEngine for Arc { .add_data_am_to_batch(req_data, data, data_size, stall_mark) .await; } else { + // println!("[{:?}] data {:?}", std::thread::current().id(), data_size); self.send_data_am(req_data, data, data_size).await; } } @@ -193,6 +204,11 @@ impl ActiveMessageEngine for Arc { .add_unit_am_to_batch(req_data, stall_mark) .await; } else { + // println!( + // "[{:?}] unit {:?}", + // std::thread::current().id(), + // 
*UNIT_HEADER_LEN + // ); self.send_unit_am(req_data).await; } } @@ -207,13 +223,13 @@ impl ActiveMessageEngine for Arc { lamellae: Arc, executor: Arc, ) { - // println!("exec_msg"); + // println!("[{:?}] exec_msg {:?}", std::thread::current().id(), msg.cmd); let data = ser_data.data_as_bytes(); let mut i = 0; match msg.cmd { Cmd::Am => { let return_am = self.exec_am(&msg, data, &mut i, &lamellae).await; - let process_task = self.process_msg(return_am, executor.clone(), 0, false); + let process_task = self.process_msg(return_am, 0, false); executor.submit_task(process_task); } Cmd::ReturnAm => { @@ -232,7 +248,7 @@ impl ActiveMessageEngine for Arc { .await; let am_tasks = futures::stream::FuturesUnordered::new(); for am in ams.into_iter() { - am_tasks.push(self.clone().process_msg(am, executor.clone(), 0, false)); + am_tasks.push(self.clone().process_msg(am, 0, false)); } executor.submit_task(futures::future::join_all(am_tasks)); } @@ -248,7 +264,7 @@ impl RegisteredActiveMessages { //#[tracing::instrument(skip_all)] async fn send_am( - self: &Arc, + &self, req_data: ReqMetaData, am: LamellarArcAm, am_id: AmId, @@ -293,26 +309,11 @@ impl RegisteredActiveMessages { .await; } - //#[tracing::instrument(skip_all)] - async fn send_data_am( - self: &Arc, - req_data: ReqMetaData, - data: LamellarResultArc, - data_size: usize, - ) { + // #[tracing::instrument(skip_all)] + async fn send_data_am(&self, req_data: ReqMetaData, data: LamellarResultArc, data_size: usize) { // println!("send_data_am"); - let header = self.create_header(&req_data, Cmd::Data); - let mut darcs = vec![]; - data.ser(1, &mut darcs); //1 because we are only sending back to the original PE - let darc_list_size = crate::serialized_size(&darcs, false); - let data_header = DataHeader { - size: data_size, - req_id: req_data.id, - darc_list_size: darc_list_size, - }; let data_buf = self - .create_data_buf( header, data_size + darc_list_size + *DATA_HEADER_LEN, &req_data.lamellae, @@ -333,8 +334,8 @@ impl RegisteredActiveMessages { .await; } - //#[tracing::instrument(skip_all)] - async fn send_unit_am(self: &Arc, req_data: ReqMetaData) { + // #[tracing::instrument(skip_all)] + async fn send_unit_am(&self, req_data: ReqMetaData) { // println!("send_unit_am"); let header = self.create_header(&req_data, Cmd::Unit); @@ -353,8 +354,8 @@ impl RegisteredActiveMessages { .await; } - //#[tracing::instrument(skip_all)] - fn create_header(self: &Arc, req_data: &ReqMetaData, cmd: Cmd) -> SerializeHeader { + // #[tracing::instrument(skip_all)] + fn create_header(&self, req_data: &ReqMetaData, cmd: Cmd) -> SerializeHeader { let msg = Msg { src: req_data.team.world_pe as u16, cmd: cmd, @@ -364,7 +365,7 @@ impl RegisteredActiveMessages { //#[tracing::instrument(skip_all)] async fn create_data_buf( - self: &Arc, + &self, header: SerializeHeader, size: usize, lamellae: &Arc, @@ -389,7 +390,7 @@ impl RegisteredActiveMessages { #[async_recursion] //#[tracing::instrument(skip_all)] pub(crate) async fn exec_local_am( - self: Arc, + &self, req_data: ReqMetaData, am: LamellarArcLocalAm, world: Arc, @@ -407,7 +408,7 @@ impl RegisteredActiveMessages { .await { LamellarReturn::LocalData(data) => { - // println!("local am data return"); + // println!("[{:?}] local am data return", std::thread::current().id()); self.send_data_to_user_handle( req_data.id, req_data.src, @@ -415,13 +416,12 @@ impl RegisteredActiveMessages { ); } LamellarReturn::LocalAm(am) => { - // println!("local am am return"); - self.clone() - .exec_local_am(req_data, am.as_local(), world, team) 
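
The exec_local_am path in this hunk shows why the patch keeps the async_recursion attribute (visible on exec_local_am above): executing an active message may hand back another active message, which is then executed in place; the diff also drops the Arc<Self> receivers and the accompanying self.clone() calls in favor of plain &self. Below is a standalone sketch of that recursive execution shape only; AmResult, run_am, and exec_local are illustrative names, not the crate's.

    use async_recursion::async_recursion;

    enum AmResult {
        Done(u64),
        Another(u64), // a follow-up "active message" payload
    }

    async fn run_am(payload: u64) -> AmResult {
        // Pretend any payload above 3 produces a follow-up message.
        if payload > 3 {
            AmResult::Another(payload - 1)
        } else {
            AmResult::Done(payload)
        }
    }

    // Recursive async fns need boxing; the async_recursion macro takes care of that.
    #[async_recursion]
    async fn exec_local(payload: u64) -> u64 {
        match run_am(payload).await {
            AmResult::Done(v) => v,
            // A returned message is executed in place, recursively.
            AmResult::Another(next) => exec_local(next).await,
        }
    }

    #[tokio::main]
    async fn main() {
        assert_eq!(exec_local(10).await, 3);
    }
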
+ // println!("[{:?}] local am am return", std::thread::current().id()); + self.exec_local_am(req_data, am.as_local(), world, team) .await; } LamellarReturn::Unit => { - // println!("local am unit return"); + // println!("[{:?}] local am unit return", std::thread::current().id()); self.send_data_to_user_handle(req_data.id, req_data.src, InternalResult::Unit); } LamellarReturn::RemoteData(_) | LamellarReturn::RemoteAm(_) => { @@ -432,7 +432,7 @@ impl RegisteredActiveMessages { //#[tracing::instrument(skip_all)] pub(crate) async fn exec_am( - self: &Arc, + &self, msg: &Msg, data: &[u8], i: &mut usize, @@ -481,7 +481,7 @@ impl RegisteredActiveMessages { //#[tracing::instrument(skip_all)] pub(crate) async fn exec_return_am( - self: &Arc, + &self, msg: &Msg, data: &[u8], i: &mut usize, @@ -505,14 +505,13 @@ impl RegisteredActiveMessages { team: team.team.clone(), team_addr: team.team.remote_ptr_addr, }; - self.clone() - .exec_local_am(req_data, am.as_local(), world, team) + self.exec_local_am(req_data, am.as_local(), world, team) .await; } //#[tracing::instrument(skip_all)] pub(crate) async fn exec_data_am( - self: &Arc, + &self, msg: &Msg, data_buf: &[u8], i: &mut usize, @@ -537,8 +536,8 @@ impl RegisteredActiveMessages { ); } - //#[tracing::instrument(skip_all)] - pub(crate) async fn exec_unit_am(self: &Arc, msg: &Msg, data: &[u8], i: &mut usize) { + // #[tracing::instrument(skip_all)] + pub(crate) async fn exec_unit_am(&self, msg: &Msg, data: &[u8], i: &mut usize) { // println!("exec_unit_am"); let unit_header: UnitHeader = crate::deserialize(&data[*i..*i + *UNIT_HEADER_LEN], false).unwrap(); diff --git a/src/array.rs b/src/array.rs index 41a58b79..66e4f7db 100644 --- a/src/array.rs +++ b/src/array.rs @@ -428,6 +428,12 @@ impl TeamFrom<(&Vec, Distribution)> for Vec { } } +impl TeamFrom<(Vec, Distribution)> for Vec { + fn team_from(vals: (Vec, Distribution), _team: &Pin>) -> Self { + vals.0.to_vec() + } +} + impl TeamTryFrom<&T> for LamellarArrayRdmaInput { fn team_try_from(val: &T, team: &Pin>) -> Result { Ok(LamellarArrayRdmaInput::team_from(val, team)) @@ -503,6 +509,20 @@ impl TeamTryFrom<(&Vec, Distribution)> for Vec { } } +// #[async_trait] +// impl AsyncTeamFrom<(&Vec, Distribution)> for Vec { +// async fn team_from(vals: (&Vec, Distribution), _team: &Pin>) -> Self { +// vals.0.to_vec() +// } +// } + +#[async_trait] +impl AsyncTeamFrom<(Vec, Distribution)> for Vec { + async fn team_from(input: (Vec, Distribution), _team: &Pin>) -> Self { + input.0 + } +} + #[async_trait] /// Provides the same abstraction as the `From` trait in the standard language, but with a `team` parameter so that lamellar memory regions can be allocated /// and to be used within an async context @@ -557,6 +577,7 @@ pub trait TeamFrom { /// Provides the same abstraction as the `From` trait in the standard language, but with a `team` parameter so that lamellar memory regions can be allocated /// and to be used within an async context pub trait AsyncTeamFrom: TeamFrom { + /// Converts to this type from the input type async fn team_from(val: T, team: &Pin>) -> Self; } diff --git a/src/array/iterator/distributed_iterator.rs b/src/array/iterator/distributed_iterator.rs index 90ec1cf4..adad0083 100644 --- a/src/array/iterator/distributed_iterator.rs +++ b/src/array/iterator/distributed_iterator.rs @@ -40,15 +40,13 @@ use crate::array::iterator::one_sided_iterator::OneSidedIterator; use crate::array::iterator::{private::*, IterRequest, Schedule}; use crate::array::{ operations::ArrayOps, AsyncTeamFrom, AtomicArray, 
Distribution, GenericAtomicArray, - LamellarArray, LamellarArrayPut, NativeAtomicArray, TeamFrom, UnsafeArray, + LamellarArray, NativeAtomicArray, }; -use crate::lamellar_request::LamellarRequest; use crate::memregion::Dist; use crate::LamellarTeamRT; use crate::active_messaging::SyncSend; -use async_trait::async_trait; use enum_dispatch::enum_dispatch; use futures::Future; use std::marker::PhantomData; diff --git a/src/array/iterator/distributed_iterator/consumer/collect.rs b/src/array/iterator/distributed_iterator/consumer/collect.rs index b38d6e15..85a6b800 100644 --- a/src/array/iterator/distributed_iterator/consumer/collect.rs +++ b/src/array/iterator/distributed_iterator/consumer/collect.rs @@ -3,7 +3,7 @@ use crate::array::iterator::consumer::*; use crate::array::iterator::distributed_iterator::{DistributedIterator, Monotonic}; use crate::array::iterator::{private::*, IterRequest}; use crate::array::operations::ArrayOps; -use crate::array::{AsyncTeamFrom, AsyncTeamInto, Distribution, TeamFrom, TeamInto}; +use crate::array::{AsyncTeamFrom, AsyncTeamInto, Distribution, TeamInto}; use crate::lamellar_request::LamellarRequest; use crate::lamellar_team::LamellarTeamRT; use crate::memregion::Dist; diff --git a/src/array/iterator/local_iterator/consumer/collect.rs b/src/array/iterator/local_iterator/consumer/collect.rs index 4dea5332..05897c92 100644 --- a/src/array/iterator/local_iterator/consumer/collect.rs +++ b/src/array/iterator/local_iterator/consumer/collect.rs @@ -3,7 +3,7 @@ use crate::array::iterator::consumer::*; use crate::array::iterator::local_iterator::{LocalIterator, Monotonic}; use crate::array::iterator::{private::*, IterRequest}; use crate::array::operations::ArrayOps; -use crate::array::{AsyncTeamFrom, AsyncTeamInto, Distribution, TeamFrom, TeamInto}; +use crate::array::{AsyncTeamFrom, AsyncTeamInto, Distribution, TeamInto}; use crate::lamellar_request::LamellarRequest; use crate::lamellar_team::LamellarTeamRT; use crate::memregion::Dist; diff --git a/src/array/unsafe.rs b/src/array/unsafe.rs index e510b9ff..f690a9fa 100644 --- a/src/array/unsafe.rs +++ b/src/array/unsafe.rs @@ -273,76 +273,6 @@ impl UnsafeArray { // println!("after buffered ops"); // array.inner.data.print(); } - - async fn async_new>( - team: U, - array_size: usize, - distribution: Distribution, - ) -> UnsafeArray { - let team = team.into().team.clone(); - team.async_barrier().await; - let task_group = LamellarTaskGroup::new(team.clone()); - let my_pe = team.team_pe_id().unwrap(); - let num_pes = team.num_pes(); - let full_array_size = std::cmp::max(array_size, num_pes); - - let elem_per_pe = full_array_size as f64 / num_pes as f64; - let per_pe_size = (full_array_size as f64 / num_pes as f64).ceil() as usize; //we do ceil to ensure enough space an each pe - // println!("new unsafe array {:?} {:?} {:?}", elem_per_pe, num_elems_local, per_pe_size); - let rmr = MemoryRegion::new( - per_pe_size * std::mem::size_of::(), - team.lamellae.clone(), - AllocationType::Global, - ); - unsafe { - for elem in rmr.as_mut_slice().expect("data should exist on pe") { - *elem = 0; - } - } - - let data = Darc::try_new_with_drop( - team.clone(), - UnsafeArrayData { - mem_region: rmr, - array_counters: Arc::new(AMCounters::new()), - team: team.clone(), - task_group: Arc::new(task_group), - my_pe: my_pe, - num_pes: num_pes, - req_cnt: Arc::new(AtomicUsize::new(0)), - }, - crate::darc::DarcMode::UnsafeArray, - None, - ) - .expect("trying to create array on non team member"); - let array = UnsafeArray { - inner: 
UnsafeArrayInner { - data: data, - distribution: distribution.clone(), - // wait: wait, - orig_elem_per_pe: elem_per_pe, - elem_size: std::mem::size_of::(), - offset: 0, //relative to size of T - size: full_array_size, //relative to size of T - }, - phantom: PhantomData, - }; - // println!("new unsafe"); - // unsafe {println!("size {:?} bytes {:?}",array.inner.size, array.inner.data.mem_region.as_mut_slice().unwrap().len())}; - // println!("elem per pe {:?}", elem_per_pe); - // for i in 0..num_pes{ - // println!("pe: {:?} {:?}",i,array.inner.num_elems_pe(i)); - // } - // array.inner.data.print(); - if full_array_size != array_size { - println!("WARNING: Array size {array_size} is less than number of pes {full_array_size}, each PE will not contain data"); - array.sub_array(0..array_size) - } else { - array - } - // println!("after buffered ops"); - // array.inner.data.print(); - } } impl UnsafeArray { #[doc(alias("One-sided", "onesided"))] diff --git a/src/array/unsafe/iteration/distributed.rs b/src/array/unsafe/iteration/distributed.rs index 749071bd..ae845049 100644 --- a/src/array/unsafe/iteration/distributed.rs +++ b/src/array/unsafe/iteration/distributed.rs @@ -2,7 +2,7 @@ use crate::active_messaging::SyncSend; use crate::array::iterator::distributed_iterator::*; use crate::array::iterator::private::*; use crate::array::r#unsafe::UnsafeArray; -use crate::array::{ArrayOps, AsyncTeamFrom, AsyncTeamInto, Distribution, LamellarArray, TeamFrom}; +use crate::array::{ArrayOps, AsyncTeamFrom, Distribution, LamellarArray}; use crate::array::iterator::Schedule; use crate::lamellar_team::LamellarTeamRT; diff --git a/src/array/unsafe/operations.rs b/src/array/unsafe/operations.rs index b71be633..23b066bd 100644 --- a/src/array/unsafe/operations.rs +++ b/src/array/unsafe/operations.rs @@ -395,7 +395,6 @@ impl UnsafeArray { let index_vec = index.to_vec(); let the_array: UnsafeArray = self.clone(); // println!("num_reqs {:?}",num_reqs); - let the_array: UnsafeArray = self.clone(); self.inner .data .team diff --git a/src/darc.rs b/src/darc.rs index 1b7cb4b7..454c91fb 100644 --- a/src/darc.rs +++ b/src/darc.rs @@ -1263,46 +1263,6 @@ impl LamellarAM for DroppedWaitAM { let block_on_fut = { DarcInner::block_on_outstanding(wrapped.clone(), DarcMode::Dropped, 0) }; block_on_fut.await; - // wrapped.wait_all(); - // // let inner = unsafe {&*wrapped.inner}; //we dont actually care about the "type" we wrap here, we just need access to the meta data for the darc (but still allow async wait cause T is not send) - // while wrapped.dist_cnt.load(Ordering::SeqCst) != 0 - // || wrapped.local_cnt.load(Ordering::SeqCst) != 0 - // { - // if wrapped.local_cnt.load(Ordering::SeqCst) == 0 { - // wrapped.send_finished(); - // } - // if timeout.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { - // let ref_cnts_slice = std::slice::from_raw_parts_mut( - // wrapped.ref_cnt_addr as *mut usize, - // wrapped.num_pes, - // ); - - // println!("[WARNING] - Potential deadlock detected when trying to free distributed object.\n\ - // The runtime is currently waiting for all remaining references to this distributed object to be dropped.\n\ - // The current status of the object on each pe is {:?} with {:?} remaining local references and {:?} remaining remote references, ref cnts by pe {ref_cnts_slice:?}\n\ - // the deadlock timeout can be set via the LAMELLAR_DEADLOCK_TIMEOUT environment variable, the current timeout is {} seconds\n\ - // To view backtrace set RUST_LIB_BACKTRACE=1\n\ - // {}", - // mode_refs, - // 
wrapped.local_cnt.load(Ordering::SeqCst), - // wrapped.dist_cnt.load(Ordering::SeqCst), - // *crate::DEADLOCK_TIMEOUT, - // std::backtrace::Backtrace::capture() - // ); - // timeout = std::time::Instant::now(); - // } - // async_std::task::yield_now().await; - // } - // // let team = wrapped.team(); - // let rdma = &self.team.lamellae; - // for pe in self.team.arch.team_iter() { - // // println!("putting {:?} to {:?} @ {:x}",&mode_refs[self.my_pe..=self.my_pe],pe,self.mode_addr + self.my_pe * std::mem::size_of::()); - // rdma.put( - // pe, - // &mode_refs_u8[self.my_pe..=self.my_pe], - // self.mode_addr + self.my_pe * std::mem::size_of::(), - // ); - // } // println!( // "[{:?}] past block_on_outstanding {:x}", diff --git a/src/lamellar_request.rs b/src/lamellar_request.rs index b49c5a0f..5c812980 100755 --- a/src/lamellar_request.rs +++ b/src/lamellar_request.rs @@ -82,6 +82,7 @@ impl LamellarRequestResult { } self.req.update_counters(); + added } } @@ -133,7 +134,8 @@ impl LamellarRequestAddResult for LamellarRequestHandleInner { let _team_reqs = self.team_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); let _world_req = self.world_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); // println!( - // "update counter team {} world {}", + // "[{:?}] update counter team {} world {}", + // std::thread::current().id(), // _team_reqs - 1, // _world_req - 1 // ); @@ -255,7 +257,12 @@ impl LamellarRequestAddResult for LamellarMultiRequestHandleInner { // ); let _team_reqs = self.team_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); let _world_req = self.world_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); - // println!("update counter team {} world {}",_team_reqs-1,_world_req-1); + // println!( + // "[{:?}] multi update counter team {} world {}", + // std::thread::current().id(), + // _team_reqs - 1, + // _world_req - 1 + // ); if let Some(tg_outstanding_reqs) = self.tg_outstanding_reqs.clone() { tg_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); } @@ -395,7 +402,12 @@ impl LamellarRequestAddResult for LamellarLocalRequestHandleInner { fn update_counters(&self) { let _team_reqs = self.team_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); let _world_req = self.world_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); - // println!("update counter team {} world {}",_team_reqs-1,_world_req-1); + // println!( + // "[{:?}] local update counter team {} world {}", + // std::thread::current().id(), + // _team_reqs - 1, + // _world_req - 1 + // ); if let Some(tg_outstanding_reqs) = self.tg_outstanding_reqs.clone() { tg_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); } diff --git a/src/lamellar_world.rs b/src/lamellar_world.rs index 97686819..a8233733 100644 --- a/src/lamellar_world.rs +++ b/src/lamellar_world.rs @@ -375,27 +375,38 @@ impl LamellarWorldBuilder { pub fn new() -> LamellarWorldBuilder { // simple_logger::init().unwrap(); // trace!("New world builder"); - let mut executor = match std::env::var("LAMELLAR_EXECUTOR") { + let executor = match std::env::var("LAMELLAR_EXECUTOR") { Ok(val) => { let executor = val.parse::().unwrap(); if executor == 0 { ExecutorType::LamellarWorkStealing + } else if executor == 1 { + #[cfg(feature = "tokio-executor")] + { + ExecutorType::Tokio + } + #[cfg(not(feature = "tokio-executor"))] + { + println!("[LAMELLAR WARNING]: tokio-executor selected but it is not enabled, defaulting to lamellar work stealing executor"); + ExecutorType::LamellarWorkStealing + } + } else { + println!("[LAMELLAR WARNING]: invalid executor selected defaulting to lamellar work stealing 
executor"); + ExecutorType::LamellarWorkStealing } - // else if scheduler == 1 { - // ExecutorType::NumaWorkStealing - // } else if scheduler == 2 { - // ExecutorType::NumaWorkStealing2 - // } - else { + } + Err(_) => { + #[cfg(feature = "tokio-executor")] + { + ExecutorType::Tokio + } + #[cfg(not(feature = "tokio-executor"))] + { ExecutorType::LamellarWorkStealing } } - Err(_) => ExecutorType::LamellarWorkStealing, }; - #[cfg(feature = "tokio-executor")] - { - executor = ExecutorType::Tokio; - } + println!("executor: {:?}", executor); let num_threads = match std::env::var("LAMELLAR_THREADS") { Ok(n) => { diff --git a/src/scheduler.rs b/src/scheduler.rs index 0ba82e24..72e422d5 100755 --- a/src/scheduler.rs +++ b/src/scheduler.rs @@ -51,10 +51,16 @@ pub(crate) struct ReqId { pub(crate) sub_id: usize, } +/// Indicates the executor backend +/// Default is a work stealing executor +/// If the "tokio-executor" feature is enabled,the tokio executor can also be used +/// allowing seemless integration with tokio based applications #[derive(Debug)] pub enum ExecutorType { + /// The default work stealing executor LamellarWorkStealing, #[cfg(feature = "tokio-executor")] + /// The tokio executor Tokio, // Dyn(impl LamellarExecutor), } @@ -132,13 +138,18 @@ impl Scheduler { let max_ams = self.max_ams.clone(); let am_stall_mark = self.am_stall_mark.fetch_add(1, Ordering::Relaxed); let ame = self.active_message_engine.clone(); - let executor = self.executor.clone(); let am_future = async move { num_ams.fetch_add(1, Ordering::Relaxed); - max_ams.fetch_add(1, Ordering::Relaxed); + let _am_id = max_ams.fetch_add(1, Ordering::Relaxed); // println!("[{:?}] submit work exec req {:?} {:?} TaskId: {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task); - ame.process_msg(am, executor, am_stall_mark, false).await; + // println!("[{:?}] submit_am {:?}", std::thread::current().id(), am_id); + ame.process_msg(am, am_stall_mark, false).await; num_ams.fetch_sub(1, Ordering::Relaxed); + // println!( + // "[{:?}] submit_am_done {:?}", + // std::thread::current().id(), + // am_id + // ); // println!("[{:?}] submit work done req {:?} {:?} TaskId: {:?} {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task,reqs); }; self.executor.submit_task(am_future); @@ -150,13 +161,22 @@ impl Scheduler { let max_ams = self.max_ams.clone(); let am_stall_mark = self.am_stall_mark.fetch_add(1, Ordering::Relaxed); let ame = self.active_message_engine.clone(); - let executor = self.executor.clone(); let am_future = async move { num_ams.fetch_add(1, Ordering::Relaxed); - max_ams.fetch_add(1, Ordering::Relaxed); + let _am_id = max_ams.fetch_add(1, Ordering::Relaxed); // println!("[{:?}] submit work exec req {:?} {:?} TaskId: {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task); - ame.process_msg(am, executor, am_stall_mark, false).await; + // println!( + // "[{:?}] submit_am_immediate {:?}", + // std::thread::current().id(), + // am_id + // ); + ame.process_msg(am, am_stall_mark, false).await; num_ams.fetch_sub(1, Ordering::Relaxed); + // println!( + // "[{:?}] submit_am_immediate done {:?}", + // std::thread::current().id(), + // am_id + // ); // println!("[{:?}] submit work done req {:?} {:?} TaskId: {:?} {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task,reqs); }; 
self.executor.submit_immediate_task(am_future); @@ -169,8 +189,13 @@ impl Scheduler { let executor = self.executor.clone(); let am_future = async move { num_ams.fetch_add(1, Ordering::Relaxed); - max_ams.fetch_add(1, Ordering::Relaxed); + let _am_id = max_ams.fetch_add(1, Ordering::Relaxed); // println!("[{:?}] submit work exec req {:?} {:?} TaskId: {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task); + // println!( + // "[{:?}] submit_remote_am {:?}", + // std::thread::current().id(), + // am_id + // ); if let Some(header) = data.deserialize_header() { let msg = header.msg; ame.exec_msg(msg, data, lamellae, executor).await; @@ -179,6 +204,11 @@ impl Scheduler { panic!("should i be here?"); } num_ams.fetch_sub(1, Ordering::Relaxed); + // println!( + // "[{:?}] submit_remote_am done {:?}", + // std::thread::current().id(), + // am_id + // ); // println!("[{:?}] submit work done req {:?} {:?} TaskId: {:?} {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task,reqs); }; self.executor.submit_task(am_future); @@ -192,9 +222,19 @@ impl Scheduler { let max_tasks = self.max_tasks.clone(); let future = async move { num_tasks.fetch_add(1, Ordering::Relaxed); - max_tasks.fetch_add(1, Ordering::Relaxed); + let _task_id = max_tasks.fetch_add(1, Ordering::Relaxed); + // println!( + // "[{:?}] execing new task {:?}", + // std::thread::current().id(), + // task_id + // ); task.await; num_tasks.fetch_sub(1, Ordering::Relaxed); + // println!( + // "[{:?}] done new task {:?} ", + // std::thread::current().id(), + // task_id + // ); }; self.executor.submit_task(future); } @@ -207,9 +247,19 @@ impl Scheduler { let max_tasks = self.max_tasks.clone(); let future = async move { num_tasks.fetch_add(1, Ordering::Relaxed); - max_tasks.fetch_add(1, Ordering::Relaxed); + let _task_id = max_tasks.fetch_add(1, Ordering::Relaxed); + // println!( + // "[{:?}] execing new task immediate {:?}", + // std::thread::current().id(), + // task_id + // ); task.await; num_tasks.fetch_sub(1, Ordering::Relaxed); + // println!( + // "[{:?}] done new task immediate {:?} ", + // std::thread::current().id(), + // task_id + // ); }; self.executor.submit_immediate_task(future); } diff --git a/src/scheduler/numa_work_stealing.rs b/src/scheduler/numa_work_stealing.rs deleted file mode 100644 index c2f5a043..00000000 --- a/src/scheduler/numa_work_stealing.rs +++ /dev/null @@ -1,552 +0,0 @@ -use crate::active_messaging::{ActiveMessageEngine, ActiveMessageEngineType, Am}; -use crate::lamellae::{Des, Lamellae, SerializedData}; -use crate::scheduler::batching::simple_batcher::SimpleBatcher; -use crate::scheduler::batching::team_am_batcher::TeamAmBatcher; -use crate::scheduler::batching::BatcherType; -use crate::scheduler::registered_active_message::RegisteredActiveMessages; -use crate::scheduler::{AmeScheduler, AmeSchedulerQueue, SchedulerQueue}; -// use log::trace; -use core_affinity::CoreId; -use crossbeam::deque::Worker; -use futures::Future; -use futures_lite::FutureExt; -// use parking_lot::RwLock; -use rand::prelude::*; -// use std::collections::HashMap; -use std::collections::HashMap; -use std::panic; -use std::process; -use std::sync::atomic::{AtomicBool, AtomicU8, AtomicUsize, Ordering}; -use std::sync::Arc; //, Weak}; -use std::thread; -use thread_local::ThreadLocal; -// use std::time::Instant; - -#[derive(Debug)] -pub(crate) struct NumaWorkStealingThread { - node_work_inj: Arc>, - _sys_work_inj: Vec>>, - 
node_work_stealers: Vec>, - _sys_work_stealers: HashMap>>, - work_q: Worker, - work_flag: Arc, - active: Arc, -} - -impl NumaWorkStealingThread { - fn run( - worker: NumaWorkStealingThread, - active_cnt: Arc, - num_tasks: Arc, - id: CoreId, - ) -> thread::JoinHandle<()> { - thread::spawn(move || { - // println!("TestSchdulerWorker thread running"); - core_affinity::set_for_current(id); - active_cnt.fetch_add(1, Ordering::SeqCst); - let mut rng = rand::thread_rng(); - let t = rand::distributions::Uniform::from(0..worker.node_work_stealers.len()); - let mut timer = std::time::Instant::now(); - // let mut cur_tasks = num_tasks.load(Ordering::SeqCst); - while worker.active.load(Ordering::SeqCst) - || !(worker.work_q.is_empty() && worker.node_work_inj.is_empty()) - || num_tasks.load(Ordering::SeqCst) > 1 - { - // let ot = Instant::now(); - // if cur_tasks != num_tasks.load(Ordering::SeqCst){ - // println!( - // "work_q size {:?} work inj size {:?} num_tasks {:?}", - // worker.work_q.len(), - // worker.work_inj.len(), - // num_tasks.load(Ordering::SeqCst) - // ); - // cur_tasks = num_tasks.load(Ordering::SeqCst); - - // } - let omsg = worker.work_q.pop().or_else(|| { - if worker - .work_flag - .compare_exchange(0, 1, Ordering::SeqCst, Ordering::Relaxed) - == Ok(0) - { - let ret = worker - .node_work_inj - .steal_batch_and_pop(&worker.work_q) - .success(); - worker.work_flag.store(0, Ordering::SeqCst); - ret - } else { - worker.node_work_stealers[t.sample(&mut rng)] - .steal() - .success() - } - }); - if let Some(runnable) = omsg { - if !worker.active.load(Ordering::SeqCst) && timer.elapsed().as_secs_f64() > 60.0 - { - println!("runnable {:?}", runnable); - println!( - "work_q size {:?} work inj size {:?} num_tasks {:?}", - worker.work_q.len(), - worker.node_work_inj.len(), - num_tasks.load(Ordering::SeqCst) - ); - timer = std::time::Instant::now(); - } - runnable.run(); - } - if !worker.active.load(Ordering::SeqCst) - && timer.elapsed().as_secs_f64() > 60.0 - && (worker.work_q.len() > 0 || worker.node_work_inj.len() > 0) - { - println!( - "work_q size {:?} work inj size {:?} num_tasks {:?}", - worker.work_q.len(), - worker.node_work_inj.len(), - num_tasks.load(Ordering::SeqCst) - ); - timer = std::time::Instant::now(); - } - // if timer.elapsed().as_secs_f64() > 60.0 { - // println!( - // "work_q size {:?} work inj size {:?} num_tasks {:?}", - // worker.work_q.len(), - // worker.node_work_inj.len(), - // num_tasks.load(Ordering::SeqCst) - // ); - // timer = std::time::Instant::now() - // } - } - active_cnt.fetch_sub(1, Ordering::SeqCst); - // println!("TestSchdulerWorker thread shutting down"); - }) - } -} - -/* -create a work injector and stealer for each numa node, -additionally create a threadlocal counter that each thread will use to index -into the to appropriate work injector when submitting work -*/ -#[derive(Debug)] -pub(crate) struct NumaWorkStealingInner { - threads: Vec>, - work_inj: Vec>>, - work_stealers: HashMap>>, - work_flag: Arc, - active: Arc, - active_cnt: Arc, - num_tasks: Arc, - stall_mark: Arc, - local_work_inj: ThreadLocal, - nodes: Vec, -} - -impl AmeSchedulerQueue for NumaWorkStealingInner { - fn submit_am( - //unserialized request - &self, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - ame: Arc, - am: Am, - ) { - let num_tasks = self.num_tasks.clone(); - let stall_mark = self.stall_mark.fetch_add(1, Ordering::Relaxed); - let future = async move { - // println!("exec req {:?}",num_tasks.load(Ordering::Relaxed)); - 
num_tasks.fetch_add(1, Ordering::Relaxed); - // println!("in submit_req {:?} {:?} {:?} ", pe.clone(), req_data.src, req_data.pe); - ame.process_msg(am, scheduler, stall_mark).await; - // println!("num tasks: {:?}",); - num_tasks.fetch_sub(1, Ordering::Relaxed); - // println!("done req {:?}",num_tasks.load(Ordering::Relaxed)); - }; - let work_inj = self.work_inj[self - .local_work_inj - .get_or(|| AtomicUsize::new(0)) - .fetch_add(1, Ordering::SeqCst) - % self.work_inj.len()] - .clone(); - let schedule = move |runnable| work_inj.push(runnable); - let (runnable, task) = unsafe { async_task::spawn_unchecked(future, schedule) }; //safe as contents are sync+send, and no borrowed variables - runnable.schedule(); - task.detach(); - } - - //this is a serialized request - fn submit_work( - &self, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - ame: Arc, - data: SerializedData, - lamellae: Arc, - ) { - // let work_inj = self.work_inj.clone(); - // println!("submit work {:?}",self.num_tasks.load(Ordering::Relaxed)); - let num_tasks = self.num_tasks.clone(); - let future = async move { - // println!("exec work {:?}",num_tasks.load(Ordering::Relaxed)+1); - num_tasks.fetch_add(1, Ordering::Relaxed); - if let Some(header) = data.deserialize_header() { - let msg = header.msg; - ame.exec_msg(msg, data, lamellae, scheduler).await; - } else { - data.print(); - panic!("should i be here?"); - } - // println!("num tasks: {:?}",); - num_tasks.fetch_sub(1, Ordering::Relaxed); - // println!("done work {:?}",num_tasks.load(Ordering::Relaxed)); - }; - let work_inj = self.work_inj[self - .local_work_inj - .get_or(|| AtomicUsize::new(0)) - .fetch_add(1, Ordering::SeqCst) - % self.work_inj.len()] - .clone(); - let schedule = move |runnable| work_inj.push(runnable); - let (runnable, task) = unsafe { async_task::spawn_unchecked(future, schedule) }; //safe as contents are sync+send, and no borrowed variables - runnable.schedule(); - task.detach(); - } - - fn submit_task(&self, future: F) - where - F: Future, - { - // println!("submit task {:?}",self.num_tasks.load(Ordering::Relaxed)); - let num_tasks = self.num_tasks.clone(); - let future2 = async move { - // println!("exec task {:?}",num_tasks.load(Ordering::Relaxed)+1); - num_tasks.fetch_add(1, Ordering::Relaxed); - future.await; - num_tasks.fetch_sub(1, Ordering::Relaxed); - // println!("done task {:?}",num_tasks.load(Ordering::Relaxed)); - }; - let work_inj = self.work_inj[self - .local_work_inj - .get_or(|| AtomicUsize::new(0)) - .fetch_add(1, Ordering::SeqCst) - % self.work_inj.len()] - .clone(); - let schedule = move |runnable| work_inj.push(runnable); - let (runnable, task) = unsafe { async_task::spawn_unchecked(future2, schedule) }; //safe //safe as contents are sync+send... may need to do something to enforce lifetime bounds - runnable.schedule(); - task.detach(); - } - - fn block_on(&self, future: F) -> F::Output { - let work_inj = self.work_inj[self - .local_work_inj - .get_or(|| AtomicUsize::new(0)) - .fetch_add(1, Ordering::SeqCst) - % self.work_inj.len()] - .clone(); - let schedule = move |runnable| work_inj.push(runnable); - let (runnable, mut task) = unsafe { async_task::spawn_unchecked(future, schedule) }; //safe //safe as contents are sync+send... 
may need to do something to enforce lifetime bounds - let waker = runnable.waker(); - runnable.schedule(); - while !task.is_finished() { - self.exec_task(); - } - let cx = &mut async_std::task::Context::from_waker(&waker); - if let async_std::task::Poll::Ready(output) = task.poll(cx) { - output - } else { - panic!("task not ready"); - } - } - - fn shutdown(&self) { - // println!("work stealing shuting down {:?}", self.active()); - self.active.store(false, Ordering::SeqCst); - // println!("work stealing shuting down {:?}",self.active()); - while self.active_cnt.load(Ordering::Relaxed) > 2 - || self.num_tasks.load(Ordering::Relaxed) > 2 - { - //this should be the recvtask, and alloc_task - std::thread::yield_now() - } - // println!( - // "work stealing shut down {:?} {:?} {:?}", - // self.active(), - // self.active_cnt.load(Ordering::Relaxed), - // self.active_cnt.load(Ordering::Relaxed) - // ); - } - - fn exec_task(&self) { - let mut rng = rand::thread_rng(); - // let c = rand::distributions::Uniform::from(0..self.work_stealers.len()); - // let c = rand::distributions::Uniform::from - let ret = if self - .work_flag - .compare_exchange(0, 1, Ordering::SeqCst, Ordering::Relaxed) - == Ok(0) - { - let ret = self - .nodes - .choose_multiple(&mut rng, self.nodes.len()) - .find_map(|node| self.work_inj[*node % self.nodes.len()].steal().success()); - self.work_flag.store(0, Ordering::SeqCst); - ret - } else { - self.nodes - .choose_multiple(&mut rng, self.nodes.len()) - .find_map(|node| { - self.work_stealers[node] - .choose(&mut rng) - .unwrap() - .steal() - .success() - }) - }; - if let Some(runnable) = ret { - runnable.run(); - } - } - - fn active(&self) -> bool { - // println!("sched active {:?} {:?}",self.active.load(Ordering::SeqCst) , self.num_tasks.load(Ordering::SeqCst)); - self.active.load(Ordering::SeqCst) || self.num_tasks.load(Ordering::SeqCst) > 2 - } -} - -impl SchedulerQueue for NumaWorkStealing { - fn submit_am( - //unserialized request - &self, - am: Am, - ) { - self.inner.submit_am(self, self.ame.clone(), am); - } - - // fn submit_return(&self, src, pe) - - fn submit_work(&self, data: SerializedData, lamellae: Arc) { - self.inner - .submit_work(self, self.ame.clone(), data, lamellae); - } - - fn submit_task(&self, future: F) - where - F: Future, - { - self.inner.submit_task(future); - } - - fn exec_task(&self) { - self.inner.exec_task(); - } - - fn submit_task_node(&self, future: F, _node: usize) - where - F: Future, - { - self.inner.submit_task(future); - } - - fn block_on(&self, future: F) -> F::Output - where - F: Future, - { - self.inner.block_on(future) - } - - fn shutdown(&self) { - self.inner.shutdown(); - } - fn active(&self) -> bool { - self.inner.active() - } -} - -impl NumaWorkStealingInner { - pub(crate) fn new(stall_mark: Arc) -> NumaWorkStealingInner { - // println!("new work stealing queue"); - - let mut sched = NumaWorkStealingInner { - threads: Vec::new(), - work_inj: Vec::new(), //Arc::new(crossbeam::deque::Injector::new()), - work_stealers: HashMap::new(), //Vec::new(), - work_flag: Arc::new(AtomicU8::new(0)), - active: Arc::new(AtomicBool::new(true)), - active_cnt: Arc::new(AtomicUsize::new(0)), - num_tasks: Arc::new(AtomicUsize::new(0)), - stall_mark: stall_mark, - local_work_inj: ThreadLocal::new(), - nodes: Vec::new(), - }; - sched.local_work_inj.get_or(|| AtomicUsize::new(0)); - sched.init(); - sched - } - - fn init(&mut self) { - let num_workers = match std::env::var("LAMELLAR_THREADS") { - Ok(n) => n.parse::().unwrap() - 1, - Err(_) => 4, - }; - let 
core_ids = core_affinity::get_core_ids().unwrap(); - println!("core_ids: {:?}", core_ids); - let mut node_to_cores: HashMap> = HashMap::new(); - let mut core_to_node: HashMap = HashMap::new(); - - let mut cur_worker_cnt = 0; - - if let Ok(nodes) = glob::glob("/sys/devices/system/node/node*") { - for node in nodes { - if let Ok(node_path) = node { - if let Some(node) = format!("{}", node_path.display()).split("/").last() { - if let Some(node) = node.strip_prefix("node") { - if let Ok(node) = node.parse::() { - if let Ok(cpus) = - glob::glob(&format!("{}/cpu*", node_path.display())) - { - let mut cores = Vec::new(); - for cpu in cpus { - if let Ok(cpu) = cpu { - if let Some(cpu) = - format!("{}", cpu.display()).split("/").last() - { - if let Some(cpu) = cpu.strip_prefix("cpu") { - if let Ok(cpu) = cpu.parse::() { - for core_id in core_ids.iter() { - if core_id.id == cpu { - core_to_node.insert(cpu, node); - cores.push(cpu); - cur_worker_cnt += 1; - } - if cur_worker_cnt >= num_workers { - break; - } - } - } - } - } - } - } - if cores.len() > 0 { - node_to_cores.insert(node, cores); - } - if cur_worker_cnt >= num_workers { - break; - } - } - } - } - } - } - } - } - println!("node_to_cores {:?}", node_to_cores); - println!("core_to_node {:?}", core_to_node); - - let mut work_workers = HashMap::new(); - for (node, cores) in &node_to_cores { - let mut node_work_workers: std::vec::Vec< - crossbeam::deque::Worker, - > = vec![]; - let mut node_work_stealers = vec![]; - for _core in cores { - let core_work_worker: crossbeam::deque::Worker = - crossbeam::deque::Worker::new_fifo(); - node_work_stealers.push(core_work_worker.stealer()); - node_work_workers.push(core_work_worker); - } - self.work_inj - .push(Arc::new(crossbeam::deque::Injector::new())); - self.work_stealers.insert(*node, node_work_stealers); - work_workers.insert(node, node_work_workers); - self.nodes.push(*node); - } - - let orig_hook = panic::take_hook(); - panic::set_hook(Box::new(move |panic_info| { - // invoke the default handler and exit the process - orig_hook(panic_info); - process::exit(1); - })); - - let mut inj = 0; - for (node, cores) in &node_to_cores { - let node_work_workers = work_workers.get_mut(&node).unwrap(); - for core in cores { - let core_work_worker = node_work_workers.pop().unwrap(); - let worker = NumaWorkStealingThread { - node_work_inj: self.work_inj[inj].clone(), - _sys_work_inj: self.work_inj.clone(), - node_work_stealers: self.work_stealers.get(&node).unwrap().clone(), - _sys_work_stealers: self.work_stealers.clone(), - work_q: core_work_worker, - work_flag: self.work_flag.clone(), - active: self.active.clone(), - }; - self.threads.push(NumaWorkStealingThread::run( - worker, - self.active_cnt.clone(), - self.num_tasks.clone(), - CoreId { id: *core }, - )); - } - inj += 1; - } - - while self.active_cnt.load(Ordering::SeqCst) != self.threads.len() { - std::thread::yield_now(); - } - } -} - -#[derive(Debug)] -pub(crate) struct NumaWorkStealing { - inner: &(impl SchedulerQueue + Sync + std::fmt::Debug), - ame: Arc, -} -impl NumaWorkStealing { - pub(crate) fn new( - num_pes: usize, - // my_pe: usize, - // teams: Arc>>>, - ) -> NumaWorkStealing { - // println!("new work stealing queue"); - let stall_mark = Arc::new(AtomicUsize::new(0)); - let inner = Arc::new(AmeScheduler::NumaWorkStealingInner( - NumaWorkStealingInner::new(stall_mark.clone()), - )); - let batcher = match std::env::var("LAMELLAR_BATCHER") { - Ok(n) => { - let n = n.parse::().unwrap(); - if n == 1 { - 
BatcherType::Simple(SimpleBatcher::new(num_pes, stall_mark.clone())) - } else { - BatcherType::TeamAm(TeamAmBatcher::new(num_pes, stall_mark.clone())) - } - } - Err(_) => BatcherType::TeamAm(TeamAmBatcher::new(num_pes, stall_mark.clone())), - }; - let sched = NumaWorkStealing { - inner: inner.clone(), - ame: Arc::new(ActiveMessageEngineType::RegisteredActiveMessages( - RegisteredActiveMessages::new(batcher), - )), - }; - sched - } -} - -impl Drop for NumaWorkStealingInner { - //when is this called with respect to world? - fn drop(&mut self) { - // println!("dropping work stealing"); - while let Some(thread) = self.threads.pop() { - if thread.thread().id() != std::thread::current().id() { - let _res = thread.join(); - } - } - for val in self.local_work_inj.iter_mut() { - println!("local_work_inj {:?}", val.load(Ordering::SeqCst)); - } - // println!("NumaWorkStealing Scheduler Dropped"); - } -} diff --git a/src/scheduler/numa_work_stealing2.rs b/src/scheduler/numa_work_stealing2.rs deleted file mode 100644 index ec82c3ef..00000000 --- a/src/scheduler/numa_work_stealing2.rs +++ /dev/null @@ -1,569 +0,0 @@ -use crate::active_messaging::{ActiveMessageEngine, ActiveMessageEngineType, Am}; -use crate::lamellae::{Des, Lamellae, SerializedData}; -use crate::scheduler::batching::simple_batcher::SimpleBatcher; -use crate::scheduler::batching::team_am_batcher::TeamAmBatcher; -use crate::scheduler::batching::BatcherType; -use crate::scheduler::registered_active_message::RegisteredActiveMessages; -use crate::scheduler::{AmeScheduler, AmeSchedulerQueue, SchedulerQueue}; -// use log::trace; -use core_affinity::CoreId; -use crossbeam::deque::Worker; -use futures::Future; -use futures_lite::FutureExt; -// use parking_lot::RwLock; -use rand::prelude::*; -use std::collections::HashMap; -use std::panic; -use std::process; -use std::sync::atomic::{AtomicBool, AtomicU8, AtomicUsize, Ordering}; -use std::sync::Arc; //, Weak}; -use std::thread; -// use thread_local::ThreadLocal; -// use std::time::Instant; - -#[derive(Debug)] -pub(crate) struct NumaWorkStealing2Thread { - work_inj: Arc>, - work_stealers: Vec>, - work_q: Worker, - work_flag: Arc, - active: Arc, -} - -impl NumaWorkStealing2Thread { - fn run( - worker: NumaWorkStealing2Thread, - active_cnt: Arc, - num_tasks: Arc, - id: CoreId, - ) -> thread::JoinHandle<()> { - thread::spawn(move || { - // println!("TestSchdulerWorker thread running"); - core_affinity::set_for_current(id); - active_cnt.fetch_add(1, Ordering::SeqCst); - let mut rng = rand::thread_rng(); - let t = rand::distributions::Uniform::from(0..worker.work_stealers.len()); - let mut timer = std::time::Instant::now(); - // let mut cur_tasks = num_tasks.load(Ordering::SeqCst); - while worker.active.load(Ordering::SeqCst) - || !(worker.work_q.is_empty() && worker.work_inj.is_empty()) - || num_tasks.load(Ordering::SeqCst) > 1 - { - // let ot = Instant::now(); - // if cur_tasks != num_tasks.load(Ordering::SeqCst){ - // println!( - // "work_q size {:?} work inj size {:?} num_tasks {:?}", - // worker.work_q.len(), - // worker.work_inj.len(), - // num_tasks.load(Ordering::SeqCst) - // ); - // cur_tasks = num_tasks.load(Ordering::SeqCst); - - // } - let omsg = worker.work_q.pop().or_else(|| { - if worker - .work_flag - .compare_exchange(0, 1, Ordering::SeqCst, Ordering::Relaxed) - == Ok(0) - { - let ret = worker - .work_inj - .steal_batch_and_pop(&worker.work_q) - .success(); - worker.work_flag.store(0, Ordering::SeqCst); - ret - } else { - worker.work_stealers[t.sample(&mut rng)].steal().success() - } - 
}); - if let Some(runnable) = omsg { - if !worker.active.load(Ordering::SeqCst) && timer.elapsed().as_secs_f64() > 60.0 - { - println!("runnable {:?}", runnable); - println!( - "work_q size {:?} work inj size {:?} num_tasks {:?}", - worker.work_q.len(), - worker.work_inj.len(), - num_tasks.load(Ordering::SeqCst) - ); - timer = std::time::Instant::now(); - } - runnable.run(); - } - if !worker.active.load(Ordering::SeqCst) - && timer.elapsed().as_secs_f64() > 60.0 - && (worker.work_q.len() > 0 || worker.work_inj.len() > 0) - { - println!( - "work_q size {:?} work inj size {:?} num_tasks {:?}", - worker.work_q.len(), - worker.work_inj.len(), - num_tasks.load(Ordering::SeqCst) - ); - timer = std::time::Instant::now(); - } - // if timer.elapsed().as_secs_f64() > 60.0 { - // println!( - // "work_q size {:?} work inj size {:?} num_tasks {:?}", - // worker.work_q.len(), - // worker.work_inj.len(), - // num_tasks.load(Ordering::SeqCst) - // ); - // timer = std::time::Instant::now() - // } - } - active_cnt.fetch_sub(1, Ordering::SeqCst); - // println!("TestSchdulerWorker thread shutting down"); - }) - } -} - -#[derive(Debug)] -pub(crate) struct NumaWorkStealing2Inner { - threads: Vec>, - work_inj: Arc>, - work_stealers: Vec>, - work_flag: Arc, - active: Arc, - active_cnt: Arc, - num_tasks: Arc, - stall_mark: Arc, -} - -impl AmeSchedulerQueue for NumaWorkStealing2Inner { - fn submit_am( - //unserialized request - &self, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - ame: Arc, - am: Am, - ) { - let num_tasks = self.num_tasks.clone(); - let stall_mark = self.stall_mark.fetch_add(1, Ordering::Relaxed); - let future = async move { - // println!("exec req {:?}",num_tasks.load(Ordering::Relaxed)); - num_tasks.fetch_add(1, Ordering::Relaxed); - // println!("in submit_req {:?} {:?} {:?} ", pe.clone(), req_data.src, req_data.pe); - ame.process_msg(am, scheduler, stall_mark).await; - // println!("num tasks: {:?}",); - num_tasks.fetch_sub(1, Ordering::Relaxed); - // println!("done req {:?}",num_tasks.load(Ordering::Relaxed)); - }; - let work_inj = self.work_inj.clone(); - let schedule = move |runnable| work_inj.push(runnable); - let (runnable, task) = unsafe { async_task::spawn_unchecked(future, schedule) }; //safe as contents are sync+send, and no borrowed variables - runnable.schedule(); - task.detach(); - } - - //this is a serialized request - fn submit_work( - &self, - scheduler: impl SchedulerQueue + Sync + Send + Clone + std::fmt::Debug + 'static, - ame: Arc, - data: SerializedData, - lamellae: Arc, - ) { - // let work_inj = self.work_inj.clone(); - // println!("submit work {:?}",self.num_tasks.load(Ordering::Relaxed)); - let num_tasks = self.num_tasks.clone(); - let future = async move { - // println!("exec work {:?}",num_tasks.load(Ordering::Relaxed)+1); - num_tasks.fetch_add(1, Ordering::Relaxed); - if let Some(header) = data.deserialize_header() { - let msg = header.msg; - ame.exec_msg(msg, data, lamellae, scheduler).await; - } else { - data.print(); - panic!("should i be here?"); - } - // println!("num tasks: {:?}",); - num_tasks.fetch_sub(1, Ordering::Relaxed); - // println!("done work {:?}",num_tasks.load(Ordering::Relaxed)); - }; - let work_inj = self.work_inj.clone(); - let schedule = move |runnable| work_inj.push(runnable); - let (runnable, task) = unsafe { async_task::spawn_unchecked(future, schedule) }; //safe as contents are sync+send, and no borrowed variables - runnable.schedule(); - task.detach(); - } - - fn submit_task(&self, future: F) - where - F: 
Future, - { - // println!("submit task {:?}",self.num_tasks.load(Ordering::Relaxed)); - let num_tasks = self.num_tasks.clone(); - let future2 = async move { - // println!("exec task {:?}",num_tasks.load(Ordering::Relaxed)+1); - num_tasks.fetch_add(1, Ordering::Relaxed); - future.await; - num_tasks.fetch_sub(1, Ordering::Relaxed); - // println!("done task {:?}",num_tasks.load(Ordering::Relaxed)); - }; - let work_inj = self.work_inj.clone(); - let schedule = move |runnable| work_inj.push(runnable); - let (runnable, task) = unsafe { async_task::spawn_unchecked(future2, schedule) }; //safe //safe as contents are sync+send... may need to do something to enforce lifetime bounds - runnable.schedule(); - task.detach(); - } - - fn block_on(&self, future: F) -> F::Output - where - F: Future, - { - let work_inj = self.work_inj.clone(); - let schedule = move |runnable| work_inj.push(runnable); - let (runnable, mut task) = unsafe { async_task::spawn_unchecked(future, schedule) }; //safe //safe as contents are sync+send... may need to do something to enforce lifetime bounds - let waker = runnable.waker(); - runnable.schedule(); - while !task.is_finished() { - self.exec_task(); - } - let cx = &mut async_std::task::Context::from_waker(&waker); - if let async_std::task::Poll::Ready(output) = task.poll(cx) { - output - } else { - panic!("task not ready"); - } - } - - fn shutdown(&self) { - // println!("work stealing shuting down {:?}", self.active()); - self.active.store(false, Ordering::SeqCst); - // println!("work stealing shuting down {:?}",self.active()); - while self.active_cnt.load(Ordering::Relaxed) > 2 - || self.num_tasks.load(Ordering::Relaxed) > 2 - { - //this should be the recvtask, and alloc_task - std::thread::yield_now() - } - // println!( - // "work stealing shut down {:?} {:?} {:?}", - // self.active(), - // self.active_cnt.load(Ordering::Relaxed), - // self.active_cnt.load(Ordering::Relaxed) - // ); - } - - fn exec_task(&self) { - let mut rng = rand::thread_rng(); - let t = rand::distributions::Uniform::from(0..self.work_stealers.len()); - let ret = if self - .work_flag - .compare_exchange(0, 1, Ordering::SeqCst, Ordering::Relaxed) - == Ok(0) - { - let ret = self.work_inj.steal().success(); - self.work_flag.store(0, Ordering::SeqCst); - ret - } else { - self.work_stealers[t.sample(&mut rng)].steal().success() - }; - if let Some(runnable) = ret { - runnable.run(); - } - } - - fn active(&self) -> bool { - // println!("sched active {:?} {:?}",self.active.load(Ordering::SeqCst) , self.num_tasks.load(Ordering::SeqCst)); - self.active.load(Ordering::SeqCst) || self.num_tasks.load(Ordering::SeqCst) > 2 - } -} - -impl SchedulerQueue for NumaWorkStealing2 { - fn submit_am( - //unserialized request - &self, - am: Am, - ) { - let node = - CUR_NODE.with(|cur_node| cur_node.fetch_add(1, Ordering::Relaxed) & self.node_mask); - - self.inners[node].submit_am(self, self.ames[node].clone(), am); - } - - // fn submit_return(&self, src, pe) - - fn submit_work(&self, data: SerializedData, lamellae: Arc) { - // let node = if let Some(header) = data.deserialize_header() { - // let msg = header.msg; - // if let ExecType::Am(cmd) = msg.cmd.clone() { - // match cmd { - // Cmd::BatchedDataReturn | Cmd::BatchedAmReturn => { - // println!( - // "got batched return {:x} {:x}", - // msg.req_id.id, - // msg.req_id.id & self.node_mask - // ); - // msg.req_id.id & self.node_mask - // } - // _ => CUR_NODE - // .with(|cur_node| cur_node.fetch_add(1, Ordering::Relaxed) & self.node_mask), - // } - // } else { - // 
CUR_NODE.with(|cur_node| cur_node.fetch_add(1, Ordering::Relaxed) & self.node_mask) - // } - // } else { - // CUR_NODE.with(|cur_node| cur_node.fetch_add(1, Ordering::Relaxed) & self.node_mask) - // }; - // println!("submit work {:?}", node); - let node = - CUR_NODE.with(|cur_node| cur_node.fetch_add(1, Ordering::Relaxed) & self.node_mask); - self.inners[node].submit_work(self, self.ames[node].clone(), data, lamellae); - } - - fn submit_task(&self, future: F) - where - F: Future, - { - let node = - CUR_NODE.with(|cur_node| cur_node.fetch_add(1, Ordering::Relaxed) & self.node_mask); - self.inners[node].submit_task(future); - } - - fn exec_task(&self) { - let node = - CUR_NODE.with(|cur_node| cur_node.fetch_add(1, Ordering::Relaxed) & self.node_mask); - self.inners[node].exec_task(); - } - - fn submit_task_node(&self, future: F, node: usize) - where - F: Future, - { - self.inners[node].submit_task(future); - } - - fn block_on(&self, future: F) -> F::Output - where - F: Future, - { - let node = - CUR_NODE.with(|cur_node| cur_node.fetch_add(1, Ordering::Relaxed) & self.node_mask); - self.inners[node].block_on(future) - } - - fn shutdown(&self) { - for inner in self.inners.iter() { - inner.shutdown(); - } - } - fn active(&self) -> bool { - for inner in self.inners.iter() { - if inner.active() { - return true; - } - } - return false; - } -} - -impl NumaWorkStealing2Inner { - pub(crate) fn new( - stall_mark: Arc, - core_ids: Vec, - ) -> NumaWorkStealing2Inner { - // println!("new work stealing queue"); - - let mut sched = NumaWorkStealing2Inner { - threads: Vec::new(), - work_inj: Arc::new(crossbeam::deque::Injector::new()), - work_stealers: Vec::new(), - work_flag: Arc::new(AtomicU8::new(0)), - active: Arc::new(AtomicBool::new(true)), - active_cnt: Arc::new(AtomicUsize::new(0)), - num_tasks: Arc::new(AtomicUsize::new(0)), - stall_mark: stall_mark, - }; - sched.init(core_ids); - sched - } - - fn init(&mut self, core_ids: Vec) { - let mut work_workers: std::vec::Vec> = - vec![]; - // let num_workers = match std::env::var("LAMELLAR_THREADS") { - // Ok(n) => n.parse::().unwrap(), - // Err(_) => 4, - // }; - for _i in 0..core_ids.len() { - let work_worker: crossbeam::deque::Worker = - crossbeam::deque::Worker::new_fifo(); - self.work_stealers.push(work_worker.stealer()); - work_workers.push(work_worker); - } - - let orig_hook = panic::take_hook(); - panic::set_hook(Box::new(move |panic_info| { - // invoke the default handler and exit the process - orig_hook(panic_info); - process::exit(1); - })); - // let core_ids = core_affinity::get_core_ids().unwrap(); - // println!("core_ids: {:?}",core_ids); - for i in 0..core_ids.len() { - let work_worker = work_workers.pop().unwrap(); - let worker = NumaWorkStealing2Thread { - work_inj: self.work_inj.clone(), - work_stealers: self.work_stealers.clone(), - work_q: work_worker, - work_flag: self.work_flag.clone(), - active: self.active.clone(), - // num_tasks: self.num_tasks.clone(), - }; - self.threads.push(NumaWorkStealing2Thread::run( - worker, - self.active_cnt.clone(), - self.num_tasks.clone(), - core_ids[i % core_ids.len()], - )); - } - while self.active_cnt.load(Ordering::SeqCst) != self.threads.len() { - std::thread::yield_now(); - } - } -} - -thread_local! 
{ - static CUR_NODE: AtomicUsize = AtomicUsize::new(0); -} - -#[derive(Debug)] -pub(crate) struct NumaWorkStealing2 { - inners: Vec<&(impl SchedulerQueue + Sync + std::fmt::Debug)>, - ames: Vec>, - node_mask: usize, -} -impl NumaWorkStealing2 { - pub(crate) fn new( - num_pes: usize, - // my_pe: usize, - // teams: Arc>>>, - ) -> NumaWorkStealing2 { - // println!("new work stealing queue"); - - let num_workers = match std::env::var("LAMELLAR_THREADS") { - Ok(n) => n.parse::().unwrap(), - Err(_) => 4, - }; - let core_ids = core_affinity::get_core_ids().unwrap(); - println!("core_ids: {:?}", core_ids); - let mut node_to_cores: HashMap> = HashMap::new(); - let mut core_to_node: HashMap = HashMap::new(); - - let mut cur_worker_cnt = 0; - - if let Ok(nodes) = glob::glob("/sys/devices/system/node/node*") { - for node in nodes { - if let Ok(node_path) = node { - if let Some(node) = format!("{}", node_path.display()).split("/").last() { - if let Some(node) = node.strip_prefix("node") { - if let Ok(node) = node.parse::() { - if let Ok(cpus) = - glob::glob(&format!("{}/cpu*", node_path.display())) - { - let mut cores = Vec::new(); - for cpu in cpus { - if let Ok(cpu) = cpu { - if let Some(cpu) = - format!("{}", cpu.display()).split("/").last() - { - if let Some(cpu) = cpu.strip_prefix("cpu") { - if let Ok(cpu) = cpu.parse::() { - for core_id in core_ids.iter() { - if core_id.id == cpu { - core_to_node.insert(cpu, node); - cores.push(cpu); - cur_worker_cnt += 1; - } - if cur_worker_cnt >= num_workers { - break; - } - } - } - } - } - } - } - if cores.len() > 0 { - node_to_cores.insert(node, cores); - } - if cur_worker_cnt >= num_workers { - break; - } - } - } - } - } - } - } - } - println!("node_to_cores {:?}", node_to_cores); - println!("core_to_node {:?}", core_to_node); - - let mut inners = vec![]; - let mut ames = vec![]; - - let mut node_mask = node_to_cores.len() - 1; - node_mask |= node_mask >> 1; - node_mask |= node_mask >> 2; - node_mask |= node_mask >> 4; - node_mask |= node_mask >> 8; - node_mask |= node_mask >> 16; - node_mask |= node_mask >> 32; - - // let mut node_i = 0; - let stall_mark = Arc::new(AtomicUsize::new(0)); - for (_node, cores) in node_to_cores.iter() { - let mut core_ids = vec![]; - for core in cores { - core_ids.push(CoreId { id: *core }); - } - let inner = Arc::new(AmeScheduler::NumaWorkStealing2Inner( - NumaWorkStealing2Inner::new(stall_mark.clone(), core_ids), - )); - let batcher = match std::env::var("LAMELLAR_BATCHER") { - Ok(n) => { - let n = n.parse::().unwrap(); - if n == 1 { - BatcherType::Simple(SimpleBatcher::new(num_pes, stall_mark.clone())) - } else { - BatcherType::TeamAm(TeamAmBatcher::new(num_pes, stall_mark.clone())) - } - } - Err(_) => BatcherType::TeamAm(TeamAmBatcher::new(num_pes, stall_mark.clone())), - }; - ames.push(Arc::new(ActiveMessageEngineType::RegisteredActiveMessages( - RegisteredActiveMessages::new(batcher), - ))); - inners.push(inner); - // node_i += 1; - } - - println!("numa node mask: {:x}", node_mask); - - let sched = NumaWorkStealing2 { - inners: inners, - ames: ames, - node_mask: node_mask, - }; - sched - } -} - -impl Drop for NumaWorkStealing2Inner { - //when is this called with respect to world? 
- fn drop(&mut self) { - // println!("dropping work stealing"); - while let Some(thread) = self.threads.pop() { - if thread.thread().id() != std::thread::current().id() { - let _res = thread.join(); - } - } - // for val in self.local_work_inj.iter_mut() { - // println!("local_work_inj {:?}", val.load(Ordering::SeqCst)); - // } - // println!("NumaWorkStealing2 Scheduler Dropped"); - } -} diff --git a/src/scheduler/tokio_executor.rs b/src/scheduler/tokio_executor.rs index f9e14ac1..becd7611 100644 --- a/src/scheduler/tokio_executor.rs +++ b/src/scheduler/tokio_executor.rs @@ -1,22 +1,10 @@ -use crate::scheduler::{LamellarExecutor, SchedulerStatus}; +use crate::scheduler::LamellarExecutor; use tokio::runtime::Runtime; use tracing::*; -use async_task::{Builder, Runnable}; -use core_affinity::CoreId; -use crossbeam::deque::Worker; use futures::Future; -use futures_lite::FutureExt; -use rand::prelude::*; -use std::panic; -use std::process; -use std::sync::atomic::{AtomicU8, AtomicUsize, Ordering}; -use std::sync::Arc; //, Weak}; -use std::thread; - -static TASK_ID: AtomicUsize = AtomicUsize::new(0); #[derive(Debug)] pub(crate) struct TokioRt { diff --git a/tests/array/arithmetic_ops/add_test.rs b/tests/array/arithmetic_ops/add_test.rs index 44ae69ab..9f73175a 100644 --- a/tests/array/arithmetic_ops/add_test.rs +++ b/tests/array/arithmetic_ops/add_test.rs @@ -436,7 +436,7 @@ macro_rules! input_test{ // array.add(input_array.clone(),1); // check_results!($array,array,num_pes,"LocalLockArray"); // LocalLockArray------------------------------ - array.batch_add(&input_array.blocking_read_local_data(),1); + let _ = array.batch_add(&input_array.blocking_read_local_data(),1); check_results!($array,array,num_pes,"&LocalLockArray"); println!("passed &LocalLockArray"); @@ -445,7 +445,7 @@ macro_rules! input_test{ // array.add(input_array.clone(),1); // check_results!($array,array,num_pes,"GlobalLockArray"); // GlobalLockArray------------------------------ - array.batch_add(&input_array.blocking_read_local_data(),1); + let _ = array.batch_add(&input_array.blocking_read_local_data(),1); check_results!($array,array,num_pes,"&GlobalLockArray"); println!("passed &GlobalLockArray"); } From 4da5d84db13b3c05331dbc72d0928e2e9df61fe2 Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Wed, 28 Feb 2024 13:54:38 -0800 Subject: [PATCH 014/116] fixing some issues from the rebase --- run_examples.sh | 1 + .../registered_active_message.rs | 10 ++ src/array/iterator/distributed_iterator.rs | 3 +- src/array/local_lock_atomic/local_chunks.rs | 50 ++++++--- src/array/operations.rs | 101 +++++++++--------- src/array/read_only/local_chunks.rs | 15 ++- src/scheduler/tokio_executor.rs | 20 ++-- src/scheduler/work_stealing.rs | 90 ++++++++-------- 8 files changed, 161 insertions(+), 129 deletions(-) diff --git a/run_examples.sh b/run_examples.sh index 9f96ee4f..5c53dfeb 100755 --- a/run_examples.sh +++ b/run_examples.sh @@ -114,6 +114,7 @@ for toolchain in stable; do #nightly; do # done fi cd .. 
+ sleep 5 cur_tasks=`squeue -u frie869 | grep " R " | wc -l` while [ $cur_tasks -gt 3 ]; do cur_tasks=`squeue -u frie869 | grep " R " | wc -l` diff --git a/src/active_messaging/registered_active_message.rs b/src/active_messaging/registered_active_message.rs index fcc017e3..994969f6 100644 --- a/src/active_messaging/registered_active_message.rs +++ b/src/active_messaging/registered_active_message.rs @@ -312,8 +312,18 @@ impl RegisteredActiveMessages { // #[tracing::instrument(skip_all)] async fn send_data_am(&self, req_data: ReqMetaData, data: LamellarResultArc, data_size: usize) { // println!("send_data_am"); + let header = self.create_header(&req_data, Cmd::Data); + let mut darcs = vec![]; + data.ser(1, &mut darcs); //1 because we are only sending back to the original PE + let darc_list_size = crate::serialized_size(&darcs, false); + let data_header = DataHeader { + size: data_size, + req_id: req_data.id, + darc_list_size: darc_list_size, + }; let data_buf = self + .create_data_buf( header, data_size + darc_list_size + *DATA_HEADER_LEN, &req_data.lamellae, diff --git a/src/array/iterator/distributed_iterator.rs b/src/array/iterator/distributed_iterator.rs index adad0083..5c1cdd39 100644 --- a/src/array/iterator/distributed_iterator.rs +++ b/src/array/iterator/distributed_iterator.rs @@ -36,8 +36,7 @@ use take::*; pub(crate) use consumer::*; -use crate::array::iterator::one_sided_iterator::OneSidedIterator; -use crate::array::iterator::{private::*, IterRequest, Schedule}; +use crate::array::iterator::{private::*, Schedule}; use crate::array::{ operations::ArrayOps, AsyncTeamFrom, AtomicArray, Distribution, GenericAtomicArray, LamellarArray, NativeAtomicArray, diff --git a/src/array/local_lock_atomic/local_chunks.rs b/src/array/local_lock_atomic/local_chunks.rs index 1f5d41d1..81dd9202 100644 --- a/src/array/local_lock_atomic/local_chunks.rs +++ b/src/array/local_lock_atomic/local_chunks.rs @@ -13,7 +13,7 @@ pub struct LocalLockLocalChunks { end_index: usize, //global index within the array local data array: LocalLockArray, lock: LocalRwDarc<()>, - lock_guard: Arc>>, + lock_guard: Arc>, } impl IterClone for LocalLockLocalChunks { @@ -36,7 +36,7 @@ pub struct LocalLockLocalChunksMut { end_index: usize, //global index within the array local data array: LocalLockArray, lock: LocalRwDarc<()>, - lock_guard: Arc>>, + lock_guard: Arc>, } impl IterClone for LocalLockLocalChunksMut { @@ -56,7 +56,7 @@ impl IterClone for LocalLockLocalChunksMut { pub struct LocalLockMutChunkLocalData<'a, T: Dist> { data: &'a mut [T], _index: usize, - _lock_guard: Arc>>, + _lock_guard: Arc>, } impl Deref for LocalLockMutChunkLocalData<'_, T> { @@ -71,8 +71,8 @@ impl DerefMut for LocalLockMutChunkLocalData<'_, T> { } } -impl LocalIterator for LocalLockLocalChunks { - type Item = LocalLockLocalData<'static, T>; +impl LocalIterator for LocalLockLocalChunks { + type Item = LocalLockLocalData; type Array = LocalLockArray; fn init(&self, start_i: usize, cnt: usize) -> Self { //these are with respect to the single elements, not chunk indexing and cnt @@ -108,14 +108,7 @@ impl LocalIterator for LocalLockLocalChunks { // start_i, end_i, self.index, self.end_index // ); Some(LocalLockLocalData { - array: self.array.clone(), - data: unsafe { - std::slice::from_raw_parts_mut( - self.array.array.local_as_mut_ptr().offset(start_i as isize), - end_i - start_i, - ) - }, - index: 0, + array: self.array.sub_array(start_i..end_i), lock: self.lock.clone(), lock_guard: self.lock_guard.clone(), }) @@ -132,7 +125,7 @@ impl LocalIterator for 
LocalLockLocalChunks { } } -impl IndexedLocalIterator for LocalLockLocalChunks { +impl IndexedLocalIterator for LocalLockLocalChunks { fn iterator_index(&self, index: usize) -> Option { if index * self.chunk_size < self.array.len() { Some(index) //everyone at this point as calculated the actual index (cause we are local only) so just return it @@ -181,6 +174,7 @@ impl LocalIterator for LocalLockLocalChunksMut { // start_i, end_i, self.index, self.end_index // ); Some(LocalLockMutChunkLocalData { + //TODO we can probably do this similar to non mut way to avoid the unsafe... data: unsafe { std::slice::from_raw_parts_mut( self.array.array.local_as_mut_ptr().offset(start_i as isize), @@ -215,7 +209,19 @@ impl IndexedLocalIterator for LocalLockLocalChunksMut { } impl LocalLockArray { - pub fn read_local_chunks(&self, chunk_size: usize) -> LocalLockLocalChunks { + pub async fn read_local_chunks(&self, chunk_size: usize) -> LocalLockLocalChunks { + let lock = Arc::new(self.lock.read().await); + LocalLockLocalChunks { + chunk_size, + index: 0, + end_index: 0, + array: self.clone(), + lock: self.lock.clone(), + lock_guard: lock, + } + } + + pub fn blocking_read_local_chunks(&self, chunk_size: usize) -> LocalLockLocalChunks { let lock = Arc::new(self.array.block_on(self.lock.read())); LocalLockLocalChunks { chunk_size, @@ -227,7 +233,19 @@ impl LocalLockArray { } } - pub fn write_local_chunks(&self, chunk_size: usize) -> LocalLockLocalChunksMut { + pub async fn write_local_chunks(&self, chunk_size: usize) -> LocalLockLocalChunksMut { + let lock = Arc::new(self.lock.write().await); + LocalLockLocalChunksMut { + chunk_size, + index: 0, + end_index: 0, + array: self.clone(), + lock: self.lock.clone(), + lock_guard: lock, + } + } + + pub fn blocking_write_local_chunks(&self, chunk_size: usize) -> LocalLockLocalChunksMut { let lock = Arc::new(self.array.block_on(self.lock.write())); LocalLockLocalChunksMut { chunk_size, diff --git a/src/array/operations.rs b/src/array/operations.rs index a16f7bf1..da6df929 100644 --- a/src/array/operations.rs +++ b/src/array/operations.rs @@ -245,28 +245,28 @@ pub enum OpInputEnum<'a, T: Dist> { } impl<'a, T: Dist> OpInputEnum<'a, T> { - #[tracing::instrument(skip_all)] - pub(crate) fn iter(&self) -> Box + '_> { - match self { - OpInputEnum::Val(v) => Box::new(std::iter::repeat(v).map(|elem| *elem)), - OpInputEnum::Slice(s) => Box::new(s.iter().map(|elem| *elem)), - OpInputEnum::Vec(v) => Box::new(v.iter().map(|elem| *elem)), - OpInputEnum::NativeAtomicLocalData(a) => Box::new(a.iter().map(|elem| elem.load())), - OpInputEnum::GenericAtomicLocalData(a) => Box::new(a.iter().map(|elem| elem.load())), - OpInputEnum::LocalLockLocalData(a) => Box::new(a.iter().map(|elem| *elem)), - OpInputEnum::GlobalLockLocalData(a) => Box::new(a.iter().map(|elem| *elem)), - // OpInputEnum::MemoryRegion(mr) => Box::new( - // unsafe { mr.as_slice() } - // .expect("memregion not local") - // .iter() - // .map(|elem| *elem), - // ), - // OpInputEnum::UnsafeArray(a) => Box::new(unsafe{a.local_data()}.iter().map(|elem| *elem)), - // OpInputEnum::ReadOnlyArray(a) => Box::new(a.local_data().iter().map(|elem| *elem)), - // OpInputEnum::AtomicArray(a) => Box::new(a.local_data().iter().map(|elem| elem.load())), - } - } - #[tracing::instrument(skip_all)] + // #[tracing::instrument(skip_all)] + // pub(crate) fn iter(&self) -> Box + '_> { + // match self { + // OpInputEnum::Val(v) => Box::new(std::iter::repeat(v).map(|elem| *elem)), + // OpInputEnum::Slice(s) => Box::new(s.iter().map(|elem| *elem)), + // 
OpInputEnum::Vec(v) => Box::new(v.iter().map(|elem| *elem)), + // OpInputEnum::NativeAtomicLocalData(a) => Box::new(a.iter().map(|elem| elem.load())), + // OpInputEnum::GenericAtomicLocalData(a) => Box::new(a.iter().map(|elem| elem.load())), + // OpInputEnum::LocalLockLocalData(a) => Box::new(a.iter().map(|elem| *elem)), + // OpInputEnum::GlobalLockLocalData(a) => Box::new(a.iter().map(|elem| *elem)), + // // OpInputEnum::MemoryRegion(mr) => Box::new( + // // unsafe { mr.as_slice() } + // // .expect("memregion not local") + // // .iter() + // // .map(|elem| *elem), + // // ), + // // OpInputEnum::UnsafeArray(a) => Box::new(unsafe{a.local_data()}.iter().map(|elem| *elem)), + // // OpInputEnum::ReadOnlyArray(a) => Box::new(a.local_data().iter().map(|elem| *elem)), + // // OpInputEnum::AtomicArray(a) => Box::new(a.local_data().iter().map(|elem| elem.load())), + // } + // } + // #[tracing::instrument(skip_all)] pub(crate) fn len(&self) -> usize { match self { OpInputEnum::Val(_) => 1, @@ -305,47 +305,52 @@ impl<'a, T: Dist> OpInputEnum<'a, T> { // //#[tracing::instrument(skip_all)] pub(crate) fn into_vec_chunks(self, chunk_size: usize) -> Vec> { match self { - OpInputEnum::Val(v) =>vec![vec![v]], + OpInputEnum::Val(v) => vec![vec![v]], OpInputEnum::Slice(s) => s.chunks(chunk_size).map(|chunk| chunk.to_vec()).collect(), OpInputEnum::Vec(v) => v.chunks(chunk_size).map(|chunk| chunk.to_vec()).collect(), OpInputEnum::NativeAtomicLocalData(a) => { let mut data = Vec::with_capacity(chunk_size); - a.iter().enumerate().filter_map(move |(i, elem)| { - data.push(elem.load()); - if data.len() == chunk_size || i == a.len() - 1 { - let mut new_data = Vec::with_capacity(chunk_size); - std::mem::swap(&mut data, &mut new_data); - Some(new_data) - } else { - None - } - }).collect() + a.iter() + .enumerate() + .filter_map(move |(i, elem)| { + data.push(elem.load()); + if data.len() == chunk_size || i == a.len() - 1 { + let mut new_data = Vec::with_capacity(chunk_size); + std::mem::swap(&mut data, &mut new_data); + Some(new_data) + } else { + None + } + }) + .collect() } OpInputEnum::GenericAtomicLocalData(a) => { let mut data = Vec::with_capacity(chunk_size); - a.iter().enumerate().filter_map(move |(i, elem)| { - data.push(elem.load()); - if data.len() == chunk_size || i == a.len() - 1 { - let mut new_data = Vec::with_capacity(chunk_size); - std::mem::swap(&mut data, &mut new_data); - Some(new_data) - } else { - None - } - }).collect() + a.iter() + .enumerate() + .filter_map(move |(i, elem)| { + data.push(elem.load()); + if data.len() == chunk_size || i == a.len() - 1 { + let mut new_data = Vec::with_capacity(chunk_size); + std::mem::swap(&mut data, &mut new_data); + Some(new_data) + } else { + None + } + }) + .collect() } OpInputEnum::LocalLockLocalData(a) => { a.chunks(chunk_size).map(|chunk| chunk.to_vec()).collect() } OpInputEnum::GlobalLockLocalData(a) => { a.chunks(chunk_size).map(|chunk| chunk.to_vec()).collect() - } - // OpInputEnum::MemoryRegion(mr) => *unsafe { mr.as_slice() } - // .expect("memregion not local") - // .first() - // .expect("memregion is empty"), + } // OpInputEnum::MemoryRegion(mr) => *unsafe { mr.as_slice() } + // .expect("memregion not local") + // .first() + // .expect("memregion is empty"), } } diff --git a/src/array/read_only/local_chunks.rs b/src/array/read_only/local_chunks.rs index b809c348..a1cb7444 100644 --- a/src/array/read_only/local_chunks.rs +++ b/src/array/read_only/local_chunks.rs @@ -4,7 +4,6 @@ use crate::array::read_only::*; use crate::array::LamellarArray; use 
crate::memregion::Dist; - #[derive(Clone)] pub struct ReadOnlyLocalChunks { chunk_size: usize, @@ -24,10 +23,8 @@ impl IterClone for ReadOnlyLocalChunks { } } - - impl LocalIterator for ReadOnlyLocalChunks { - type Item = &'static [T]; + type Item = &'static [T]; type Array = ReadOnlyArray; fn init(&self, start_i: usize, cnt: usize) -> Self { //these are with respect to the single elements, not chunk indexing and cnt @@ -60,10 +57,12 @@ impl LocalIterator for ReadOnlyLocalChunks { // "start_i {} end_i {} self.index {} self.end_index {}", // start_i, end_i, self.index, self.end_index // ); - Some(unsafe{std::slice::from_raw_parts_mut( - self.array.array.local_as_mut_ptr().offset(start_i as isize), - end_i - start_i, - )}) + Some(unsafe { + std::slice::from_raw_parts_mut( + self.array.array.local_as_mut_ptr().offset(start_i as isize), + end_i - start_i, + ) + }) } else { None } diff --git a/src/scheduler/tokio_executor.rs b/src/scheduler/tokio_executor.rs index becd7611..814e57db 100644 --- a/src/scheduler/tokio_executor.rs +++ b/src/scheduler/tokio_executor.rs @@ -2,8 +2,6 @@ use crate::scheduler::LamellarExecutor; use tokio::runtime::Runtime; -use tracing::*; - use futures::Future; #[derive(Debug)] @@ -18,9 +16,9 @@ impl LamellarExecutor for TokioRt { F: Future + Send + 'static, F::Output: Send, { - trace_span!("submit_task").in_scope(|| { + // trace_span!("submit_task").in_scope(|| { self.rt.spawn(async move { task.await }); - }); + // }); } fn submit_immediate_task(&self, task: F) @@ -28,26 +26,28 @@ impl LamellarExecutor for TokioRt { F: Future + Send + 'static, F::Output: Send, { - trace_span!("submit_task").in_scope(|| { + // trace_span!("submit_task").in_scope(|| { self.rt.spawn(async move { task.await }); - }); + // }); } fn block_on(&self, task: F) -> F::Output { - trace_span!("block_on").in_scope(|| self.rt.block_on(task)) + // trace_span!("block_on").in_scope(|| + self.rt.block_on(task) + // ) } - #[tracing::instrument(skip_all)] + // #[tracing::instrument(skip_all)] fn shutdown(&self) { // i think we just let tokio do this on drop } - #[tracing::instrument(skip_all)] + // #[tracing::instrument(skip_all)] fn force_shutdown(&self) { // i think we just let tokio do this on drop } - #[tracing::instrument(skip_all)] + // #[tracing::instrument(skip_all)] fn exec_task(&self) { // I dont think tokio has a way to do this } diff --git a/src/scheduler/work_stealing.rs b/src/scheduler/work_stealing.rs index c61596d9..9dbf1204 100644 --- a/src/scheduler/work_stealing.rs +++ b/src/scheduler/work_stealing.rs @@ -38,7 +38,7 @@ impl WorkStealingThread { builder .spawn(move || { // println!("TestSchdulerWorker thread running {:?} core: {:?}", std::thread::current().id(), id); - let _span = trace_span!("WorkStealingThread::run"); + // let _span = trace_span!("WorkStealingThread::run"); core_affinity::set_for_current(id); active_cnt.fetch_add(1, Ordering::SeqCst); let mut rng = rand::thread_rng(); @@ -131,16 +131,16 @@ impl LamellarExecutor for WorkStealing { F: Future + Send + 'static, F::Output: Send, { - trace_span!("submit_task").in_scope(|| { - let work_inj = self.work_inj.clone(); - let schedule = move |runnable| work_inj.push(runnable); - let (runnable, task) = Builder::new() - .metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)) - .spawn(move |_task_id| async move { task.await }, schedule); + // trace_span!("submit_task").in_scope(|| { + let work_inj = self.work_inj.clone(); + let schedule = move |runnable| work_inj.push(runnable); + let (runnable, task) = Builder::new() + 
.metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)) + .spawn(move |_task_id| async move { task.await }, schedule); - runnable.schedule(); - task.detach(); - }); + runnable.schedule(); + task.detach(); + // }); } fn submit_immediate_task(&self, task: F) @@ -148,44 +148,44 @@ impl LamellarExecutor for WorkStealing { F: Future + Send + 'static, F::Output: Send, { - trace_span!("submit_task").in_scope(|| { - let imm_inj = self.imm_inj.clone(); - let schedule = move |runnable| imm_inj.push(runnable); - let (runnable, task) = Builder::new() - .metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)) - .spawn(move |_task_id| async move { task.await }, schedule); + // trace_span!("submit_task").in_scope(|| { + let imm_inj = self.imm_inj.clone(); + let schedule = move |runnable| imm_inj.push(runnable); + let (runnable, task) = Builder::new() + .metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)) + .spawn(move |_task_id| async move { task.await }, schedule); - runnable.schedule(); //try to run immediately - task.detach(); - }); + runnable.schedule(); //try to run immediately + task.detach(); + // }); } fn block_on(&self, task: F) -> F::Output { - trace_span!("block_on").in_scope(|| { - let work_inj = self.work_inj.clone(); - let schedule = move |runnable| work_inj.push(runnable); - let (runnable, mut task) = unsafe { - Builder::new() - .metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)) - .spawn_unchecked(move |_task_id| async move { task.await }, schedule) - }; - let waker = runnable.waker(); - runnable.run(); //try to run immediately - while !task.is_finished() { - self.exec_task(); //try to execute another task while this one is not ready - } - let cx = &mut async_std::task::Context::from_waker(&waker); - if let async_std::task::Poll::Ready(output) = task.poll(cx) { - output - } else { - println!( - "[{:?}] work stealing block on failed -- task id{:?}", - std::thread::current().id(), - task.metadata() - ); - panic!("task not ready"); - } - }) + // trace_span!("block_on").in_scope(|| { + let work_inj = self.work_inj.clone(); + let schedule = move |runnable| work_inj.push(runnable); + let (runnable, mut task) = unsafe { + Builder::new() + .metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)) + .spawn_unchecked(move |_task_id| async move { task.await }, schedule) + }; + let waker = runnable.waker(); + runnable.run(); //try to run immediately + while !task.is_finished() { + self.exec_task(); //try to execute another task while this one is not ready + } + let cx = &mut async_std::task::Context::from_waker(&waker); + if let async_std::task::Poll::Ready(output) = task.poll(cx) { + output + } else { + println!( + "[{:?}] work stealing block on failed -- task id{:?}", + std::thread::current().id(), + task.metadata() + ); + panic!("task not ready"); + } + // }) } //#[tracing::instrument(skip_all)] @@ -275,7 +275,7 @@ impl WorkStealing { ws.init(); ws } - #[tracing::instrument(skip_all)] + // #[tracing::instrument(skip_all)] fn init(&mut self) { let mut work_workers: std::vec::Vec>> = vec![]; for _i in 0..self.max_num_threads { From 40f97a835e0cc6d64964db55918dd037eedc929e Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Wed, 6 Mar 2024 10:11:40 -0800 Subject: [PATCH 015/116] implementing into_stream for onesided_iter --- Cargo.toml | 5 +- examples/array_examples/array_put_get.rs | 50 +- examples/darc_examples/string_darc.rs | 18 +- .../kernels/parallel_blocked_array_gemm.rs | 188 ++-- examples/team_examples/random_team.rs | 4 +- impl/src/array_ops.rs | 3 - run_examples.sh | 12 +- src/active_messaging.rs | 10 +- src/active_messaging/batching.rs | 4 +- .../batching/simple_batcher.rs | 156 ++-- .../batching/team_am_batcher.rs | 223 ++--- .../registered_active_message.rs | 85 +- src/array.rs | 57 +- src/array/generic_atomic.rs | 5 + src/array/global_lock_atomic.rs | 10 + .../distributed_iterator/consumer/count.rs | 19 +- .../distributed_iterator/consumer/reduce.rs | 21 +- .../distributed_iterator/consumer/sum.rs | 15 +- src/array/iterator/one_sided_iterator.rs | 422 +++++++-- .../iterator/one_sided_iterator/buffered.rs | 12 + .../iterator/one_sided_iterator/chunks.rs | 127 ++- src/array/iterator/one_sided_iterator/skip.rs | 22 +- .../iterator/one_sided_iterator/step_by.rs | 33 +- src/array/iterator/one_sided_iterator/zip.rs | 115 ++- src/array/local_lock_atomic.rs | 10 + src/array/native_atomic.rs | 4 + src/array/operations.rs | 869 +++++++++--------- src/array/read_only.rs | 72 +- src/array/unsafe.rs | 10 +- src/array/unsafe/rdma.rs | 22 +- src/barrier.rs | 4 +- src/lamellar_request.rs | 36 +- src/lamellar_task_group.rs | 37 + src/lamellar_team.rs | 21 +- src/lamellar_world.rs | 4 + src/scheduler.rs | 48 +- src/scheduler/work_stealing.rs | 2 +- 37 files changed, 1783 insertions(+), 972 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index fd38e93d..f75afd30 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -44,10 +44,7 @@ newtype_derive = "0.1.6" custom_derive = "0.1.7" glob = "0.3.0" thread_local = "1.1.4" -#tracing = "0.1.37" -#tracing-futures = "0.2.5" -#tracing-flame = "0.2.0" -pin-project = "1.0.12" +pin-project = "1.1.4" #enum-as-inner = "0.5.1" #itertools = "0.10.5" serde_with = "3.0.0" diff --git a/examples/array_examples/array_put_get.rs b/examples/array_examples/array_put_get.rs index 9d1463f1..2160a0cd 100644 --- a/examples/array_examples/array_put_get.rs +++ b/examples/array_examples/array_put_get.rs @@ -37,78 +37,78 @@ fn main() { initialize_mem_region(&shared_mem_region); initialize_mem_region(&local_mem_region); println!("data initialized"); - world.barrier(); + world.async_barrier().await; // puts/gets with memregions unsafe { block_array.print(); - world.barrier(); + world.async_barrier().await; println!("PE{my_pe}, smr {:?}", shared_mem_region.as_slice()); - world.barrier(); + world.async_barrier().await; let start = std::time::Instant::now(); if my_pe == 0 { block_array.put(0, &shared_mem_region).await }; //uses the local data of the shared memregion - world.barrier(); + world.async_barrier().await; block_array.print(); - world.barrier(); + world.async_barrier().await; println!("PE{my_pe}, smr {:?}", shared_mem_region.as_slice()); - world.barrier(); + world.async_barrier().await; println!("PE{my_pe}, lmr {:?}", local_mem_region.as_slice()); - world.barrier(); + world.async_barrier().await; if my_pe == 0 { block_array.put(0, &local_mem_region).await }; - world.barrier(); + world.async_barrier().await; block_array.print(); println!("PE{my_pe}, lmr {:?}", local_mem_region.as_slice()); - world.barrier(); + world.async_barrier().await; cyclic_array.print(); - world.barrier(); + world.async_barrier().await; if my_pe == 0 { cyclic_array.put(0, &shared_mem_region).await }; - 
world.barrier(); + world.async_barrier().await; cyclic_array.print(); - world.barrier(); + world.async_barrier().await; println!("PE{my_pe}, smr {:?}", shared_mem_region.as_slice()); - world.barrier(); + world.async_barrier().await; println!("PE{my_pe}, lmr {:?}", local_mem_region.as_slice()); - world.barrier(); + world.async_barrier().await; if my_pe == 0 { cyclic_array.put(0, &local_mem_region).await }; - world.barrier(); + world.async_barrier().await; cyclic_array.print(); println!("put elapsed {:?}", start.elapsed().as_secs_f64()); - world.barrier(); + world.async_barrier().await; initialize_array(&block_array); initialize_array(&cyclic_array); // can use subregions block_array.print(); - world.barrier(); + world.async_barrier().await; println!("PE{my_pe}, smr {:?}", shared_mem_region.as_slice()); - world.barrier(); + world.async_barrier().await; let start = std::time::Instant::now(); block_array.print(); - world.barrier(); + world.async_barrier().await; println!("PE{my_pe}, smr {:?}", shared_mem_region.as_slice()); - world.barrier(); + world.async_barrier().await; if my_pe == 0 { block_array.get_unchecked(0, shared_mem_region.sub_region(0..total_len / 2)) }; //uses local data of the shared memregion println!("PE{my_pe}, lmr {:?}", local_mem_region.as_slice()); - world.barrier(); + world.async_barrier().await; if my_pe == 0 { block_array.get_unchecked(0, local_mem_region.sub_region(0..total_len / 2)) }; - world.barrier(); + world.async_barrier().await; block_array.print(); println!("PE{my_pe}, lmr {:?}", local_mem_region.as_slice()); - world.barrier(); + world.async_barrier().await; println!("get_unchecked elapsed {:?}", start.elapsed().as_secs_f64()); } let start = std::time::Instant::now(); @@ -123,13 +123,13 @@ fn main() { } println!("get elapsed {:?}", start.elapsed().as_secs_f64()); - world.barrier(); + world.async_barrier().await; // puts/gets using single values unsafe { block_array.put(total_len - 1, &12345).await; cyclic_array.put(total_len - 1, &12345).await; } - world.barrier(); + world.async_barrier().await; }); // in the future will be able to use and input/output : diff --git a/examples/darc_examples/string_darc.rs b/examples/darc_examples/string_darc.rs index 37bf7cbb..0092128f 100644 --- a/examples/darc_examples/string_darc.rs +++ b/examples/darc_examples/string_darc.rs @@ -26,15 +26,17 @@ fn main() { println!("[PE: {}] {}", my_pe, string_data.read().await); if my_pe == 0 { - world.block_on(world.exec_am_pe( - 1, - StringDarcAm { - new_data: String::from("Modified string from 0"), - data: string_data.clone(), - }, - )); + world + .exec_am_pe( + 1, + StringDarcAm { + new_data: String::from("Modified string from 0"), + data: string_data.clone(), + }, + ) + .await; } - world.barrier(); + world.async_barrier().await; println!("[PE: {}] {}", my_pe, string_data.read().await); }); } diff --git a/examples/kernels/parallel_blocked_array_gemm.rs b/examples/kernels/parallel_blocked_array_gemm.rs index 53dc9aa0..416950ce 100644 --- a/examples/kernels/parallel_blocked_array_gemm.rs +++ b/examples/kernels/parallel_blocked_array_gemm.rs @@ -1,3 +1,4 @@ +use futures::stream::StreamExt; use lamellar::array::prelude::*; /// ----------------Lamellar Parallel Blocked Array GEMM--------------------------------------------------- /// This performs a distributed GEMM by partitioning the global matrices (stored in LamellarArrya) @@ -86,102 +87,109 @@ fn main() { let nblks_array = nblks_array.into_read_only(); let start = std::time::Instant::now(); - let a = a.clone(); - let b = b.clone(); + 
let a_clone = a.clone(); + let b_clone = b.clone(); let c_clone = c.clone(); - let _ = nblks_array.dist_iter().for_each(move |block| { - //iterate over the submatrix cols of b, use dist_iter() so that we can launch transfers in parallel - // for j_blk in 0..p_blks { - // iterate over submatrix rows of b - let j_blk = block.j; - let k_blk = block.k; - // println!("j_blk: {}, k_blk: {}", j_blk, k_blk); - let b_block = b - .onesided_iter() // OneSidedIterator (each pe will iterate through entirety of b) - .chunks(blocksize) //chunks columns by blocksize -- manages efficent transfer and placement of data into a local memory region - .skip(k_blk * n_blks * blocksize + j_blk) // skip previously transfered submatrices - .step_by(n_blks) //grab chunk from next column in submatrix - // .buffered(100) - .into_iter() // convert to normal rust iterator - .take(blocksize) // we only need to take blocksize columns - .collect::>(); //gather local memory regions containing each columns data - - //need to store the submatrix in a contiguous memory segment for use with the MatrixMultiply library - let mut b_block_vec = vec![0.0; blocksize * blocksize]; - for (j, col) in b_block.iter().enumerate() { - //(index, LocalMemRegion) - let b_block_col = &mut b_block_vec[j * blocksize..(j + 1) * blocksize]; - b_block_col.copy_from_slice(unsafe { col.as_slice().unwrap() }); - } - let b_block_vec = Arc::new(b_block_vec); //we will be sharing this submatrix in multiple tasks - //-------------- - - for i_blk in 0..m_blks_pe { - // iterate of the local submatrix rows of a - let c = c_clone.clone(); - let b_block_vec = b_block_vec.clone(); - let a_vec = a - .local_as_slice() - .chunks(blocksize) - .skip(i_blk * m_blks * blocksize + k_blk) //skip previously visited submatrices - .step_by(m_blks) //grab chunk from the next row in submatrix - .take(blocksize) //we only need to take blocksize rows - .flatten() - .copied() //get values instead of references - .collect::>(); - // a.dist_iter() //DistributedIterator (each pe will iterate through only its local data -- in parallel) - // .chunks(blocksize) //chunks rows by blocksize - // .skip(i_blk * m_blks * blocksize + k_blk) //skip previously visited submatrices - // .step_by(m_blks) //grab chunk from the next row in submatrix - // .take(blocksize) //we only need to take blocksize rows - // .chunks(blocksize) //currently a "hack" for Iterate::collect() - // .for_each(move |a_block| { - // //iterate over local submatrices is submatrix row "i_blk" - // //need to store the submatrix in a contiguous memory segment for use with the MatrixMultiply library - // let mut a_vec = vec![0.0; blocksize * blocksize]; - // for (i, row) in a_block.enumerate() { - // for (j, elem) in row.enumerate() { - // a_vec[i * blocksize + j] = *elem; - // } - // } - // println!("a_vec: {:?}", a_vec); - // ------------------------------- - let mut c_vec = vec![0.0; blocksize * blocksize]; // MatrixMultiple lib stores result in a contiguous memory segment - unsafe { - sgemm( - blocksize, - blocksize, - blocksize, - 1.0, - a_vec.as_ptr(), - blocksize as isize, - 1, - b_block_vec.as_ptr(), - 1, - blocksize as isize, - 0.0, - c_vec.as_mut_ptr(), - blocksize as isize, - 1, - ); + let _ = nblks_array.dist_iter().for_each_async(move |block| { + let b = b_clone.clone(); + let a: ReadOnlyArray = a_clone.clone(); + let c = c_clone.clone(); + async move { + //iterate over the submatrix cols of b, use dist_iter() so that we can launch transfers in parallel + // for j_blk in 0..p_blks { + // iterate over submatrix 
rows of b + let j_blk = block.j; + let k_blk = block.k; + // println!("j_blk: {}, k_blk: {}", j_blk, k_blk); + // let b = b_clone.clone(); + let b_block = b + .onesided_iter() // OneSidedIterator (each pe will iterate through entirety of b) + .chunks(blocksize) //chunks columns by blocksize -- manages efficent transfer and placement of data into a local memory region + .skip(k_blk * n_blks * blocksize + j_blk) // skip previously transfered submatrices + .step_by(n_blks) //grab chunk from next column in submatrix + // .buffered(100) + .into_stream() // convert to normal rust iterator + .take(blocksize) // we only need to take blocksize columns + .collect::>() + .await; //gather local memory regions containing each columns data + // println!("here"); + //need to store the submatrix in a contiguous memory segment for use with the MatrixMultiply library + let mut b_block_vec = vec![0.0; blocksize * blocksize]; + for (j, col) in b_block.iter().enumerate() { + //(index, LocalMemRegion) + let b_block_col = &mut b_block_vec[j * blocksize..(j + 1) * blocksize]; + b_block_col.copy_from_slice(unsafe { col.as_slice().unwrap() }); } + let b_block_vec = Arc::new(b_block_vec); //we will be sharing this submatrix in multiple tasks + //-------------- + + for i_blk in 0..m_blks_pe { + // iterate of the local submatrix rows of a + // let c = c_clone.clone(); + let b_block_vec = b_block_vec.clone(); + let a_vec = a + .local_as_slice() + .chunks(blocksize) + .skip(i_blk * m_blks * blocksize + k_blk) //skip previously visited submatrices + .step_by(m_blks) //grab chunk from the next row in submatrix + .take(blocksize) //we only need to take blocksize rows + .flatten() + .copied() //get values instead of references + .collect::>(); + // a.dist_iter() //DistributedIterator (each pe will iterate through only its local data -- in parallel) + // .chunks(blocksize) //chunks rows by blocksize + // .skip(i_blk * m_blks * blocksize + k_blk) //skip previously visited submatrices + // .step_by(m_blks) //grab chunk from the next row in submatrix + // .take(blocksize) //we only need to take blocksize rows + // .chunks(blocksize) //currently a "hack" for Iterate::collect() + // .for_each(move |a_block| { + // //iterate over local submatrices is submatrix row "i_blk" + // //need to store the submatrix in a contiguous memory segment for use with the MatrixMultiply library + // let mut a_vec = vec![0.0; blocksize * blocksize]; + // for (i, row) in a_block.enumerate() { + // for (j, elem) in row.enumerate() { + // a_vec[i * blocksize + j] = *elem; + // } + // } + // println!("a_vec: {:?}", a_vec); + // ------------------------------- + let mut c_vec = vec![0.0; blocksize * blocksize]; // MatrixMultiple lib stores result in a contiguous memory segment + unsafe { + sgemm( + blocksize, + blocksize, + blocksize, + 1.0, + a_vec.as_ptr(), + blocksize as isize, + 1, + b_block_vec.as_ptr(), + 1, + blocksize as isize, + 0.0, + c_vec.as_mut_ptr(), + blocksize as isize, + 1, + ); + } - let c_slice = c.mut_local_data(); - // let _lock = LOCK.lock(); + let c_slice = c.mut_local_data(); + // let _lock = LOCK.lock(); - for row in 0..blocksize { - let row_offset = (i_blk * blocksize + row) * n; - for col in 0..blocksize { - let col_offset = j_blk * blocksize + col; - c_slice - .at(row_offset + col_offset) - .fetch_add(c_vec[row * blocksize + col]); - //we know all updates to c are local so directly update the raw data - // we could use the array.add interface by calculating the global index: let g_i_blk = i_blk + my_pe *m_blks_pe; and replacing 
it in row_offset - // c.add(row_offset+col_offset,c_vec[row*blocksize + col]); -- but some overheads are introduce from PGAS calculations performed by the runtime, and since its all local updates we can avoid them + for row in 0..blocksize { + let row_offset = (i_blk * blocksize + row) * n; + for col in 0..blocksize { + let col_offset = j_blk * blocksize + col; + c_slice + .at(row_offset + col_offset) + .fetch_add(c_vec[row * blocksize + col]); + //we know all updates to c are local so directly update the raw data + // we could use the array.add interface by calculating the global index: let g_i_blk = i_blk + my_pe *m_blks_pe; and replacing it in row_offset + // c.add(row_offset+col_offset,c_vec[row*blocksize + col]); -- but some overheads are introduce from PGAS calculations performed by the runtime, and since its all local updates we can avoid them + } } + //}); } - //}); } // } }); diff --git a/examples/team_examples/random_team.rs b/examples/team_examples/random_team.rs index 08481088..645f11df 100644 --- a/examples/team_examples/random_team.rs +++ b/examples/team_examples/random_team.rs @@ -194,11 +194,11 @@ fn main() { let world_c = world.clone(); world.block_on(async move { for _ in 0..my_pe { - world_c.barrier(); + world_c.async_barrier().await; } println!("[{:?}] sub_team_path: {:?}", my_pe, sub_team_path.await); for _ in my_pe..num_pes { - world_c.barrier(); + world_c.async_barrier().await; } }); } else { diff --git a/impl/src/array_ops.rs b/impl/src/array_ops.rs index 96bd93ec..f930d59b 100644 --- a/impl/src/array_ops.rs +++ b/impl/src/array_ops.rs @@ -1687,9 +1687,6 @@ pub(crate) fn __derive_arrayops(input: TokenStream) -> TokenStream { use __lamellar::darc::prelude::*; use __lamellar::array::{ ArrayOpCmd, - OpResultOffsets, - PeOpResults, - OpResults, IdxVal, ReadOnlyByteArray, UnsafeByteArray, diff --git a/run_examples.sh b/run_examples.sh index 5c53dfeb..c4fae52e 100755 --- a/run_examples.sh +++ b/run_examples.sh @@ -66,12 +66,17 @@ root=$PWD # cd .. # done +local_results_dir=async_backends +results_dir=${output_dir}/rofiverbs_lamellae/${local_results_dir} ### test using rofi verbs lamellae -rm -r ${output_dir}/rofiverbs_lamellae +rm -r ${results_dir} + rm -r rofiverbs_lamellae -mkdir -p ${output_dir}/rofiverbs_lamellae +mkdir -p rofiverbs_lamellae +mkdir -p ${results_dir} ln -s ${output_dir}/rofiverbs_lamellae rofiverbs_lamellae -cd rofiverbs_lamellae + +cd rofiverbs_lamellae/${local_results_dir} for toolchain in stable; do #nightly; do features="" if [ "${toolchain}" = "nightly" ]; then @@ -114,7 +119,6 @@ for toolchain in stable; do #nightly; do # done fi cd .. 
- sleep 5 cur_tasks=`squeue -u frie869 | grep " R " | wc -l` while [ $cur_tasks -gt 3 ]; do cur_tasks=`squeue -u frie869 | grep " R " | wc -l` diff --git a/src/active_messaging.rs b/src/active_messaging.rs index 0559dc29..1ab9dfc9 100644 --- a/src/active_messaging.rs +++ b/src/active_messaging.rs @@ -1126,6 +1126,8 @@ pub trait ActiveMessaging { ///``` fn barrier(&self); + fn async_barrier(&self) -> impl Future + Send; + #[doc(alias("One-sided", "onesided"))] /// Run a future to completion on the current thread /// @@ -1179,13 +1181,7 @@ pub trait ActiveMessaging { pub(crate) trait ActiveMessageEngine { async fn process_msg(self, am: Am, stall_mark: usize, immediate: bool); - async fn exec_msg( - self, - msg: Msg, - ser_data: SerializedData, - lamellae: Arc, - scheduler: Arc, - ); + async fn exec_msg(self, msg: Msg, ser_data: SerializedData, lamellae: Arc); fn get_team_and_world( &self, diff --git a/src/active_messaging/batching.rs b/src/active_messaging/batching.rs index 11882eb3..6ac3ed08 100644 --- a/src/active_messaging/batching.rs +++ b/src/active_messaging/batching.rs @@ -61,7 +61,7 @@ pub(crate) trait Batcher { ser_data: SerializedData, lamellae: Arc, ame: &RegisteredActiveMessages, - ) -> Vec; + ); } #[derive(Debug, Clone)] @@ -155,7 +155,7 @@ impl Batcher for BatcherType { ser_data: SerializedData, lamellae: Arc, ame: &RegisteredActiveMessages, - ) -> Vec { + ) { match self { BatcherType::Simple(batcher) => { batcher.exec_batched_msg(msg, ser_data, lamellae, ame).await diff --git a/src/active_messaging/batching/simple_batcher.rs b/src/active_messaging/batching/simple_batcher.rs index fbea7277..7f7a709b 100644 --- a/src/active_messaging/batching/simple_batcher.rs +++ b/src/active_messaging/batching/simple_batcher.rs @@ -56,6 +56,7 @@ impl SimpleBatcherInner { pub(crate) struct SimpleBatcher { batched_ams: Arc>, stall_mark: Arc, + executor: Arc, } #[async_trait] @@ -92,17 +93,19 @@ impl Batcher for SimpleBatcher { // "[{:?}] add_remote_am_to_batch submit task", // std::thread::current().id() // ); - while stall_mark != cur_stall_mark.load(Ordering::SeqCst) - && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE - && batch_id == batch.batch_id.load(Ordering::SeqCst) - { - stall_mark = cur_stall_mark.load(Ordering::Relaxed); - async_std::task::yield_now().await; - } - if batch_id == batch.batch_id.load(Ordering::SeqCst) { - //this batch is still valid - SimpleBatcher::create_tx_task(batch).await; - } + self.executor.submit_task(async move { + while stall_mark != cur_stall_mark.load(Ordering::SeqCst) + && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE + && batch_id == batch.batch_id.load(Ordering::SeqCst) + { + stall_mark = cur_stall_mark.load(Ordering::Relaxed); + async_std::task::yield_now().await; + } + if batch_id == batch.batch_id.load(Ordering::SeqCst) { + //this batch is still valid + SimpleBatcher::create_tx_task(batch).await; + } + }); } else if size >= MAX_BATCH_SIZE { // println!("remote size: {:?} ", size); // println!( @@ -145,17 +148,19 @@ impl Batcher for SimpleBatcher { // "[{:?}] add_rerturn_am_to_batch submit task", // std::thread::current().id() // ); - while stall_mark != cur_stall_mark.load(Ordering::SeqCst) - && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE - && batch_id == batch.batch_id.load(Ordering::SeqCst) - { - stall_mark = cur_stall_mark.load(Ordering::Relaxed); - async_std::task::yield_now().await; - } - if batch_id == batch.batch_id.load(Ordering::SeqCst) { - //this batch is still valid - SimpleBatcher::create_tx_task(batch).await; - } + 
self.executor.submit_task(async move { + while stall_mark != cur_stall_mark.load(Ordering::SeqCst) + && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE + && batch_id == batch.batch_id.load(Ordering::SeqCst) + { + stall_mark = cur_stall_mark.load(Ordering::Relaxed); + async_std::task::yield_now().await; + } + if batch_id == batch.batch_id.load(Ordering::SeqCst) { + //this batch is still valid + SimpleBatcher::create_tx_task(batch).await; + } + }); } else if size >= MAX_BATCH_SIZE { // println!("return size: {:?} {dst:?}",size); // println!( @@ -200,17 +205,19 @@ impl Batcher for SimpleBatcher { // "[{:?}] add_data_am_to_batch submit task", // std::thread::current().id() // ); - while stall_mark != cur_stall_mark.load(Ordering::SeqCst) - && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE - && batch_id == batch.batch_id.load(Ordering::SeqCst) - { - stall_mark = cur_stall_mark.load(Ordering::Relaxed); - async_std::task::yield_now().await; - } - if batch_id == batch.batch_id.load(Ordering::SeqCst) { - //this batch is still valid - SimpleBatcher::create_tx_task(batch).await; - } + self.executor.submit_task(async move { + while stall_mark != cur_stall_mark.load(Ordering::SeqCst) + && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE + && batch_id == batch.batch_id.load(Ordering::SeqCst) + { + stall_mark = cur_stall_mark.load(Ordering::Relaxed); + async_std::task::yield_now().await; + } + if batch_id == batch.batch_id.load(Ordering::SeqCst) { + //this batch is still valid + SimpleBatcher::create_tx_task(batch).await; + } + }); } else if size >= MAX_BATCH_SIZE { // println!("data size: {:?} {dst:?}",size); // println!( @@ -242,17 +249,19 @@ impl Batcher for SimpleBatcher { // "[{:?}] add_unit_am_to_batch submit task", // std::thread::current().id() // ); - while stall_mark != cur_stall_mark.load(Ordering::SeqCst) - && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE - && batch_id == batch.batch_id.load(Ordering::SeqCst) - { - stall_mark = cur_stall_mark.load(Ordering::Relaxed); - async_std::task::yield_now().await; - } - if batch_id == batch.batch_id.load(Ordering::SeqCst) { - //this batch is still valid - SimpleBatcher::create_tx_task(batch).await; - } + self.executor.submit_task(async move { + while stall_mark != cur_stall_mark.load(Ordering::SeqCst) + && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE + && batch_id == batch.batch_id.load(Ordering::SeqCst) + { + stall_mark = cur_stall_mark.load(Ordering::Relaxed); + async_std::task::yield_now().await; + } + if batch_id == batch.batch_id.load(Ordering::SeqCst) { + //this batch is still valid + SimpleBatcher::create_tx_task(batch).await; + } + }); } else if size >= MAX_BATCH_SIZE { // println!("unit size: {:?} ", size); // println!( @@ -270,18 +279,17 @@ impl Batcher for SimpleBatcher { ser_data: SerializedData, lamellae: Arc, ame: &RegisteredActiveMessages, - ) -> Vec { + ) { let data = ser_data.data_as_bytes(); let mut i = 0; // println!("executing batched msg {:?}", data.len()); - let mut return_ams = Vec::new(); while i < data.len() { let cmd: Cmd = crate::deserialize(&data[i..i + *CMD_LEN], false).unwrap(); i += *CMD_LEN; // let temp_i = i; // println!("cmd {:?}", cmd); match cmd { - Cmd::Am => return_ams.push(self.exec_am(&msg, data, &mut i, &lamellae, ame).await), + Cmd::Am => self.exec_am(&msg, data, &mut i, &lamellae, ame), Cmd::ReturnAm => { self.exec_return_am(&msg, data, &mut i, &lamellae, ame) .await @@ -293,13 +301,16 @@ impl Batcher for SimpleBatcher { } } } - return_ams } } impl SimpleBatcher { 
//#[tracing::instrument(skip_all)] - pub(crate) fn new(num_pes: usize, stall_mark: Arc) -> SimpleBatcher { + pub(crate) fn new( + num_pes: usize, + stall_mark: Arc, + executor: Arc, + ) -> SimpleBatcher { let mut batched_ams = Vec::new(); for pe in 0..num_pes { batched_ams.push(SimpleBatcherInner::new(Some(pe))); @@ -308,13 +319,13 @@ impl SimpleBatcher { SimpleBatcher { batched_ams: Arc::new(batched_ams), stall_mark: stall_mark, + executor: executor, } } //#[tracing::instrument(skip_all)] async fn create_tx_task(batch: SimpleBatcherInner) { // println!("[{:?}] create_tx_task", std::thread::current().id()); - async_std::task::yield_now().await; // force this to renter the task queue so other requests can hopefully come in before sending the batch let (buf, size) = batch.swap(); if size > 0 { @@ -509,14 +520,15 @@ impl SimpleBatcher { } // #[tracing::instrument(skip_all)] - async fn exec_am( + // async + fn exec_am( &self, msg: &Msg, data: &[u8], i: &mut usize, lamellae: &Arc, ame: &RegisteredActiveMessages, - ) -> Am { + ) { // println!("exec_am"); let am_header: AmHeader = crate::deserialize(&data[*i..*i + *AM_HEADER_LEN], false).unwrap(); @@ -540,25 +552,27 @@ impl SimpleBatcher { // "[{:?}] simple batcher exec_am submit task", // std::thread::current().id() // ); - let am = match am - .exec( - team.team.world_pe, - team.team.num_world_pes, - false, - world.clone(), - team.clone(), - ) - .await - { - LamellarReturn::Unit => Am::Unit(req_data), - LamellarReturn::RemoteData(data) => Am::Data(req_data, data), - LamellarReturn::RemoteAm(am) => Am::Return(req_data, am), - LamellarReturn::LocalData(_) | LamellarReturn::LocalAm(_) => { - panic!("Should not be returning local data or AM from remote am"); - } - }; - // ame.process_msg(am, 0, false).await; - am + let ame = ame.clone(); + self.executor.submit_task(async move { + let am = match am + .exec( + team.team.world_pe, + team.team.num_world_pes, + false, + world.clone(), + team.clone(), + ) + .await + { + LamellarReturn::Unit => Am::Unit(req_data), + LamellarReturn::RemoteData(data) => Am::Data(req_data, data), + LamellarReturn::RemoteAm(am) => Am::Return(req_data, am), + LamellarReturn::LocalData(_) | LamellarReturn::LocalAm(_) => { + panic!("Should not be returning local data or AM from remote am"); + } + }; + ame.process_msg(am, 0, false).await; + }); } // #[tracing::instrument(skip_all)] diff --git a/src/active_messaging/batching/team_am_batcher.rs b/src/active_messaging/batching/team_am_batcher.rs index 43af51ed..356ac6bb 100644 --- a/src/active_messaging/batching/team_am_batcher.rs +++ b/src/active_messaging/batching/team_am_batcher.rs @@ -174,6 +174,7 @@ impl TeamAmBatcherInner { pub(crate) struct TeamAmBatcher { batched_ams: Arc>, stall_mark: Arc, + executor: Arc, } #[async_trait] @@ -204,23 +205,25 @@ impl Batcher for TeamAmBatcher { // std::thread::current().id() // ); let cur_stall_mark = self.stall_mark.clone(); - while stall_mark != cur_stall_mark.load(Ordering::SeqCst) - && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE - && batch_id == batch.batch_id.load(Ordering::SeqCst) - { - stall_mark = cur_stall_mark.load(Ordering::Relaxed); - async_std::task::yield_now().await; - } - if batch_id == batch.batch_id.load(Ordering::SeqCst) { - //this batch is still valid - TeamAmBatcher::create_tx_task( - batch, - req_data.lamellae.clone(), - req_data.team.arch.clone(), - req_data.team.world_pe, - ) - .await; - } + self.executor.submit_task(async move { + while stall_mark != cur_stall_mark.load(Ordering::SeqCst) + && 
batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE + && batch_id == batch.batch_id.load(Ordering::SeqCst) + { + stall_mark = cur_stall_mark.load(Ordering::Relaxed); + async_std::task::yield_now().await; + } + if batch_id == batch.batch_id.load(Ordering::SeqCst) { + //this batch is still valid + TeamAmBatcher::create_tx_task( + batch, + req_data.lamellae.clone(), + req_data.team.arch.clone(), + req_data.team.world_pe, + ) + .await; + } + }); } else if size >= MAX_BATCH_SIZE { //batch is full, transfer now // println!( @@ -264,23 +267,25 @@ impl Batcher for TeamAmBatcher { // std::thread::current().id() // ); let cur_stall_mark = self.stall_mark.clone(); - while stall_mark != cur_stall_mark.load(Ordering::SeqCst) - && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE - && batch_id == batch.batch_id.load(Ordering::SeqCst) - { - stall_mark = cur_stall_mark.load(Ordering::Relaxed); - async_std::task::yield_now().await; - } - if batch_id == batch.batch_id.load(Ordering::SeqCst) { - //this batch is still valid - TeamAmBatcher::create_tx_task( - batch, - req_data.lamellae.clone(), - req_data.team.arch.clone(), - req_data.team.world_pe, - ) - .await; - } + self.executor.submit_task(async move { + while stall_mark != cur_stall_mark.load(Ordering::SeqCst) + && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE + && batch_id == batch.batch_id.load(Ordering::SeqCst) + { + stall_mark = cur_stall_mark.load(Ordering::Relaxed); + async_std::task::yield_now().await; + } + if batch_id == batch.batch_id.load(Ordering::SeqCst) { + //this batch is still valid + TeamAmBatcher::create_tx_task( + batch, + req_data.lamellae.clone(), + req_data.team.arch.clone(), + req_data.team.world_pe, + ) + .await; + } + }); } else if size >= MAX_BATCH_SIZE { //batch is full, transfer now // println!( @@ -331,23 +336,25 @@ impl Batcher for TeamAmBatcher { // std::thread::current().id() // ); let cur_stall_mark = self.stall_mark.clone(); - while stall_mark != cur_stall_mark.load(Ordering::SeqCst) - && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE - && batch_id == batch.batch_id.load(Ordering::SeqCst) - { - stall_mark = cur_stall_mark.load(Ordering::Relaxed); - async_std::task::yield_now().await; - } - if batch_id == batch.batch_id.load(Ordering::SeqCst) { - //this batch is still valid - TeamAmBatcher::create_tx_task( - batch, - req_data.lamellae.clone(), - req_data.team.arch.clone(), - req_data.team.world_pe, - ) - .await; - } + self.executor.submit_task(async move { + while stall_mark != cur_stall_mark.load(Ordering::SeqCst) + && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE + && batch_id == batch.batch_id.load(Ordering::SeqCst) + { + stall_mark = cur_stall_mark.load(Ordering::Relaxed); + async_std::task::yield_now().await; + } + if batch_id == batch.batch_id.load(Ordering::SeqCst) { + //this batch is still valid + TeamAmBatcher::create_tx_task( + batch, + req_data.lamellae.clone(), + req_data.team.arch.clone(), + req_data.team.world_pe, + ) + .await; + } + }); } else if size >= MAX_BATCH_SIZE { //batch is full, transfer now // println!("[{:?}] data size: {:?}", std::thread::current().id(), size); @@ -380,23 +387,25 @@ impl Batcher for TeamAmBatcher { // std::thread::current().id() // ); let cur_stall_mark = self.stall_mark.clone(); - while stall_mark != cur_stall_mark.load(Ordering::SeqCst) - && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE - && batch_id == batch.batch_id.load(Ordering::SeqCst) - { - stall_mark = cur_stall_mark.load(Ordering::Relaxed); - async_std::task::yield_now().await; - } - if batch_id == 
batch.batch_id.load(Ordering::SeqCst) { - //this batch is still valid - TeamAmBatcher::create_tx_task( - batch, - req_data.lamellae.clone(), - req_data.team.arch.clone(), - req_data.team.world_pe, - ) - .await; - } + self.executor.submit_task(async move { + while stall_mark != cur_stall_mark.load(Ordering::SeqCst) + && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE + && batch_id == batch.batch_id.load(Ordering::SeqCst) + { + stall_mark = cur_stall_mark.load(Ordering::Relaxed); + async_std::task::yield_now().await; + } + if batch_id == batch.batch_id.load(Ordering::SeqCst) { + //this batch is still valid + TeamAmBatcher::create_tx_task( + batch, + req_data.lamellae.clone(), + req_data.team.arch.clone(), + req_data.team.world_pe, + ) + .await; + } + }); } else if size >= MAX_BATCH_SIZE { //batch is full, transfer now // println!("[{:?}] unit size: {:?}", std::thread::current().id(), size); @@ -417,12 +426,11 @@ impl Batcher for TeamAmBatcher { ser_data: SerializedData, lamellae: Arc, ame: &RegisteredActiveMessages, - ) -> Vec { + ) { // println!("[{:?}] exec_batched_msg", std::thread::current().id()); let data = ser_data.data_as_bytes(); let mut i = 0; // println!("i: {:?} dl {:?} cl {:?}", i, data.len(), *CMD_LEN); - let mut return_ams = Vec::new(); while i < data.len() { // println!("\ti: {:?} dl {:?} cl {:?}", i, data.len(), *CMD_LEN); let batch: BatchHeader = @@ -437,21 +445,21 @@ impl Batcher for TeamAmBatcher { Cmd::Data => ame.exec_data_am(&msg, data, &mut i, &ser_data).await, Cmd::Unit => ame.exec_unit_am(&msg, data, &mut i).await, Cmd::BatchedMsg => { - return_ams.append( - &mut self - .exec_batched_am(&msg, batch.cnt, data, &mut i, &lamellae, &ame) - .await, - ); + self.exec_batched_am(&msg, batch.cnt, data, &mut i, &lamellae, &ame) + .await; } } } - return_ams } } impl TeamAmBatcher { //#[tracing::instrument(skip_all)] - pub(crate) fn new(num_pes: usize, stall_mark: Arc) -> TeamAmBatcher { + pub(crate) fn new( + num_pes: usize, + stall_mark: Arc, + executor: Arc, + ) -> TeamAmBatcher { let mut batched_ams = Vec::new(); for pe in 0..num_pes { batched_ams.push(TeamAmBatcherInner::new(Some(pe))); @@ -460,6 +468,7 @@ impl TeamAmBatcher { TeamAmBatcher { batched_ams: Arc::new(batched_ams), stall_mark: stall_mark, + executor: executor, } } //#[tracing::instrument(skip_all)] @@ -470,7 +479,6 @@ impl TeamAmBatcher { my_pe: usize, ) { // println!("[{:?}] create_tx_task", std::thread::current().id()); - async_std::task::yield_now().await; // force this to renter the task queue so other requests can hopefully come in before sending the batch let (am_batch, return_am_batch, non_am_batch, mut size) = batch.swap(); if size > 0 { if am_batch.len() > 0 { @@ -720,8 +728,7 @@ impl TeamAmBatcher { i: &mut usize, lamellae: &Arc, ame: &RegisteredActiveMessages, - ) -> Vec { - let mut return_ams = Vec::new(); + ) { // println!("exec_batched_am batch_cnt: {}", batch_cnt); for _team in 0..batch_cnt { let team_header: TeamHeader = @@ -743,18 +750,18 @@ impl TeamAmBatcher { // batched_am_header.cmd // ); match batched_am_header.cmd { - Cmd::Am => return_ams.push( + Cmd::Am => { self.exec_am( msg, data, i, lamellae, + ame, batched_am_header.am_id, world.clone(), team.clone(), - ) - .await, - ), + ); + } Cmd::ReturnAm => { self.exec_return_am( msg, @@ -766,7 +773,7 @@ impl TeamAmBatcher { world.clone(), team.clone(), ) - .await + .await; } _ => panic!("unhandled cmd"), } @@ -778,20 +785,20 @@ impl TeamAmBatcher { // std::thread::current().id(), // return_ams // ); - return_ams } // 
#[tracing::instrument(skip_all)] - async fn exec_am( + fn exec_am( &self, msg: &Msg, data: &[u8], i: &mut usize, lamellae: &Arc, + ame: &RegisteredActiveMessages, am_id: AmId, world: Arc, team: Arc, - ) -> Am { + ) { let req_id = crate::deserialize(&data[*i..*i + *REQ_ID_LEN], false).unwrap(); *i += *REQ_ID_LEN; let am = AMS_EXECS.get(&am_id).unwrap()(&data[*i..], team.team.team_pe); @@ -808,25 +815,27 @@ impl TeamAmBatcher { team_addr: team.team.remote_ptr_addr, }; - let am = match am - .exec( - team.team.world_pe, - team.team.num_world_pes, - false, - world.clone(), - team.clone(), - ) - .await - { - LamellarReturn::Unit => Am::Unit(req_data), - LamellarReturn::RemoteData(data) => Am::Data(req_data, data), - LamellarReturn::RemoteAm(am) => Am::Return(req_data, am), - LamellarReturn::LocalData(_) | LamellarReturn::LocalAm(_) => { - panic!("Should not be returning local data or AM from remote am"); - } - }; - am - // ame.process_msg(am, 0, false).await; + let ame = ame.clone(); + self.executor.submit_task(async move { + let am = match am + .exec( + team.team.world_pe, + team.team.num_world_pes, + false, + world.clone(), + team.clone(), + ) + .await + { + LamellarReturn::Unit => Am::Unit(req_data), + LamellarReturn::RemoteData(data) => Am::Data(req_data, data), + LamellarReturn::RemoteAm(am) => Am::Return(req_data, am), + LamellarReturn::LocalData(_) | LamellarReturn::LocalAm(_) => { + panic!("Should not be returning local data or AM from remote am"); + } + }; + ame.process_msg(am, 0, false).await; + }); } // #[tracing::instrument(skip_all)] diff --git a/src/active_messaging/registered_active_message.rs b/src/active_messaging/registered_active_message.rs index 994969f6..e16755d7 100644 --- a/src/active_messaging/registered_active_message.rs +++ b/src/active_messaging/registered_active_message.rs @@ -64,6 +64,7 @@ crate::inventory::collect!(RegisteredAm); #[derive(Debug, Clone)] pub(crate) struct RegisteredActiveMessages { batcher: BatcherType, + executor: Arc, } lazy_static! 
{ @@ -100,6 +101,7 @@ impl ActiveMessageEngine for RegisteredActiveMessages { // #[tracing::instrument(skip_all)] async fn process_msg(self, am: Am, stall_mark: usize, immediate: bool) { // println!("[{:?}] process_msg {am:?}", std::thread::current().id()); + match am { Am::All(req_data, am) => { // println!("{:?}",am.get_id()); @@ -109,27 +111,33 @@ impl ActiveMessageEngine for RegisteredActiveMessages { if req_data.team.lamellae.backend() != Backend::Local && (req_data.team.num_pes() > 1 || req_data.team.team_pe_id().is_err()) { - // println!(" {} {} {}, {}, {}",req_data.team.lamellae.backend() != Backend::Local,req_data.team.num_pes() > 1, req_data.team.team_pe_id().is_err(),(req_data.team.num_pes() > 1 || req_data.team.team_pe_id().is_err()),req_data.team.lamellae.backend() != Backend::Local && (req_data.team.num_pes() > 1 || req_data.team.team_pe_id().is_err()) ); - if am_size < crate::active_messaging::BATCH_AM_SIZE && !immediate { - self.batcher - .add_remote_am_to_batch( - req_data.clone(), - am.clone(), - am_id, - am_size, - stall_mark, - ) - .await; - } else { - // println!( - // "[{:?}] {:?} all {:?}", - // std::thread::current().id(), - // am_id, - // am_size - // ); - self.send_am(req_data.clone(), am.clone(), am_id, am_size, Cmd::Am) - .await; - } + let ame = self.clone(); + let req_data_clone = req_data.clone(); + let am_clone = am.clone(); + self.executor.submit_task(async move { + //spawn a task so that we can the execute the local am immediately + // println!(" {} {} {}, {}, {}",req_data.team.lamellae.backend() != Backend::Local,req_data.team.num_pes() > 1, req_data.team.team_pe_id().is_err(),(req_data.team.num_pes() > 1 || req_data.team.team_pe_id().is_err()),req_data.team.lamellae.backend() != Backend::Local && (req_data.team.num_pes() > 1 || req_data.team.team_pe_id().is_err()) ); + if am_size < crate::active_messaging::BATCH_AM_SIZE && !immediate { + ame.batcher + .add_remote_am_to_batch( + req_data_clone.clone(), + am_clone.clone(), + am_id, + am_size, + stall_mark, + ) + .await; + } else { + // println!( + // "[{:?}] {:?} all {:?}", + // std::thread::current().id(), + // am_id, + // am_size + // ); + ame.send_am(req_data_clone, am_clone, am_id, am_size, Cmd::Am) + .await; + } + }); } let world = LamellarTeam::new(None, req_data.world.clone(), true); let team = LamellarTeam::new(Some(world.clone()), req_data.team.clone(), true); @@ -216,21 +224,13 @@ impl ActiveMessageEngine for RegisteredActiveMessages { } //#[tracing::instrument(skip_all)] - async fn exec_msg( - self, - msg: Msg, - ser_data: SerializedData, - lamellae: Arc, - executor: Arc, - ) { + async fn exec_msg(self, msg: Msg, ser_data: SerializedData, lamellae: Arc) { // println!("[{:?}] exec_msg {:?}", std::thread::current().id(), msg.cmd); let data = ser_data.data_as_bytes(); let mut i = 0; match msg.cmd { Cmd::Am => { - let return_am = self.exec_am(&msg, data, &mut i, &lamellae).await; - let process_task = self.process_msg(return_am, 0, false); - executor.submit_task(process_task); + self.exec_am(&msg, data, &mut i, &lamellae).await; } Cmd::ReturnAm => { self.exec_return_am(&msg, data, &mut i, &lamellae).await; @@ -242,15 +242,9 @@ impl ActiveMessageEngine for RegisteredActiveMessages { self.exec_unit_am(&msg, data, &mut i).await; } Cmd::BatchedMsg => { - let ams = self - .batcher + self.batcher .exec_batched_msg(msg, ser_data, lamellae, &self) .await; - let am_tasks = futures::stream::FuturesUnordered::new(); - for am in ams.into_iter() { - am_tasks.push(self.clone().process_msg(am, 0, false)); - } - 
executor.submit_task(futures::future::join_all(am_tasks)); } } } @@ -258,8 +252,8 @@ impl ActiveMessageEngine for RegisteredActiveMessages { impl RegisteredActiveMessages { //#[tracing::instrument(skip_all)] - pub(crate) fn new(batcher: BatcherType) -> RegisteredActiveMessages { - RegisteredActiveMessages { batcher: batcher } + pub(crate) fn new(batcher: BatcherType, executor: Arc) -> RegisteredActiveMessages { + RegisteredActiveMessages { batcher, executor } } //#[tracing::instrument(skip_all)] @@ -397,6 +391,7 @@ impl RegisteredActiveMessages { data.unwrap() } + //we can remove this by cloning self and submitting to the executor #[async_recursion] //#[tracing::instrument(skip_all)] pub(crate) async fn exec_local_am( @@ -447,7 +442,7 @@ impl RegisteredActiveMessages { data: &[u8], i: &mut usize, lamellae: &Arc, - ) -> Am { + ) { // println!("exec_am"); let am_header: AmHeader = crate::deserialize(&data[*i..*i + *AM_HEADER_LEN], false).unwrap(); @@ -485,8 +480,12 @@ impl RegisteredActiveMessages { panic!("Should not be returning local data or AM from remote am"); } }; - am - // self.process_msg(am, 0, false).await; //0 just means we will force a stall_count loop + let ame = self.clone(); + self.executor.submit_task(async move { + ame.process_msg(am, 0, false).await; + }); + //compare against: + // ame.process_msg(am, 0, true).await; } //#[tracing::instrument(skip_all)] diff --git a/src/array.rs b/src/array.rs index 66e4f7db..ab1c23a0 100644 --- a/src/array.rs +++ b/src/array.rs @@ -120,9 +120,9 @@ pub use r#unsafe::{ }; pub(crate) mod read_only; pub use read_only::{ - ReadOnlyArray, ReadOnlyArrayOpBuf, - /*ReadOnlyArrayMultiMultiOps, ReadOnlyArrayMultiSingleOps,*/ ReadOnlyByteArray, - ReadOnlyByteArrayWeak, + ReadOnlyArray, + /*ReadOnlyArrayOpBuf, ReadOnlyArrayMultiMultiOps, ReadOnlyArrayMultiSingleOps,*/ + ReadOnlyByteArray, ReadOnlyByteArrayWeak, }; // pub(crate) mod local_only; @@ -194,24 +194,26 @@ pub struct ReduceKey { } crate::inventory::collect!(ReduceKey); -// lamellar_impl::generate_reductions_for_type_rt!(true, u8, usize); -// lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, usize); -// impl Dist for bool {} +lamellar_impl::generate_reductions_for_type_rt!(true, u8, usize); +lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, usize); +lamellar_impl::generate_reductions_for_type_rt!(false, f32); +lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32); +impl Dist for bool {} -lamellar_impl::generate_reductions_for_type_rt!(true, u8, u16, u32, u64, usize); -lamellar_impl::generate_reductions_for_type_rt!(false, u128); -lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, u16, u32, u64, usize); -lamellar_impl::generate_ops_for_type_rt!(true, false, true, u128); +// lamellar_impl::generate_reductions_for_type_rt!(true, u8, u16, u32, u64, usize); +// lamellar_impl::generate_reductions_for_type_rt!(false, u128); +// lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, u16, u32, u64, usize); +// lamellar_impl::generate_ops_for_type_rt!(true, false, true, u128); -lamellar_impl::generate_reductions_for_type_rt!(true, i8, i16, i32, i64, isize); -lamellar_impl::generate_reductions_for_type_rt!(false, i128); -lamellar_impl::generate_ops_for_type_rt!(true, true, true, i8, i16, i32, i64, isize); -lamellar_impl::generate_ops_for_type_rt!(true, false, true, i128); +// lamellar_impl::generate_reductions_for_type_rt!(true, i8, i16, i32, i64, isize); +// lamellar_impl::generate_reductions_for_type_rt!(false, i128); +// 
lamellar_impl::generate_ops_for_type_rt!(true, true, true, i8, i16, i32, i64, isize); +// lamellar_impl::generate_ops_for_type_rt!(true, false, true, i128); -lamellar_impl::generate_reductions_for_type_rt!(false, f32, f64); -lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32, f64); +// lamellar_impl::generate_reductions_for_type_rt!(false, f32, f64); +// lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32, f64); -lamellar_impl::generate_ops_for_bool_rt!(); +// lamellar_impl::generate_ops_for_bool_rt!(); impl Dist for Option {} impl ArrayOps for Option {} @@ -260,6 +262,8 @@ pub trait LamellarArrayRequest: Sync + Send { type Output; async fn into_future(mut self: Box) -> Self::Output; fn wait(self: Box) -> Self::Output; + fn ready(&self) -> bool; + fn set_waker(&mut self, waker: futures::task::Waker); } struct ArrayRdmaHandle { @@ -280,6 +284,17 @@ impl LamellarArrayRequest for ArrayRdmaHandle { } () } + fn ready(&self) -> bool { + self.reqs.iter().all(|req| { + // println!("req: {:?}", req.ready()); + req.ready() + }) + } + fn set_waker(&mut self, waker: futures::task::Waker) { + for req in self.reqs.iter_mut() { + req.set_waker(waker.clone()); + } + } } struct ArrayRdmaAtHandle { @@ -301,6 +316,14 @@ impl LamellarArrayRequest for ArrayRdmaAtHandle { } unsafe { self.buf.as_slice().expect("Data should exist on PE")[0] } } + fn ready(&self) -> bool { + self.reqs.iter().all(|req| req.ready()) + } + fn set_waker(&mut self, waker: futures::task::Waker) { + for req in self.reqs.iter_mut() { + req.set_waker(waker.clone()); + } + } } /// Registered memory regions that can be used as input to various LamellarArray RDMA operations. diff --git a/src/array/generic_atomic.rs b/src/array/generic_atomic.rs index e051719b..a5a7c099 100644 --- a/src/array/generic_atomic.rs +++ b/src/array/generic_atomic.rs @@ -566,6 +566,10 @@ impl GenericAtomicArray { .expect("invalid local index"); self.locks[index].lock() } + + pub fn async_barrier(&self) -> impl std::future::Future + Send + '_ { + self.array.async_barrier() + } } impl GenericAtomicArray { @@ -736,6 +740,7 @@ impl LamellarArray for GenericAtomicArray { fn barrier(&self) { self.array.barrier(); } + fn wait_all(&self) { self.array.wait_all() // println!("done in wait all {:?}",std::time::SystemTime::now()); diff --git a/src/array/global_lock_atomic.rs b/src/array/global_lock_atomic.rs index 48bf357b..e9cc9662 100644 --- a/src/array/global_lock_atomic.rs +++ b/src/array/global_lock_atomic.rs @@ -612,6 +612,10 @@ impl GlobalLockArray { // println!("GlobalLock into_read_only"); self.array.into() } + + pub fn async_barrier(&self) -> impl std::future::Future + Send + '_ { + self.array.async_barrier() + } } impl GlobalLockArray { @@ -922,6 +926,12 @@ impl LamellarRequest for GlobalLockArrayReduceHandle { fn get(&self) -> Self::Output { self.req.get() } + fn ready(&self) -> bool { + self.req.ready() + } + fn set_waker(&mut self, waker: futures::task::Waker) { + self.req.set_waker(waker) + } } impl LamellarArrayReduce for GlobalLockArray { diff --git a/src/array/iterator/distributed_iterator/consumer/count.rs b/src/array/iterator/distributed_iterator/consumer/count.rs index 66c4434d..53d40f27 100644 --- a/src/array/iterator/distributed_iterator/consumer/count.rs +++ b/src/array/iterator/distributed_iterator/consumer/count.rs @@ -79,7 +79,7 @@ impl LamellarAm for UpdateCntAm { } impl RemoteIterCountHandle { - async fn reduce_remote_counts(self, local_cnt: usize, cnt: Darc) -> usize { + async fn async_reduce_remote_counts(self, 
local_cnt: usize, cnt: Darc) -> usize { self.team .exec_am_all(UpdateCntAm { remote_cnt: local_cnt, @@ -90,6 +90,16 @@ impl RemoteIterCountHandle { self.team.async_barrier().await; cnt.load(Ordering::SeqCst) } + + fn reduce_remote_counts(self, local_cnt: usize, cnt: Darc) -> usize { + self.team.exec_am_all(UpdateCntAm { + remote_cnt: local_cnt, + cnt: cnt.clone(), + }); + self.team.wait_all(); + self.team.tasking_barrier(); + cnt.load(Ordering::SeqCst) + } } #[doc(hidden)] @@ -105,7 +115,7 @@ impl IterRequest for RemoteIterCountHandle { .into_iter() .sum::(); // println!("count: {} {:?}", count, std::thread::current().id()); - self.reduce_remote_counts(count, cnt).await + self.async_reduce_remote_counts(count, cnt).await } fn wait(mut self: Box) -> Self::Output { self.team.tasking_barrier(); @@ -116,10 +126,7 @@ impl IterRequest for RemoteIterCountHandle { .map(|req| req.get()) .into_iter() .sum::(); - self.team - .scheduler - .clone() - .block_on(self.reduce_remote_counts(count, cnt)) + self.reduce_remote_counts(count, cnt) } } diff --git a/src/array/iterator/distributed_iterator/consumer/reduce.rs b/src/array/iterator/distributed_iterator/consumer/reduce.rs index c74a5ed5..94fd66f8 100644 --- a/src/array/iterator/distributed_iterator/consumer/reduce.rs +++ b/src/array/iterator/distributed_iterator/consumer/reduce.rs @@ -3,7 +3,7 @@ use crate::array::iterator::consumer::*; use crate::array::iterator::distributed_iterator::DistributedIterator; use crate::array::iterator::one_sided_iterator::OneSidedIterator; use crate::array::iterator::{private::*, IterRequest}; -use crate::array::{ArrayOps, Distribution, LamellarArray, UnsafeArray}; +use crate::array::{ArrayOps, Distribution, UnsafeArray}; use crate::lamellar_request::LamellarRequest; use crate::lamellar_team::LamellarTeamRT; use crate::Dist; @@ -80,6 +80,21 @@ where T: Dist + ArrayOps, F: Fn(T, T) -> T + SyncSend + Clone + 'static, { + async fn async_reduce_remote_vals(&self, local_val: Option) -> Option { + self.team.async_barrier().await; + let local_vals = + UnsafeArray::>::new(&self.team, self.team.num_pes, Distribution::Block); + unsafe { + local_vals.local_as_mut_slice()[0] = local_val; + }; + local_vals.async_barrier().await; + let buffered_iter = unsafe { local_vals.buffered_onesided_iter(self.team.num_pes) }; + buffered_iter + .into_iter() + .filter_map(|&res| res) + .reduce(self.op.clone()) + } + fn reduce_remote_vals(&self, local_val: Option) -> Option { self.team.tasking_barrier(); let local_vals = @@ -87,7 +102,7 @@ where unsafe { local_vals.local_as_mut_slice()[0] = local_val; }; - local_vals.barrier(); + local_vals.tasking_barrier(); let buffered_iter = unsafe { local_vals.buffered_onesided_iter(self.team.num_pes) }; buffered_iter .into_iter() @@ -110,7 +125,7 @@ where .into_iter() .filter_map(|res| res) .reduce(self.op.clone()); - self.reduce_remote_vals(local_val) + self.async_reduce_remote_vals(local_val).await } fn wait(mut self: Box) -> Self::Output { let local_val = self diff --git a/src/array/iterator/distributed_iterator/consumer/sum.rs b/src/array/iterator/distributed_iterator/consumer/sum.rs index 14c17113..0d260a54 100644 --- a/src/array/iterator/distributed_iterator/consumer/sum.rs +++ b/src/array/iterator/distributed_iterator/consumer/sum.rs @@ -3,7 +3,7 @@ use crate::array::iterator::consumer::*; use crate::array::iterator::distributed_iterator::DistributedIterator; use crate::array::iterator::one_sided_iterator::OneSidedIterator; use crate::array::iterator::{private::*, IterRequest}; -use 
crate::array::{ArrayOps, Distribution, LamellarArray, UnsafeArray}; +use crate::array::{ArrayOps, Distribution, UnsafeArray}; use crate::lamellar_request::LamellarRequest; use crate::lamellar_team::LamellarTeamRT; use crate::Dist; @@ -69,11 +69,20 @@ impl RemoteIterSumHandle where T: Dist + ArrayOps + std::iter::Sum, { + async fn async_reduce_remote_vals(&self, local_sum: T, local_sums: UnsafeArray) -> T { + unsafe { + local_sums.local_as_mut_slice()[0] = local_sum; + }; + local_sums.async_barrier().await; + let buffered_iter = unsafe { local_sums.buffered_onesided_iter(self.team.num_pes) }; + buffered_iter.into_iter().map(|&e| e).sum() + } + fn reduce_remote_vals(&self, local_sum: T, local_sums: UnsafeArray) -> T { unsafe { local_sums.local_as_mut_slice()[0] = local_sum; }; - local_sums.barrier(); + local_sums.tasking_barrier(); let buffered_iter = unsafe { local_sums.buffered_onesided_iter(self.team.num_pes) }; buffered_iter.into_iter().map(|&e| e).sum() } @@ -93,7 +102,7 @@ where .await .into_iter() .sum(); - self.reduce_remote_vals(local_sum, local_sums) + self.async_reduce_remote_vals(local_sum, local_sums).await } fn wait(mut self: Box) -> Self::Output { let local_sums = UnsafeArray::::new(&self.team, self.team.num_pes, Distribution::Block); diff --git a/src/array/iterator/one_sided_iterator.rs b/src/array/iterator/one_sided_iterator.rs index 5d02d3a3..7265bcd7 100644 --- a/src/array/iterator/one_sided_iterator.rs +++ b/src/array/iterator/one_sided_iterator.rs @@ -25,56 +25,68 @@ use zip::*; // mod buffered; // use buffered::*; -use crate::array::{LamellarArray, LamellarArrayInternalGet}; +use crate::array::{LamellarArray, LamellarArrayInternalGet, LamellarArrayRequest}; use crate::memregion::{Dist, OneSidedMemoryRegion, RegisteredMemoryRegion, SubRegion}; use crate::LamellarTeamRT; // use async_trait::async_trait; // use futures::{ready, Stream}; +use futures::Stream; use pin_project::pin_project; use std::marker::PhantomData; use std::pin::Pin; use std::ptr::NonNull; use std::sync::Arc; -// use std::task::{Context, Poll}; +use std::task::{Context, Poll}; //TODO: Think about an active message based method for transfering data that performs data reducing iterators before sending // i.e. for something like step_by(N) we know that only every N elements actually needs to get sent... 
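For context on the split introduced in the reduce and sum handles a few hunks above: the async completion path now awaits the new async_barrier(), while the blocking wait() path falls back to tasking_barrier(). The same split is exposed to user code through the pub fn async_barrier() methods added to the array types elsewhere in this patch. A minimal usage sketch of that entry point (the world/array setup and the imports below are illustrative assumptions, not taken from this patch):

use lamellar::array::prelude::*;
use lamellar::ActiveMessaging;

fn main() {
    let world = lamellar::LamellarWorldBuilder::new().build();
    let array = LocalLockArray::<usize>::new(&world, 100, Distribution::Block);
    let array_c = array.clone();
    world.block_on(async move {
        // inside an async task, await the barrier rather than blocking a
        // worker thread while the other PEs catch up
        array_c.async_barrier().await;
        // ... operate on the array ...
    });
    // on the main thread the blocking barrier is still appropriate
    world.barrier();
}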
+pub(crate) mod private {
+    use crate::array::LamellarArrayInternalGet;
+    use crate::memregion::Dist;
+    use std::pin::Pin;
+    use std::task::{Context, Poll};
+    pub trait OneSidedIteratorInner {
+        /// The type of item this distributed iterator produces
+        type Item: Send;
+        /// The underlying element type of the Array this iterator belongs to
+        type ElemType: Dist + 'static;
+
+        /// The original array that created this iterator
+        type Array: LamellarArrayInternalGet<Self::ElemType> + Send;
+
+        fn init(&mut self);
+        /// Return the next element in the iterator, otherwise return None
+        fn next(&mut self) -> Option<Self::Item>;
+
+        fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>>;
+
+        /// advance the internal iterator location by count elements
+        fn advance_index(&mut self, count: usize);
+
+        fn advance_index_pin(self: Pin<&mut Self>, count: usize);
+
+        /// Return the original array this distributed iterator belongs to
+        fn array(&self) -> Self::Array;
+
+        /// The size of the returned Item
+        fn item_size(&self) -> usize {
+            std::mem::size_of::<Self::Item>()
+        }
+    }
+}
 /// An interface for dealing with one sided iterators of LamellarArrays
 ///
-/// The functions in this trait are available on all [one-sided iterators](crate::array::iterator::one_sided_iterator)
+/// The functions in this trait are available on all [one-sided iterators](crate::array::iterator::one_sided_iterator)
 /// (which run over the data of a distributed array on a single PE). Typically
 /// the provided iterator functions are optimized versions of the standard Iterator equivalents to reduce data movement assoicated with handling distributed arrays
 ///
 /// Additonaly functionality can be found by converting these iterators into Standard Iterators (with potential loss in data movement optimizations)
 ///
 /// Note that currently One Sided Iterators will iterate over the distributed array serially, we are planning a parallel version in a future release.
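The new trait is split in two on purpose: the crate-private OneSidedIteratorInner carries the poll-based plumbing (init, poll_next, advance_index_pin), while the public OneSidedIterator below keeps only the user-facing combinators and is added to each adapter with an empty impl. A stand-alone sketch of that sealed-supertrait pattern (toy names, not lamellar types):

mod private {
    // the "inner" trait lives in a private module, so only this crate can
    // implement it and its methods stay off the public API surface
    pub trait Inner {
        type Item;
        fn init(&mut self);
        fn next(&mut self) -> Option<Self::Item>;
    }
}

// the public trait only has provided methods built on the inner contract
pub trait PublicIter: private::Inner {
    fn take_two(mut self) -> Vec<Self::Item>
    where
        Self: Sized,
    {
        self.init();
        (0..2).filter_map(|_| self.next()).collect()
    }
}

struct Counter(u32);

impl private::Inner for Counter {
    type Item = u32;
    fn init(&mut self) {}
    fn next(&mut self) -> Option<u32> {
        self.0 += 1;
        Some(self.0)
    }
}

// an empty impl is all an adapter needs, mirroring the style used in this patch
impl PublicIter for Counter {}

fn main() {
    assert_eq!(Counter(0).take_two(), vec![1, 2]);
}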
-pub trait OneSidedIterator { - /// The type of item this distributed iterator produces - type Item: Send; - - /// The underlying element type of the Array this iterator belongs to - type ElemType: Dist + 'static; - - /// The orgininal array that created this iterator - type Array: LamellarArrayInternalGet + Send; - - /// Return the next element in the iterator, otherwise return None - fn next(&mut self) -> Option; - - /// advance the internal iterator localtion by count elements - fn advance_index(&mut self, count: usize); - - /// Return the original array this distributed iterator belongs too - fn array(&self) -> Self::Array; - - /// The size of the returned Item - fn item_size(&self) -> usize { - std::mem::size_of::() - } - +pub trait OneSidedIterator: private::OneSidedIteratorInner { // /// Buffer (fetch/get) the next element in the array into the provided memory region (transferring data from a remote PE if necessary) // fn buffered_next( // &mut self, @@ -99,7 +111,7 @@ pub trait OneSidedIterator { /// array.wait_all(); /// if my_pe == 0 { /// for chunk in array.onesided_iter().chunks(5).into_iter() { //convert into a standard Iterator - /// // SAFETY: chunk is safe in this instance because this will be the only handle to the memory region, + /// // SAFETY: chunk is safe in self instance because self will be the only handle to the memory region, /// // and the runtime has verified that data is already placed in it /// println!("PE: {my_pe} chunk: {:?}",unsafe {chunk.as_slice()}); /// } @@ -182,7 +194,7 @@ pub trait OneSidedIterator { StepBy::new(self, step_size) } - /// Iterates over tuples `(A,B)` where the `A` items are from this iterator and the `B` items are from the iter in the argument. + /// Iterates over tuples `(A,B)` where the `A` items are from self iterator and the `B` items are from the iter in the argument. /// If the two iterators or of unequal length, the returned iterator will be equal in length to the shorter of the two. /// /// # Examples @@ -230,7 +242,7 @@ pub trait OneSidedIterator { // Buffered::new(self, buf_size) // } - /// Convert this one-sided iterator into a standard Rust Iterator, enabling one to use any of the functions available on `Iterator`s + /// Convert self one-sided iterator into a standard Rust Iterator, enabling one to use any of the functions available on `Iterator`s /// /// # Examples ///``` @@ -250,12 +262,28 @@ pub trait OneSidedIterator { ///```text /// Sum: 2.0 ///``` - fn into_iter(self) -> OneSidedIteratorIter + fn into_iter(mut self) -> OneSidedIteratorIter where Self: Sized + Send, { + if std::thread::current().id() != *crate::MAIN_THREAD { + println!( + "[LAMELLAR WARNING] Trying to convert a lamellar one sided iterator into a standard iterator within a worker thread {:?} self may result in deadlock. + Please use into_stream() instead", + std::backtrace::Backtrace::capture() + ) + } + self.init(); OneSidedIteratorIter { iter: self } } + + fn into_stream(mut self) -> OneSidedStream + where + Self: Sized + Send, + { + self.init(); + OneSidedStream { iter: self } + } } /// An immutable standard Rust Iterator backed by a [OneSidedIterator](crate::array::iterator::one_sided_iterator). 
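Two consumption paths now exist: into_iter() still yields a blocking standard Iterator (and now warns when created off the main thread, since each next() may wait on a network request), while into_stream() exposes the same data as a futures Stream that yields to the executor instead of blocking. A hedged usage sketch of the stream path (the array setup and the futures dependency are assumptions, not part of this patch):

use futures::StreamExt; // for .next().await on the stream
use lamellar::array::prelude::*;
use lamellar::ActiveMessaging;

fn main() {
    let world = lamellar::LamellarWorldBuilder::new().build();
    let my_pe = world.my_pe();
    let array = AtomicArray::<usize>::new(&world, 16, Distribution::Block);
    array.barrier();
    let array_c = array.clone();
    world.block_on(async move {
        if my_pe == 0 {
            // the stream registers a waker while a buffer fetch is in flight,
            // so the worker thread can run other tasks instead of spinning
            let mut stream = array_c.onesided_iter().into_stream();
            while let Some(elem) = stream.next().await {
                println!("PE {my_pe} saw {elem}");
            }
        }
    });
}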
@@ -284,12 +312,45 @@ impl Iterator for OneSidedIteratorIter where I: OneSidedIterator, { - type Item = ::Item; + type Item = ::Item; fn next(&mut self) -> Option { self.iter.next() } } +#[pin_project] +pub struct OneSidedStream { + #[pin] + pub(crate) iter: I, +} + +impl Stream for OneSidedStream +where + I: OneSidedIterator, +{ + type Item = ::Item; + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + // let me = self.get_mut(); + // println!("OneSidedStream polling"); + let this = self.project(); + let res = this.iter.poll_next(cx); + match res { + Poll::Ready(Some(res)) => { + // println!("OneSidedStream ready"); + Poll::Ready(Some(res)) + } + Poll::Ready(None) => { + // println!("OneSidedStream finished"); + Poll::Ready(None) + } + Poll::Pending => { + // println!("OneSidedStream pending"); + Poll::Pending + } + } + } +} + struct SendNonNull(NonNull); // This is safe because Lamellar Arrays are allocated from Rofi, and thus cannot be moved @@ -316,9 +377,17 @@ pub struct OneSidedIter<'a, T: Dist + 'static, A: LamellarArrayInternalGet> { index: usize, buf_index: usize, ptr: SendNonNull, + state: State, _marker: PhantomData<&'a T>, } +pub(crate) enum State { + // Ready, + Pending(Box>), + Buffered, + Finished, +} + impl<'a, T: Dist + 'static, A: LamellarArrayInternalGet> OneSidedIter<'a, T, A> { pub(crate) fn new( array: A, @@ -327,8 +396,9 @@ impl<'a, T: Dist + 'static, A: LamellarArrayInternalGet> OneSidedIter<'a, T, ) -> OneSidedIter<'a, T, A> { let buf_0 = team.alloc_one_sided_mem_region(buf_size); // potentially unsafe depending on the array type (i.e. UnsafeArray - which requries unsafe to construct an iterator), - // but safe with respect to the buf_0 as this is the only reference - unsafe { array.internal_get(0, &buf_0).wait() }; + // but safe with respect to the buf_0 as self is the only reference + + // let req = unsafe { array.internal_get(0, &buf_0) }; let ptr = unsafe { SendNonNull( NonNull::new(buf_0.as_mut_ptr().expect("data should be local")) @@ -341,6 +411,7 @@ impl<'a, T: Dist + 'static, A: LamellarArrayInternalGet> OneSidedIter<'a, T, index: 0, buf_index: 0, ptr: ptr, + state: State::Finished, _marker: PhantomData, }; @@ -350,51 +421,264 @@ impl<'a, T: Dist + 'static, A: LamellarArrayInternalGet> OneSidedIter<'a, T, impl<'a, T: Dist + 'static, A: LamellarArrayInternalGet + Clone + Send> OneSidedIterator for OneSidedIter<'a, T, A> +{ +} + +impl<'a, T: Dist + 'static, A: LamellarArrayInternalGet + Clone + Send> + private::OneSidedIteratorInner for OneSidedIter<'a, T, A> { type ElemType = T; type Item = &'a T; type Array = A; + + fn init(&mut self) { + let req = unsafe { self.array.internal_get(self.index, &self.buf_0) }; + self.state = State::Pending(req); + } + + // fn next(&mut self) -> Option { + // let mut cur_state = State::Finished; + // std::mem::swap(&mut self.state, &mut cur_state); + // match cur_state { + // State::Pending(req) => { + // req.wait(); //need to wait here because we use the same underlying buffer + // if self.index + 1 < self.array.len() { + // // still have remaining elements + // self.index += 1; + // let buf_index = self.buf_index as isize; + // self.buf_index += 1; + // if self.buf_index == self.buf_0.len() { + // //prefetch the next data + // self.buf_index = 0; + // if self.index + self.buf_0.len() < self.array.len() { + // // potentially unsafe depending on the array type (i.e. 
UnsafeArray - which requries unsafe to construct an iterator), + // // but safe with respect to the buf_0 as we have consumed all its content and self is the only reference + // let req = unsafe { self.array.internal_get(self.index, &self.buf_0) }; + // self.state = State::Pending(req); + // } else if self.index < self.array.len() { + // let sub_region = + // self.buf_0.sub_region(0..(self.array.len() - self.index)); + // // potentially unsafe depending on the array type (i.e. UnsafeArray - which requries unsafe to construct an iterator), + // // but safe with respect to the buf_0 as we have consumed all its content and self is the only reference + // // sub_region is set to the remaining size of the array so we will not have an out of bounds issue + // let req = unsafe { self.array.internal_get(self.index, sub_region) }; + // self.state = State::Pending(req); + // } else { + // self.state = State::Finished; + // } + // } + // } else { + // self.state = State::Finished; + // }; + // unsafe { self.ptr.0.as_ptr().offset(buf_index).as_ref() } //this is an option + // } + // State::Buffered => { + // self.state = State::Finished; + // unsafe { self.ptr.0.as_ptr().offset(self.buf_index as isize).as_ref() } + // } + // State::Finished => None, + // } + // } + fn next(&mut self) -> Option { - // println!("next {:?} {:?} {:?} {:?}",self.index,self.array.len(),self.buf_index,self.buf_0.len()); - let res = if self.index < self.array.len() { - if self.buf_index == self.buf_0.len() { - // println!("need to get new data"); - //need to get new data - self.buf_index = 0; - // self.fill_buffer(self.index); - if self.index + self.buf_0.len() < self.array.len() { - // potentially unsafe depending on the array type (i.e. UnsafeArray - which requries unsafe to construct an iterator), - // but safe with respect to the buf_0 as we have consumed all its content and this is the only reference + let mut cur_state = State::Finished; + std::mem::swap(&mut self.state, &mut cur_state); + match cur_state { + State::Pending(req) => { + req.wait(); + self.state = State::Buffered; + self.index += 1; + self.buf_index += 1; + unsafe { + self.ptr + .0 + .as_ptr() + .offset(self.buf_index as isize - 1) + .as_ref() + } + } + State::Buffered => { + //once here the we never go back to pending + if self.index < self.array.len() { + if self.buf_index == self.buf_0.len() { + //need to get new data + self.buf_index = 0; + if self.index + self.buf_0.len() < self.array.len() { + // potentially unsafe depending on the array type (i.e. UnsafeArray - which requries unsafe to construct an iterator), + // but safe with respect to the buf_0 as we have consumed all its content and this is the only reference + unsafe { + self.array.internal_get(self.index, &self.buf_0).wait(); + } + } else { + let sub_region = + self.buf_0.sub_region(0..(self.array.len() - self.index)); + // potentially unsafe depending on the array type (i.e. 
UnsafeArray - which requries unsafe to construct an iterator), + // but safe with respect to the buf_0 as we have consumed all its content and this is the only reference + // sub_region is set to the remaining size of the array so we will not have an out of bounds issue + unsafe { + self.array.internal_get(self.index, sub_region).wait(); + } + } + } + self.index += 1; + self.buf_index += 1; unsafe { - self.array.internal_get(self.index, &self.buf_0).wait(); + self.ptr + .0 + .as_ptr() + .offset(self.buf_index as isize - 1) + .as_ref() } } else { - let sub_region = self.buf_0.sub_region(0..(self.array.len() - self.index)); - // potentially unsafe depending on the array type (i.e. UnsafeArray - which requries unsafe to construct an iterator), - // but safe with respect to the buf_0 as we have consumed all its content and this is the only reference - // sub_region is set to the remaining size of the array so we will not have an out of bounds issue + self.state = State::Finished; + None + } + } + State::Finished => None, + } + } + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let mut cur_state = State::Finished; + std::mem::swap(&mut self.state, &mut cur_state); + let res = match cur_state { + State::Pending(mut req) => { + if !req.ready() { + req.set_waker(cx.waker().clone()); + self.state = State::Pending(req); + return Poll::Pending; + } else { + self.state = State::Buffered; + self.index += 1; + self.buf_index += 1; unsafe { - self.array.internal_get(self.index, sub_region).wait(); + self.ptr + .0 + .as_ptr() + .offset(self.buf_index as isize - 1) + .as_ref() } } } - // self.spin_for_valid(self.buf_index); - self.index += 1; - self.buf_index += 1; - unsafe { - self.ptr - .0 - .as_ptr() - .offset(self.buf_index as isize - 1) - .as_ref() + State::Buffered => { + if self.index < self.array.len() { + if self.buf_index == self.buf_0.len() { + //need to get new data + self.buf_index = 0; + let mut req = if self.index + self.buf_0.len() < self.array.len() { + // potentially unsafe depending on the array type (i.e. UnsafeArray - which requries unsafe to construct an iterator), + // but safe with respect to the buf_0 as we have consumed all its content and this is the only reference + unsafe { self.array.internal_get(self.index, &self.buf_0) } + } else { + let sub_region = + self.buf_0.sub_region(0..(self.array.len() - self.index)); + // potentially unsafe depending on the array type (i.e. 
UnsafeArray - which requries unsafe to construct an iterator), + // but safe with respect to the buf_0 as we have consumed all its content and this is the only reference + // sub_region is set to the remaining size of the array so we will not have an out of bounds issue + unsafe { self.array.internal_get(self.index, sub_region) } + }; + req.set_waker(cx.waker().clone()); + self.state = State::Pending(req); + + return Poll::Pending; + } + self.index += 1; + self.buf_index += 1; + unsafe { + self.ptr + .0 + .as_ptr() + .offset(self.buf_index as isize - 1) + .as_ref() + } + } else { + self.state = State::Finished; + None + } } - } else { - None + State::Finished => None, }; - res + Poll::Ready(res) } + // fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + // let mut cur_state = State::Finished; + // std::mem::swap(&mut self.state, &mut cur_state); + // let res = match cur_state { + // State::Pending(mut req) => { + // if !req.ready() { + // req.set_waker(cx.waker().clone()); + // self.state = State::Pending(req); + // return Poll::Pending; + // } + + // let res = if self.index + 1 < self.array.len() { + // self.index += 1; + // let buf_index = self.buf_index as isize; + // self.buf_index += 1; + // if self.buf_index == self.buf_0.len() { + // //prefetch the next data + // self.buf_index = 0; + // // self.fill_buffer(self.index); + // if self.index + self.buf_0.len() < self.array.len() { + // // potentially unsafe depending on the array type (i.e. UnsafeArray - which requries unsafe to construct an iterator), + // // but safe with respect to the buf_0 as we have consumed all its content and self is the only reference + // let req = unsafe { self.array.internal_get(self.index, &self.buf_0) }; + // self.state = State::Pending(req); + // } else if self.index < self.array.len() { + // let sub_region = + // self.buf_0.sub_region(0..(self.array.len() - self.index)); + // // potentially unsafe depending on the array type (i.e. UnsafeArray - which requries unsafe to construct an iterator), + // // but safe with respect to the buf_0 as we have consumed all its content and self is the only reference + // // sub_region is set to the remaining size of the array so we will not have an out of bounds issue + // let req = unsafe { self.array.internal_get(self.index, sub_region) }; + // self.state = State::Pending(req); + // } else { + // self.state = State::Finished; + // } + // } + // // self.spin_for_valid(self.buf_index); + + // unsafe { self.ptr.0.as_ptr().offset(buf_index).as_ref() } + // } else { + // self.state = State::Finished; + // None + // }; + // Poll::Ready(res) + // } + // State::Finished => Poll::Ready(None), + // }; + // res + // } + fn advance_index(&mut self, count: usize) { + let this = Pin::new(self); + this.advance_index_pin(count); + // self.index += count; + // self.buf_index += count; + // if self.buf_index == self.buf_0.len() { + // self.buf_index = 0; + // // self.fill_buffer(0); + // if self.index + self.buf_0.len() < self.array.len() { + // // potentially unsafe depending on the array type (i.e. UnsafeArray - which requries unsafe to construct an iterator), + // // but safe with respect to the buf_0 as we have consumed all its content and self is the only reference + // unsafe { + // self.array.internal_get(self.index, &self.buf_0).wait(); + // } + // } else { + // let sub_region = self.buf_0.sub_region(0..(self.array.len() - self.index)); + // // potentially unsafe depending on the array type (i.e. 
UnsafeArray - which requries unsafe to construct an iterator), + // // but safe with respect to the buf_0 as we have consumed all its content and self is the only reference + // // sub_region is set to the remaining size of the array so we will not have an out of bounds issue + // unsafe { + // self.array.internal_get(self.index, sub_region).wait(); + // } + // } + // } + } + + fn advance_index_pin(mut self: Pin<&mut Self>, count: usize) { + // let this = self.as_mut().project(); self.index += count; self.buf_index += count; if self.buf_index == self.buf_0.len() { @@ -402,18 +686,16 @@ impl<'a, T: Dist + 'static, A: LamellarArrayInternalGet + Clone + Send> OneSi // self.fill_buffer(0); if self.index + self.buf_0.len() < self.array.len() { // potentially unsafe depending on the array type (i.e. UnsafeArray - which requries unsafe to construct an iterator), - // but safe with respect to the buf_0 as we have consumed all its content and this is the only reference - unsafe { - self.array.internal_get(self.index, &self.buf_0).wait(); - } + // but safe with respect to the buf_0 as we have consumed all its content and self is the only reference + let req = unsafe { self.array.internal_get(self.index, &self.buf_0) }; + self.state = State::Pending(req); } else { let sub_region = self.buf_0.sub_region(0..(self.array.len() - self.index)); // potentially unsafe depending on the array type (i.e. UnsafeArray - which requries unsafe to construct an iterator), - // but safe with respect to the buf_0 as we have consumed all its content and this is the only reference + // but safe with respect to the buf_0 as we have consumed all its content and self is the only reference // sub_region is set to the remaining size of the array so we will not have an out of bounds issue - unsafe { - self.array.internal_get(self.index, sub_region).wait(); - } + let req = unsafe { self.array.internal_get(self.index, sub_region) }; + self.state = State::Pending(req); } } } diff --git a/src/array/iterator/one_sided_iterator/buffered.rs b/src/array/iterator/one_sided_iterator/buffered.rs index 09650d96..0a702176 100644 --- a/src/array/iterator/one_sided_iterator/buffered.rs +++ b/src/array/iterator/one_sided_iterator/buffered.rs @@ -24,6 +24,13 @@ where OneSidedMemoryRegion, )>, >, + state: BufferedState, +} + +enum BufferedState { + Ready, + Pending, + Finished, } impl Buffered @@ -41,6 +48,7 @@ where buf_size: buf_size, // buf: mem_region, reqs: VecDeque::new(), + state: BufferedState::Pending, }; for _ in 0..buf.buf_size { buf.initiate_buffer(); @@ -116,6 +124,10 @@ where None } } + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + Poll::Ready(None) + } fn advance_index(&mut self, count: usize) { // println!("advance_index {:?} {:?} {:?} {:?}",self.index, count, count*self.chunk_size,self.array.len()); self.iter.advance_index(count); diff --git a/src/array/iterator/one_sided_iterator/chunks.rs b/src/array/iterator/one_sided_iterator/chunks.rs index 81acbc0a..5ae011e8 100644 --- a/src/array/iterator/one_sided_iterator/chunks.rs +++ b/src/array/iterator/one_sided_iterator/chunks.rs @@ -1,4 +1,5 @@ -use crate::array::iterator::one_sided_iterator::*; +use crate::array::iterator::one_sided_iterator::{private::*, *}; + // use crate::array::LamellarArrayRequest; // use crate::LamellarArray; use crate::memregion::OneSidedMemoryRegion; @@ -15,6 +16,15 @@ where iter: I, index: usize, chunk_size: usize, + state: ChunkState, +} + +enum ChunkState { + Pending( + OneSidedMemoryRegion, + Box>, + ), + Finished, } impl 
Chunks @@ -23,49 +33,103 @@ where { pub(crate) fn new(iter: I, chunk_size: usize) -> Chunks { // let array = iter.array().clone(); //.to_base::(); - // println!("len: {:?}",array.len()); - // let mem_region = iter.array().team().alloc_one_sided_mem_region(chunk_size);//*iter.array().size_of_elem()); + // println!(" Chunks size: {:?}", chunk_size); + let chunks = Chunks { iter, - // array, - // mem_region: mem_region.clone(), index: 0, chunk_size, + state: ChunkState::Finished, }; - // chunks.fill_buffer(0,&mem_region); chunks } - fn get_buffer(&self, size: usize) -> OneSidedMemoryRegion<::ElemType> { - let mem_region: OneSidedMemoryRegion<::ElemType> = - self.array().team_rt().alloc_one_sided_mem_region(size); + fn get_buffer( + array: ::Array, + index: usize, + size: usize, + ) -> ( + OneSidedMemoryRegion, + Box>, + ) { + // println!(" get chunk of len: {:?}", size); + let mem_region: OneSidedMemoryRegion = + array.team_rt().alloc_one_sided_mem_region(size); // potentially unsafe depending on the array type (i.e. UnsafeArray - which requries unsafe to construct an iterator), // but safe with respect to the mem_region as this is the only reference - unsafe { - self.array().internal_get(self.index, &mem_region).wait(); - } - mem_region + let req = unsafe { array.internal_get(index, &mem_region) }; + (mem_region, req) } } -impl OneSidedIterator for Chunks +impl OneSidedIterator for Chunks where I: OneSidedIterator + Send {} + +impl OneSidedIteratorInner for Chunks where I: OneSidedIterator + Send, { type ElemType = I::ElemType; type Item = OneSidedMemoryRegion; type Array = I::Array; + + fn init(&mut self) { + let array = self.array(); + let size = std::cmp::min(self.chunk_size, array.len() - self.index); + let (new_mem_region, new_req) = Self::get_buffer(array, self.index, size); + self.state = ChunkState::Pending(new_mem_region, new_req); + } fn next(&mut self) -> Option { - // println!("{:?} {:?}",self.index,self.array.len()/std::mem::size_of::<::ElemType>()); let array = self.array(); - if self.index < array.len() { - let size = std::cmp::min(self.chunk_size, array.len() - self.index); - - let mem_region = self.get_buffer(size); - self.index += size; - Some(mem_region) - } else { - None + let mut cur_state = ChunkState::Finished; + std::mem::swap(&mut self.state, &mut cur_state); + match cur_state { + ChunkState::Pending(mem_region, req) => { + if self.index + 1 < array.len() { + //prefetch + let size = std::cmp::min(self.chunk_size, array.len() - self.index); + self.index += size; + let (new_mem_region, new_req) = Self::get_buffer(array, self.index, size); + self.state = ChunkState::Pending(new_mem_region, new_req); + } else { + self.state = ChunkState::Finished; + } + req.wait(); + Some(mem_region) + } + ChunkState::Finished => None, + } + } + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let array = self.array(); + let this = self.as_mut().project(); + let mut cur_state = ChunkState::Finished; + + std::mem::swap(&mut *this.state, &mut cur_state); + + match cur_state { + ChunkState::Pending(mem_region, mut req) => { + if !req.ready() { + req.set_waker(cx.waker().clone()); + *this.state = ChunkState::Pending(mem_region, req); + + // println!("not ready"); + return Poll::Pending; + } + if *this.index + 1 < array.len() { + // println!("got chunk! 
{:?}", *this.index); + //prefetch + let size = std::cmp::min(*this.chunk_size, array.len() - *this.index); + *this.index += size; + let (new_mem_region, new_req) = Self::get_buffer(array, *this.index, size); + *this.state = ChunkState::Pending(new_mem_region, new_req); + } else { + // println!("finished chunks!"); + *this.state = ChunkState::Finished; + } + Poll::Ready(Some(mem_region)) + } + ChunkState::Finished => Poll::Ready(None), } } @@ -74,6 +138,23 @@ where self.index += count * self.chunk_size; } + fn advance_index_pin(self: Pin<&mut Self>, count: usize) { + // println!( + // "advance_index_pin {:?} {:?} {:?}", + // self.index, + // count, + // count * self.chunk_size, + // ); + let this = self.project(); + *this.index += count * *this.chunk_size; + // println!( + // "after advance_index_pin {:?} {:?} {:?} ", + // *this.index, + // count, + // count * *this.chunk_size, + // ); + } + fn array(&self) -> Self::Array { self.iter.array() } diff --git a/src/array/iterator/one_sided_iterator/skip.rs b/src/array/iterator/one_sided_iterator/skip.rs index 6e26186d..735dd903 100644 --- a/src/array/iterator/one_sided_iterator/skip.rs +++ b/src/array/iterator/one_sided_iterator/skip.rs @@ -1,4 +1,4 @@ -use crate::array::iterator::one_sided_iterator::*; +use crate::array::iterator::one_sided_iterator::{private::*, *}; // use crate::array::LamellarArrayRequest; // use crate::memregion::OneSidedMemoryRegion; @@ -21,21 +21,37 @@ where } } -impl OneSidedIterator for Skip +impl OneSidedIterator for Skip where I: OneSidedIterator + Send {} + +impl OneSidedIteratorInner for Skip where I: OneSidedIterator + Send, { type ElemType = I::ElemType; - type Item = ::Item; + type Item = ::Item; type Array = I::Array; + + fn init(&mut self) { + self.iter.init() + } + fn next(&mut self) -> Option { self.iter.next() } + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + self.project().iter.poll_next(cx) + } + fn advance_index(&mut self, count: usize) { self.iter.advance_index(count); } + fn advance_index_pin(self: Pin<&mut Self>, count: usize) { + // println!("skipping {count}"); + self.project().iter.advance_index_pin(count); + } + fn array(&self) -> Self::Array { self.iter.array() } diff --git a/src/array/iterator/one_sided_iterator/step_by.rs b/src/array/iterator/one_sided_iterator/step_by.rs index cf929d59..fc616fb3 100644 --- a/src/array/iterator/one_sided_iterator/step_by.rs +++ b/src/array/iterator/one_sided_iterator/step_by.rs @@ -1,4 +1,4 @@ -use crate::array::iterator::one_sided_iterator::*; +use crate::array::iterator::one_sided_iterator::{private::*, *}; // use crate::array::LamellarArrayRequest; // use crate::memregion::OneSidedMemoryRegion; @@ -21,21 +21,48 @@ where } } -impl OneSidedIterator for StepBy +impl OneSidedIterator for StepBy where I: OneSidedIterator + Send {} + +impl OneSidedIteratorInner for StepBy where I: OneSidedIterator + Send, { type ElemType = I::ElemType; - type Item = ::Item; + type Item = ::Item; type Array = I::Array; + + fn init(&mut self) { + self.iter.init() + } + fn next(&mut self) -> Option { let res = self.iter.next()?; self.iter.advance_index(self.step_size - 1); Some(res) } + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let mut this = self.project(); + match this.iter.as_mut().poll_next(cx) { + Poll::Ready(Some(res)) => { + // println!("step by {:?}", *this.step_size); + this.iter.advance_index_pin(*this.step_size - 1); + Poll::Ready(Some(res)) + } + Poll::Ready(None) => Poll::Ready(None), + Poll::Pending => { + // 
println!("step by pending"); + Poll::Pending + } + } + } fn advance_index(&mut self, count: usize) { self.iter.advance_index(count * self.step_size); } + fn advance_index_pin(self: Pin<&mut Self>, count: usize) { + let step_size = self.step_size; + self.project().iter.advance_index_pin(count * step_size); + } fn array(&self) -> Self::Array { self.iter.array() } diff --git a/src/array/iterator/one_sided_iterator/zip.rs b/src/array/iterator/one_sided_iterator/zip.rs index 7d9cd089..014f2513 100644 --- a/src/array/iterator/one_sided_iterator/zip.rs +++ b/src/array/iterator/one_sided_iterator/zip.rs @@ -1,4 +1,4 @@ -use crate::array::iterator::one_sided_iterator::*; +use crate::array::iterator::one_sided_iterator::{private::*, *}; // use crate::array::LamellarArrayRequest; // use crate::memregion::OneSidedMemoryRegion; @@ -26,11 +26,23 @@ use pin_project::pin_project; // } #[pin_project] -pub struct Zip { +pub struct Zip +where + A: OneSidedIterator + Send, + B: OneSidedIterator + Send, +{ #[pin] a: A, #[pin] b: B, + state: ZipState<::Item, ::Item>, +} + +enum ZipState { + Pending, + Finished, + AReady(A), + BReady(B), } impl Zip @@ -39,27 +51,122 @@ where B: OneSidedIterator + Send, { pub(crate) fn new(a: A, b: B) -> Self { - Zip { a, b } + Zip { + a, + b, + state: ZipState::Pending, + } } } impl OneSidedIterator for Zip +where + A: OneSidedIterator + Send, + B: OneSidedIterator + Send, +{ +} + +impl OneSidedIteratorInner for Zip where A: OneSidedIterator + Send, B: OneSidedIterator + Send, { type ElemType = A::ElemType; - type Item = (::Item, ::Item); + type Item = ( + ::Item, + ::Item, + ); type Array = A::Array; + + fn init(&mut self) { + self.a.init(); + self.b.init(); + } fn next(&mut self) -> Option { let a = self.a.next()?; let b = self.b.next()?; Some((a, b)) } + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let mut cur_state = ZipState::Pending; + let this = self.project(); + std::mem::swap(&mut *this.state, &mut cur_state); + + match cur_state { + ZipState::Pending => { + let a = this.a.poll_next(cx); + let b = this.b.poll_next(cx); + match (a, b) { + (Poll::Ready(a), Poll::Ready(b)) => { + if a.is_none() || b.is_none() { + *this.state = ZipState::Finished; + return Poll::Ready(None); + } + *this.state = ZipState::Pending; + Poll::Ready(Some((a.unwrap(), b.unwrap()))) + } + (Poll::Ready(a), Poll::Pending) => match a { + Some(a) => { + *this.state = ZipState::AReady(a); + Poll::Pending + } + None => { + *this.state = ZipState::Finished; + Poll::Ready(None) + } + }, + (Poll::Pending, Poll::Ready(b)) => match b { + Some(b) => { + *this.state = ZipState::BReady(b); + Poll::Pending + } + None => { + *this.state = ZipState::Finished; + Poll::Ready(None) + } + }, + (Poll::Pending, Poll::Pending) => Poll::Pending, + } + } + ZipState::AReady(a) => match this.b.poll_next(cx) { + Poll::Ready(b) => match b { + Some(b) => { + *this.state = ZipState::Pending; + Poll::Ready(Some((a, b))) + } + None => { + *this.state = ZipState::Finished; + Poll::Ready(None) + } + }, + Poll::Pending => Poll::Pending, + }, + ZipState::BReady(b) => match this.a.poll_next(cx) { + Poll::Ready(a) => match a { + Some(a) => { + *this.state = ZipState::Pending; + Poll::Ready(Some((a, b))) + } + None => { + *this.state = ZipState::Finished; + Poll::Ready(None) + } + }, + Poll::Pending => Poll::Pending, + }, + ZipState::Finished => Poll::Ready(None), + } + } fn advance_index(&mut self, count: usize) { self.a.advance_index(count); self.b.advance_index(count); } + fn advance_index_pin(self: Pin<&mut 
Self>, count: usize) { + let this = self.project(); + this.a.advance_index_pin(count); + this.b.advance_index_pin(count); + } fn array(&self) -> Self::Array { self.a.array() } diff --git a/src/array/local_lock_atomic.rs b/src/array/local_lock_atomic.rs index f08cc308..d7ba403d 100644 --- a/src/array/local_lock_atomic.rs +++ b/src/array/local_lock_atomic.rs @@ -527,6 +527,10 @@ impl LocalLockArray { // println!("readonly into_global_lock"); self.array.into() } + + pub fn async_barrier(&self) -> impl std::future::Future + Send + '_ { + self.array.async_barrier() + } } impl LocalLockArray { @@ -834,6 +838,12 @@ impl LamellarRequest for LocalLockArrayReduceHandle { fn get(&self) -> Self::Output { self.req.get() } + fn ready(&self) -> bool { + self.req.ready() + } + fn set_waker(&mut self, waker: futures::task::Waker) { + self.req.set_waker(waker) + } } impl LamellarArrayReduce for LocalLockArray { diff --git a/src/array/native_atomic.rs b/src/array/native_atomic.rs index 9fc0e785..ba268192 100644 --- a/src/array/native_atomic.rs +++ b/src/array/native_atomic.rs @@ -989,6 +989,10 @@ impl NativeAtomicArray { // println!("native into_read_only"); self.array.into() } + + pub fn async_barrier(&self) -> impl std::future::Future + Send + '_ { + self.array.async_barrier() + } } impl TeamFrom<(Vec, Distribution)> for NativeAtomicArray { diff --git a/src/array/operations.rs b/src/array/operations.rs index da6df929..ebf33677 100644 --- a/src/array/operations.rs +++ b/src/array/operations.rs @@ -1,13 +1,13 @@ -use crate::active_messaging::LamellarArcAm; +// use crate::active_messaging::LamellarArcAm; use crate::array::atomic::*; use crate::array::generic_atomic::*; use crate::array::global_lock_atomic::*; use crate::array::local_lock_atomic::*; use crate::array::native_atomic::*; -use crate::array::{AmDist, Dist, LamellarArrayRequest, LamellarEnv, LamellarWriteArray}; -use crate::lamellar_request::LamellarRequest; -use crate::scheduler::Scheduler; -use crate::LamellarTeamRT; +use crate::array::{AmDist, Dist, LamellarEnv, LamellarWriteArray}; +// use crate::lamellar_request::LamellarRequest; +// use crate::scheduler::Scheduler; +// use crate::LamellarTeamRT; pub(crate) mod access; pub use access::{AccessOps, LocalAtomicOps}; @@ -24,13 +24,13 @@ pub use read_only::ReadOnlyOps; pub(crate) mod shift; pub use shift::{ElementShiftOps, LocalShiftOps, ShiftOps}; -use async_trait::async_trait; -use parking_lot::Mutex; -use std::collections::HashMap; -use std::marker::PhantomData; -use std::pin::Pin; -use std::sync::atomic::{AtomicBool, Ordering}; -use std::sync::Arc; +// use async_trait::async_trait; +// use parking_lot::Mutex; +// use std::collections::HashMap; +// use std::marker::PhantomData; +// use std::pin::Pin; +// use std::sync::atomic::{AtomicBool, Ordering}; +// use std::sync::Arc; use std::u8; #[doc(hidden)] @@ -896,436 +896,483 @@ impl<'a, T: Dist + ElementOps> OpInput<'a, T> for NativeAtomicLocalData { } } -#[doc(hidden)] -pub trait BufferOp: Sync + Send { - fn add_ops( - &self, - op: *const u8, - op_data: *const u8, - team: Pin>, - ) -> (bool, Arc); - fn add_fetch_ops( - &self, - pe: usize, - op: *const u8, - op_data: *const u8, - req_ids: &Vec, - res_map: OpResults, - team: Pin>, - ) -> (bool, Arc, Option); - - fn into_arc_am( - &self, - pe: usize, - sub_array: std::ops::Range, - ) -> ( - Vec, - usize, - Arc, - Arc>>, - ); -} +// #[doc(hidden)] +// pub trait BufferOp: Sync + Send { +// fn add_ops( +// &self, +// op: *const u8, +// op_data: *const u8, +// team: Pin>, +// ) -> (bool, Arc); +// fn 
add_fetch_ops( +// &self, +// pe: usize, +// op: *const u8, +// op_data: *const u8, +// req_ids: &Vec, +// res_map: OpResults, +// team: Pin>, +// ) -> (bool, Arc, Option); + +// fn into_arc_am( +// &self, +// pe: usize, +// sub_array: std::ops::Range, +// ) -> ( +// Vec, +// usize, +// Arc, +// Arc>>, +// ); +// } -#[doc(hidden)] -pub type OpResultOffsets = Vec<(usize, usize, usize)>; //reqid,offset,len +// #[doc(hidden)] +// pub type OpResultOffsets = Vec<(usize, usize, usize)>; //reqid,offset,len -#[doc(hidden)] -pub struct OpReqOffsets(Arc>>); //pe -impl OpReqOffsets { - //#[tracing::instrument(skip_all)] - // pub(crate) fn new() -> Self { - // OpReqOffsets(Arc::new(Mutex::new(HashMap::new()))) - // } - //#[tracing::instrument(skip_all)] - pub fn insert(&self, index: usize, indices: OpResultOffsets) { - let mut map = self.0.lock(); - map.insert(index, indices); - } - //#[tracing::instrument(skip_all)] - pub(crate) fn lock(&self) -> parking_lot::MutexGuard> { - self.0.lock() - } -} +// #[doc(hidden)] +// pub struct OpReqOffsets(Arc>>); //pe +// impl OpReqOffsets { +// //#[tracing::instrument(skip_all)] +// // pub(crate) fn new() -> Self { +// // OpReqOffsets(Arc::new(Mutex::new(HashMap::new()))) +// // } +// //#[tracing::instrument(skip_all)] +// pub fn insert(&self, index: usize, indices: OpResultOffsets) { +// let mut map = self.0.lock(); +// map.insert(index, indices); +// } +// //#[tracing::instrument(skip_all)] +// pub(crate) fn lock(&self) -> parking_lot::MutexGuard> { +// self.0.lock() +// } +// } -impl Clone for OpReqOffsets { - fn clone(&self) -> Self { - OpReqOffsets(self.0.clone()) - } -} +// impl Clone for OpReqOffsets { +// fn clone(&self) -> Self { +// OpReqOffsets(self.0.clone()) +// } +// } -impl std::fmt::Debug for OpReqOffsets { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let map = self.0.lock(); - write!(f, "{:?} {:?}", map.len(), map) - } -} +// impl std::fmt::Debug for OpReqOffsets { +// fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +// let map = self.0.lock(); +// write!(f, "{:?} {:?}", map.len(), map) +// } +// } -#[doc(hidden)] -pub type PeOpResults = Arc>>; +// #[doc(hidden)] +// pub type PeOpResults = Arc>>; -#[doc(hidden)] -pub struct OpResults(Arc>>); -impl OpResults { - //#[tracing::instrument(skip_all)] - // pub(crate) fn new() -> Self { - // OpResults(Arc::new(Mutex::new(HashMap::new()))) - // } - //#[tracing::instrument(skip_all)] - pub fn insert(&self, index: usize, val: PeOpResults) { - let mut map = self.0.lock(); - map.insert(index, val); - } - //#[tracing::instrument(skip_all)] - pub(crate) fn lock(&self) -> parking_lot::MutexGuard> { - self.0.lock() - } -} +// #[doc(hidden)] +// pub struct OpResults(Arc>>); +// impl OpResults { +// //#[tracing::instrument(skip_all)] +// // pub(crate) fn new() -> Self { +// // OpResults(Arc::new(Mutex::new(HashMap::new()))) +// // } +// //#[tracing::instrument(skip_all)] +// pub fn insert(&self, index: usize, val: PeOpResults) { +// let mut map = self.0.lock(); +// map.insert(index, val); +// } +// //#[tracing::instrument(skip_all)] +// pub(crate) fn lock(&self) -> parking_lot::MutexGuard> { +// self.0.lock() +// } +// } -impl Clone for OpResults { - fn clone(&self) -> Self { - OpResults(self.0.clone()) - } -} +// impl Clone for OpResults { +// fn clone(&self) -> Self { +// OpResults(self.0.clone()) +// } +// } -impl std::fmt::Debug for OpResults { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let map = self.0.lock(); - write!(f, "{:?} 
{:?}", map.len(), map) - } -} +// impl std::fmt::Debug for OpResults { +// fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +// let map = self.0.lock(); +// write!(f, "{:?} {:?}", map.len(), map) +// } +// } -pub(crate) struct ArrayOpHandle { - pub(crate) reqs: Vec>, -} +// pub(crate) struct ArrayOpHandle { +// pub(crate) reqs: Vec>, +// } -#[derive(Debug)] -pub(crate) struct ArrayOpHandleInner { - pub(crate) complete: Vec>, - pub(crate) scheduler: Arc, -} +// #[derive(Debug)] +// pub(crate) struct ArrayOpHandleInner { +// pub(crate) complete: Vec>, +// pub(crate) scheduler: Arc, +// } -pub(crate) struct ArrayOpFetchHandle { - pub(crate) req: Box>, -} +// pub(crate) struct ArrayOpFetchHandle { +// pub(crate) req: Box>, +// } -pub(crate) struct ArrayOpBatchFetchHandle { - pub(crate) reqs: Vec>>, -} +// pub(crate) struct ArrayOpBatchFetchHandle { +// pub(crate) reqs: Vec>>, +// } -#[derive(Debug)] -pub(crate) struct ArrayOpFetchHandleInner { - pub(crate) indices: OpReqOffsets, - pub(crate) complete: Vec>, - pub(crate) results: OpResults, - pub(crate) req_cnt: usize, - pub(crate) scheduler: Arc, - pub(crate) _phantom: PhantomData, -} +// #[derive(Debug)] +// pub(crate) struct ArrayOpFetchHandleInner { +// pub(crate) indices: OpReqOffsets, +// pub(crate) complete: Vec>, +// pub(crate) results: OpResults, +// pub(crate) req_cnt: usize, +// pub(crate) scheduler: Arc, +// pub(crate) _phantom: PhantomData, +// } -pub(crate) struct ArrayOpResultHandle { - pub(crate) req: Box>, -} -pub(crate) struct ArrayOpBatchResultHandle { - pub(crate) reqs: Vec>>, -} +// pub(crate) struct ArrayOpResultHandle { +// pub(crate) req: Box>, +// } +// pub(crate) struct ArrayOpBatchResultHandle { +// pub(crate) reqs: Vec>>, +// } -#[derive(Debug)] -pub(crate) struct ArrayOpResultHandleInner { - pub(crate) indices: OpReqOffsets, - pub(crate) complete: Vec>, - pub(crate) results: OpResults, - pub(crate) req_cnt: usize, - pub(crate) scheduler: Arc, - pub(crate) _phantom: PhantomData, -} +// #[derive(Debug)] +// pub(crate) struct ArrayOpResultHandleInner { +// pub(crate) indices: OpReqOffsets, +// pub(crate) complete: Vec>, +// pub(crate) results: OpResults, +// pub(crate) req_cnt: usize, +// pub(crate) scheduler: Arc, +// pub(crate) _phantom: PhantomData, +// } -#[async_trait] -impl LamellarRequest for ArrayOpHandle { - type Output = (); - //#[tracing::instrument(skip_all)] - async fn into_future(mut self: Box) -> Self::Output { - for req in self.reqs.drain(..) { - req.into_future().await; - } - () - } - //#[tracing::instrument(skip_all)] - fn get(&self) -> Self::Output { - for req in &self.reqs { - req.get(); - } - () - } -} +// #[async_trait] +// impl LamellarRequest for ArrayOpHandle { +// type Output = (); +// //#[tracing::instrument(skip_all)] +// async fn into_future(mut self: Box) -> Self::Output { +// for req in self.reqs.drain(..) 
{ +// req.into_future().await; +// } +// () +// } +// //#[tracing::instrument(skip_all)] +// fn get(&self) -> Self::Output { +// for req in &self.reqs { +// req.get(); +// } +// () +// } +// fn ready(&self) -> bool { +// self.reqs.iter().all(|req| req.ready()) +// } +// fn set_waker(&mut self, waker: futures::task::Waker) { +// for req in &mut self.reqs { +// req.set_waker(waker.clone()); +// } +// } +// } -#[async_trait] -impl LamellarRequest for ArrayOpHandleInner { - type Output = (); - //#[tracing::instrument(skip_all)] - async fn into_future(mut self: Box) -> Self::Output { - for comp in self.complete { - while comp.load(Ordering::Relaxed) == false { - async_std::task::yield_now().await; - } - } - () - } - //#[tracing::instrument(skip_all)] - fn get(&self) -> Self::Output { - for comp in &self.complete { - while comp.load(Ordering::Relaxed) == false { - // std::thread::yield_now(); - self.scheduler.exec_task(); - } - } - () - } -} +// #[async_trait] +// impl LamellarRequest for ArrayOpHandleInner { +// type Output = (); +// //#[tracing::instrument(skip_all)] +// async fn into_future(mut self: Box) -> Self::Output { +// for comp in self.complete { +// while comp.load(Ordering::Relaxed) == false { +// async_std::task::yield_now().await; +// } +// } +// () +// } +// //#[tracing::instrument(skip_all)] +// fn get(&self) -> Self::Output { +// for comp in &self.complete { +// while comp.load(Ordering::Relaxed) == false { +// // std::thread::yield_now(); +// self.scheduler.exec_task(); +// } +// } +// () +// } -#[async_trait] -impl LamellarRequest for ArrayOpFetchHandle { - type Output = T; - //#[tracing::instrument(skip_all)] - async fn into_future(mut self: Box) -> Self::Output { - self.req - .into_future() - .await - .pop() - .expect("should have a single request") - } - //#[tracing::instrument(skip_all)] - fn get(&self) -> Self::Output { - self.req.get().pop().expect("should have a single request") - } -} +// fn ready(&self) -> bool { +// self.complete +// .iter() +// .all(|comp| comp.load(Ordering::Relaxed)) +// } -#[async_trait] -impl LamellarRequest for ArrayOpBatchFetchHandle { - type Output = Vec; - //#[tracing::instrument(skip_all)] - async fn into_future(mut self: Box) -> Self::Output { - let mut res = vec![]; - for req in self.reqs.drain(..) 
{ - res.extend(req.into_future().await); - } - res - } - //#[tracing::instrument(skip_all)] - fn get(&self) -> Self::Output { - let mut res = vec![]; - for req in &self.reqs { - res.extend(req.get()); - } - // println!("res: {:?}",res); - res - } -} +// fn set_waker(&mut self, waker: futures::task::Waker) { +// self.complete.iter() +// } -impl ArrayOpFetchHandleInner { - //#[tracing::instrument(skip_all)] - fn get_result(&self) -> Vec { - if self.req_cnt > 0 { - let mut res_vec = Vec::with_capacity(self.req_cnt); - unsafe { - res_vec.set_len(self.req_cnt); - } - // println!("req_cnt: {:?}", self.req_cnt); - - for (pe, res) in self.results.lock().iter() { - let res = res.lock(); - for (rid, offset, len) in self.indices.lock().get(pe).unwrap().iter() { - let len = *len; - if len == std::mem::size_of::() + 1 { - panic!( - "unexpected results len {:?} {:?}", - len, - std::mem::size_of::() + 1 - ); - } - let res_t = unsafe { - std::slice::from_raw_parts( - res.as_ptr().offset(*offset as isize) as *const T, - len / std::mem::size_of::(), - ) - }; - // println!("rid {:?} offset {:?} len {:?} {:?}",rid,offset,len,res.len()); - // println!("res {:?} {:?}",res.len(),&res[offset..offset+len]); - // println!("res {:?} {:?}",res_t,res_t.len()); - res_vec[*rid] = res_t[0]; - } - } - res_vec - } else { - vec![] - } - } -} +// #[async_trait] +// impl LamellarRequest for ArrayOpFetchHandle { +// type Output = T; +// //#[tracing::instrument(skip_all)] +// async fn into_future(mut self: Box) -> Self::Output { +// self.req +// .into_future() +// .await +// .pop() +// .expect("should have a single request") +// } +// //#[tracing::instrument(skip_all)] +// fn get(&self) -> Self::Output { +// self.req.get().pop().expect("should have a single request") +// } -#[async_trait] -impl LamellarRequest for ArrayOpFetchHandleInner { - type Output = Vec; - //#[tracing::instrument(skip_all)] - async fn into_future(mut self: Box) -> Self::Output { - for comp in &self.complete { - while comp.load(Ordering::Relaxed) == false { - async_std::task::yield_now().await; - } - } - self.get_result() - } - //#[tracing::instrument(skip_all)] - fn get(&self) -> Self::Output { - for comp in &self.complete { - while comp.load(Ordering::Relaxed) == false { - // std::thread::yield_now(); - self.scheduler.exec_task(); - } - } - self.get_result() - } -} +// fn ready(&self) -> bool { +// self.req.ready() +// } +// } -#[async_trait] -impl LamellarRequest for ArrayOpResultHandle { - type Output = Result; - //#[tracing::instrument(skip_all)] - async fn into_future(mut self: Box) -> Self::Output { - self.req - .into_future() - .await - .pop() - .expect("should have a single request") - } - //#[tracing::instrument(skip_all)] - fn get(&self) -> Self::Output { - self.req.get().pop().expect("should have a single request") - } -} +// #[async_trait] +// impl LamellarRequest for ArrayOpBatchFetchHandle { +// type Output = Vec; +// //#[tracing::instrument(skip_all)] +// async fn into_future(mut self: Box) -> Self::Output { +// let mut res = vec![]; +// for req in self.reqs.drain(..) 
{ +// res.extend(req.into_future().await); +// } +// res +// } +// //#[tracing::instrument(skip_all)] +// fn get(&self) -> Self::Output { +// let mut res = vec![]; +// for req in &self.reqs { +// res.extend(req.get()); +// } +// // println!("res: {:?}",res); +// res +// } -#[async_trait] -impl LamellarRequest for ArrayOpBatchResultHandle { - type Output = Vec>; - //#[tracing::instrument(skip_all)] - async fn into_future(mut self: Box) -> Self::Output { - // println!("num_reqs: {}",self.reqs.len()); - let mut res = vec![]; - for req in self.reqs.drain(..) { - res.extend(req.into_future().await); - } - res - } - //#[tracing::instrument(skip_all)] - fn get(&self) -> Self::Output { - let mut res = vec![]; - for req in &self.reqs { - res.extend(req.get()); - } - res - } -} +// fn ready(&self) -> bool { +// self.reqs.iter().all(|req| req.ready()) +// } +// } -impl ArrayOpResultHandleInner { - //#[tracing::instrument(skip_all)] - fn get_result(&self) -> Vec> { - // println!("req_cnt: {:?}", self.req_cnt); - if self.req_cnt > 0 { - let mut res_vec = Vec::with_capacity(self.req_cnt); - unsafe { - res_vec.set_len(self.req_cnt); - } +// impl ArrayOpFetchHandleInner { +// //#[tracing::instrument(skip_all)] +// fn get_result(&self) -> Vec { +// if self.req_cnt > 0 { +// let mut res_vec = Vec::with_capacity(self.req_cnt); +// unsafe { +// res_vec.set_len(self.req_cnt); +// } +// // println!("req_cnt: {:?}", self.req_cnt); + +// for (pe, res) in self.results.lock().iter() { +// let res = res.lock(); +// for (rid, offset, len) in self.indices.lock().get(pe).unwrap().iter() { +// let len = *len; +// if len == std::mem::size_of::() + 1 { +// panic!( +// "unexpected results len {:?} {:?}", +// len, +// std::mem::size_of::() + 1 +// ); +// } +// let res_t = unsafe { +// std::slice::from_raw_parts( +// res.as_ptr().offset(*offset as isize) as *const T, +// len / std::mem::size_of::(), +// ) +// }; +// // println!("rid {:?} offset {:?} len {:?} {:?}",rid,offset,len,res.len()); +// // println!("res {:?} {:?}",res.len(),&res[offset..offset+len]); +// // println!("res {:?} {:?}",res_t,res_t.len()); +// res_vec[*rid] = res_t[0]; +// } +// } +// res_vec +// } else { +// vec![] +// } +// } +// } - for (pe, res) in self.results.lock().iter() { - let res = res.lock(); - // println!("{pe} {:?}",res.len()); - // let mut rids = std::collections::HashSet::new(); - let res_offsets_lock = self.indices.lock(); - let res_offsets = res_offsets_lock.get(pe).unwrap(); - // println!("{pe} {:?} {:?}",res_offsets[0],res_offsets.last()); - for (rid, offset, len) in res_offsets.iter() { - // if rids.contains(rid){ - // println!("uhhh ohhhhh not sure this should be possible {:?}",rid); - // } - // else{ - // rids.insert(rid); - // } - let ok: bool; - let mut offset = *offset; - let mut len = *len; - if len == std::mem::size_of::() + 1 { - ok = res[offset] == 0; - offset += 1; - len -= 1; - } else { - panic!( - "unexpected results len {:?} {:?}", - len, - std::mem::size_of::() + 1 - ); - }; - let res_t = unsafe { - std::slice::from_raw_parts( - res.as_ptr().offset(offset as isize) as *const T, - len / std::mem::size_of::(), - ) - }; - - if ok { - res_vec[*rid] = Ok(res_t[0]); - } else { - res_vec[*rid] = Err(res_t[0]); - } - } - } - res_vec - } else { - vec![] - } - } -} +// #[async_trait] +// impl LamellarRequest for ArrayOpFetchHandleInner { +// type Output = Vec; +// //#[tracing::instrument(skip_all)] +// async fn into_future(mut self: Box) -> Self::Output { +// for comp in &self.complete { +// while comp.load(Ordering::Relaxed) == 
false { +// async_std::task::yield_now().await; +// } +// } +// self.get_result() +// } +// //#[tracing::instrument(skip_all)] +// fn get(&self) -> Self::Output { +// for comp in &self.complete { +// while comp.load(Ordering::Relaxed) == false { +// // std::thread::yield_now(); +// self.scheduler.exec_task(); +// } +// } +// self.get_result() +// } +// fn ready(&self) -> bool { +// self.complete +// .iter() +// .all(|comp| comp.load(Ordering::Relaxed)) +// } +// } -#[async_trait] -impl LamellarRequest for ArrayOpResultHandleInner { - type Output = Vec>; - //#[tracing::instrument(skip_all)] - async fn into_future(mut self: Box) -> Self::Output { - // println!("comp size: {}",self.complete.len()); - for comp in &self.complete { - while comp.load(Ordering::Relaxed) == false { - async_std::task::yield_now().await; - } - } - self.get_result() - } - //#[tracing::instrument(skip_all)] - fn get(&self) -> Self::Output { - for comp in &self.complete { - while comp.load(Ordering::Relaxed) == false { - // std::thread::yield_now(); - self.scheduler.exec_task(); - } - } - self.get_result() - } -} +// #[async_trait] +// impl LamellarRequest for ArrayOpResultHandle { +// type Output = Result; +// //#[tracing::instrument(skip_all)] +// async fn into_future(mut self: Box) -> Self::Output { +// self.req +// .into_future() +// .await +// .pop() +// .expect("should have a single request") +// } +// //#[tracing::instrument(skip_all)] +// fn get(&self) -> Self::Output { +// self.req.get().pop().expect("should have a single request") +// } + +// fn ready(&self) -> bool { +// self.req.ready() +// } +// } + +// #[async_trait] +// impl LamellarRequest for ArrayOpBatchResultHandle { +// type Output = Vec>; +// //#[tracing::instrument(skip_all)] +// async fn into_future(mut self: Box) -> Self::Output { +// // println!("num_reqs: {}",self.reqs.len()); +// let mut res = vec![]; +// for req in self.reqs.drain(..) 
{ +// res.extend(req.into_future().await); +// } +// res +// } +// //#[tracing::instrument(skip_all)] +// fn get(&self) -> Self::Output { +// let mut res = vec![]; +// for req in &self.reqs { +// res.extend(req.get()); +// } +// res +// } + +// fn ready(&self) -> bool { +// self.reqs.iter().all(|req| req.ready()) +// } +// } + +// impl ArrayOpResultHandleInner { +// //#[tracing::instrument(skip_all)] +// fn get_result(&self) -> Vec> { +// // println!("req_cnt: {:?}", self.req_cnt); +// if self.req_cnt > 0 { +// let mut res_vec = Vec::with_capacity(self.req_cnt); +// unsafe { +// res_vec.set_len(self.req_cnt); +// } + +// for (pe, res) in self.results.lock().iter() { +// let res = res.lock(); +// // println!("{pe} {:?}",res.len()); +// // let mut rids = std::collections::HashSet::new(); +// let res_offsets_lock = self.indices.lock(); +// let res_offsets = res_offsets_lock.get(pe).unwrap(); +// // println!("{pe} {:?} {:?}",res_offsets[0],res_offsets.last()); +// for (rid, offset, len) in res_offsets.iter() { +// // if rids.contains(rid){ +// // println!("uhhh ohhhhh not sure this should be possible {:?}",rid); +// // } +// // else{ +// // rids.insert(rid); +// // } +// let ok: bool; +// let mut offset = *offset; +// let mut len = *len; +// if len == std::mem::size_of::() + 1 { +// ok = res[offset] == 0; +// offset += 1; +// len -= 1; +// } else { +// panic!( +// "unexpected results len {:?} {:?}", +// len, +// std::mem::size_of::() + 1 +// ); +// }; +// let res_t = unsafe { +// std::slice::from_raw_parts( +// res.as_ptr().offset(offset as isize) as *const T, +// len / std::mem::size_of::(), +// ) +// }; + +// if ok { +// res_vec[*rid] = Ok(res_t[0]); +// } else { +// res_vec[*rid] = Err(res_t[0]); +// } +// } +// } +// res_vec +// } else { +// vec![] +// } +// } +// } + +// #[async_trait] +// impl LamellarRequest for ArrayOpResultHandleInner { +// type Output = Vec>; +// //#[tracing::instrument(skip_all)] +// async fn into_future(mut self: Box) -> Self::Output { +// // println!("comp size: {}",self.complete.len()); +// for comp in &self.complete { +// while comp.load(Ordering::Relaxed) == false { +// async_std::task::yield_now().await; +// } +// } +// self.get_result() +// } +// //#[tracing::instrument(skip_all)] +// fn get(&self) -> Self::Output { +// for comp in &self.complete { +// while comp.load(Ordering::Relaxed) == false { +// // std::thread::yield_now(); +// self.scheduler.exec_task(); +// } +// } +// self.get_result() +// } + +// fn ready(&self) -> bool { +// self.complete +// .iter() +// .all(|comp| comp.load(Ordering::Relaxed)) +// } +// } /// Supertrait specifying that array elements must be [Sized] and must be able to be used in remote operations [Dist]. 
pub trait ElementOps: Dist + Sized {} impl ElementOps for T where T: Dist {} -#[doc(hidden)] -pub struct LocalOpResult { - val: T, -} +// #[doc(hidden)] +// pub struct LocalOpResult { +// val: T, +// } -#[async_trait] -impl LamellarArrayRequest for LocalOpResult { - type Output = T; - async fn into_future(mut self: Box) -> Self::Output { - self.val - } - fn wait(self: Box) -> Self::Output { - self.val - } -} +// #[async_trait] +// impl LamellarArrayRequest for LocalOpResult { +// type Output = T; +// async fn into_future(mut self: Box) -> Self::Output { +// self.val +// } +// fn wait(self: Box) -> Self::Output { +// self.val +// } +// fn ready(&self) -> bool { +// true +// } +// } impl ArithmeticOps for LamellarWriteArray {} diff --git a/src/array/read_only.rs b/src/array/read_only.rs index 51002516..3d1963fa 100644 --- a/src/array/read_only.rs +++ b/src/array/read_only.rs @@ -1,51 +1,50 @@ mod iteration; -mod rdma; mod local_chunks; +mod rdma; use crate::array::private::LamellarArrayPrivate; use crate::array::*; use crate::darc::DarcMode; use crate::lamellar_team::{IntoLamellarTeam, LamellarTeamRT}; use crate::memregion::Dist; -use std::any::TypeId; use std::sync::Arc; -type BufFn = fn(ReadOnlyByteArrayWeak) -> Arc; +// type BufFn = fn(ReadOnlyByteArrayWeak) -> Arc; // type MultiMultiFn = fn(ReadOnlyByteArray,ArrayOpCmd,Vec) -> LamellarArcAm; // type MultiSingleFn = fn(ReadOnlyByteArray,ArrayOpCmd,Vec,Vec) -> LamellarArcAm; -lazy_static! { - pub(crate) static ref BUFOPS: HashMap = { - let mut map = HashMap::new(); - for op in crate::inventory::iter:: { - map.insert(op.id.clone(), op.op); - } - map - }; - - // pub(crate) static ref MULTIMULTIOPS: HashMap = { - // let mut map = HashMap::new(); - // for op in crate::inventory::iter:: { - // map.insert(op.id.clone(), op.op); - // } - // map - // }; - - // pub(crate) static ref MULTISINGLEOPS: HashMap = { - // let mut map = HashMap::new(); - // for op in crate::inventory::iter:: { - // map.insert(op.id.clone(), op.op); - // } - // map - // }; +// lazy_static! 
{ +// pub(crate) static ref BUFOPS: HashMap = { +// let mut map = HashMap::new(); +// for op in crate::inventory::iter:: { +// map.insert(op.id.clone(), op.op); +// } +// map +// }; -} +// pub(crate) static ref MULTIMULTIOPS: HashMap = { +// let mut map = HashMap::new(); +// for op in crate::inventory::iter:: { +// map.insert(op.id.clone(), op.op); +// } +// map +// }; -#[doc(hidden)] -pub struct ReadOnlyArrayOpBuf { - pub id: TypeId, - pub op: BufFn, -} +// pub(crate) static ref MULTISINGLEOPS: HashMap = { +// let mut map = HashMap::new(); +// for op in crate::inventory::iter:: { +// map.insert(op.id.clone(), op.op); +// } +// map +// }; + +// } + +// #[doc(hidden)] +// pub struct ReadOnlyArrayOpBuf { +// pub id: TypeId, +// pub op: BufFn, +// } // #[doc(hidden)] // pub struct ReadOnlyArrayMultiMultiOps { @@ -59,7 +58,7 @@ pub struct ReadOnlyArrayOpBuf { // pub op: MultiSingleFn, // } -crate::inventory::collect!(ReadOnlyArrayOpBuf); +// crate::inventory::collect!(ReadOnlyArrayOpBuf); // crate::inventory::collect!(ReadOnlyArrayMultiMultiOps); // crate::inventory::collect!(ReadOnlyArrayMultiSingleOps); @@ -335,6 +334,10 @@ impl ReadOnlyArray { // println!("readonly into_global_lock"); self.array.into() } + + pub fn async_barrier(&self) -> impl std::future::Future + Send + '_ { + self.array.async_barrier() + } } impl ReadOnlyArray { @@ -565,6 +568,7 @@ impl LamellarArray for ReadOnlyArray { fn barrier(&self) { self.array.barrier(); } + fn wait_all(&self) { self.array.wait_all() // println!("done in wait all {:?}",std::time::SystemTime::now()); diff --git a/src/array/unsafe.rs b/src/array/unsafe.rs index f690a9fa..012fc87d 100644 --- a/src/array/unsafe.rs +++ b/src/array/unsafe.rs @@ -1,8 +1,8 @@ mod iteration; +mod local_chunks; pub(crate) mod operations; mod rdma; -mod local_chunks; use crate::active_messaging::*; // use crate::array::r#unsafe::operations::BUFOPS; @@ -631,6 +631,14 @@ impl UnsafeArray { // println!("readonly into_global_lock"); self.into() } + + pub(crate) fn tasking_barrier(&self) { + self.inner.data.team.tasking_barrier(); + } + + pub fn async_barrier(&self) -> impl std::future::Future + Send + '_ { + self.inner.data.team.async_barrier() + } } impl UnsafeArray { diff --git a/src/array/unsafe/rdma.rs b/src/array/unsafe/rdma.rs index 581d5a7f..d5b9833a 100644 --- a/src/array/unsafe/rdma.rs +++ b/src/array/unsafe/rdma.rs @@ -496,7 +496,7 @@ impl UnsafeArray { } #[doc(alias("One-sided", "onesided"))] - /// Performs a blocking (active message based) "Get" of the data in this array starting at the provided index into the specified buffer + /// Performs a blocking "Get" of the data in this array starting at the provided index into the specified buffer /// /// The length of the Get is dictated by the length of the buffer. /// @@ -1044,7 +1044,12 @@ impl LamellarAm for InitSmallGetAm { .pes_for_range(self.index, self.buf.len()) .into_iter() { - // println!("pe {:?}",pe); + // println!( + // "InitSmallGetAm pe {:?} index {:?} len {:?}", + // pe, + // self.index, + // self.buf.len() + // ); let remote_am = UnsafeRemoteSmallGetAm { array: self.array.clone().into(), start_index: self.index, @@ -1059,7 +1064,7 @@ impl LamellarAm for InitSmallGetAm { let mut cur_index = 0; for req in reqs.drain(..) 
{ let data = req.await; - // println!("data recv {:?}",data.len()); + // println!("data recv {:?}", data.len()); u8_buf.put_slice(lamellar::current_pe, cur_index, &data); cur_index += data.len(); } @@ -1098,9 +1103,12 @@ impl LamellarAm for UnsafeRemoteSmallGetAm { //we cant directly do a put from the array in to the data buf //because we need to guarantee the put operation is atomic (maybe iput would work?) async fn exec(self) -> Vec { - // println!("in remotegetam {:?} {:?}",self.start_index,self.len); + // println!( + // "in remotegetam index {:?} len {:?}", + // self.start_index, self.len + // ); // let _lock = self.array.lock.read(); - unsafe { + let vals = unsafe { match self .array .local_elements_for_range(self.start_index, self.len) @@ -1108,7 +1116,9 @@ impl LamellarAm for UnsafeRemoteSmallGetAm { Some((elems, _)) => elems.to_vec(), None => vec![], } - } + }; + // println!("done remotegetam len {:?}", vals.len()); + vals } } diff --git a/src/barrier.rs b/src/barrier.rs index 1ee005fc..d9f3d7a9 100644 --- a/src/barrier.rs +++ b/src/barrier.rs @@ -276,10 +276,10 @@ impl Barrier { } else { if let Ok(val) = std::env::var("LAMELLAR_BARRIER_WARNING") { if val != "0" && val != "false" && val != "no" && val != "off" { - println!("[LAMELLAR WARNING] You are calling barrier from within the main thread (aka from a synchronous context), this is experimental and may result in deadlock! Set LAMELLAR_BARRIER_WARNING=0 to disable this warning"); + println!("[LAMELLAR WARNING] You are calling barrier from within an async context, this is experimental and may result in deadlock! Using 'async_barrier().await;' is likely a better choice. Set LAMELLAR_BARRIER_WARNING=0 to disable this warning"); } } else { - println!("[LAMELLAR WARNING] You are calling barrier from within a worker thread (aka within an async context), this is experimental and may result in deadlock! Set LAMELLAR_BARRIER_WARNING=0 to disable this warning"); + println!("[LAMELLAR WARNING] You are calling barrier from within an async context), this is experimental and may result in deadlock! Using 'async_barrier().await;' is likely a better choice. 
Set LAMELLAR_BARRIER_WARNING=0 to disable this warning"); } self.tasking_barrier() } diff --git a/src/lamellar_request.rs b/src/lamellar_request.rs index 5c812980..2711c8a1 100755 --- a/src/lamellar_request.rs +++ b/src/lamellar_request.rs @@ -5,6 +5,7 @@ use crate::lamellar_arch::LamellarArchRT; use crate::memregion::one_sided::MemRegionHandleInner; use crate::scheduler::Scheduler; use async_trait::async_trait; +use futures::task::Waker; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use std::sync::Arc; @@ -25,6 +26,8 @@ pub trait LamellarRequest: Sync + Send { type Output; async fn into_future(mut self: Box) -> Self::Output; fn get(&self) -> Self::Output; + fn ready(&self) -> bool; + fn set_waker(&mut self, waker: Waker); } #[doc(hidden)] @@ -89,6 +92,7 @@ impl LamellarRequestResult { pub(crate) struct LamellarRequestHandleInner { pub(crate) ready: AtomicBool, + pub(crate) waker: Mutex>, pub(crate) data: Cell>, //we only issue a single request, which the runtime will update, but the user also has a handle so we need a way to mutate pub(crate) team_outstanding_reqs: Arc, pub(crate) world_outstanding_reqs: Arc, @@ -128,6 +132,9 @@ impl LamellarRequestAddResult for LamellarRequestHandleInner { // for a single request this is only called one time by a single runtime thread so use of the cell is safe self.data.set(Some(data)); self.ready.store(true, Ordering::SeqCst); + if let Some(waker) = self.waker.lock().take() { + waker.wake(); + } } //#[tracing::instrument(skip_all)] fn update_counters(&self) { @@ -209,6 +216,14 @@ impl LamellarRequest for LamellarRequestHandle { } self.process_result(self.inner.data.replace(None).expect("result should exist")) } + + fn ready(&self) -> bool { + self.inner.ready.load(Ordering::SeqCst) + } + + fn set_waker(&mut self, waker: Waker) { + *self.inner.waker.lock() = Some(waker); + } } #[derive(Debug)] @@ -216,6 +231,7 @@ pub(crate) struct LamellarMultiRequestHandleInner { pub(crate) cnt: AtomicUsize, pub(crate) arch: Arc, pub(crate) data: Mutex>, + pub(crate) waker: Mutex>, pub(crate) team_outstanding_reqs: Arc, pub(crate) world_outstanding_reqs: Arc, pub(crate) tg_outstanding_reqs: Option>, @@ -247,6 +263,11 @@ impl LamellarRequestAddResult for LamellarMultiRequestHandleInner { let pe = self.arch.team_pe(pe).expect("pe does not exist on team"); self.data.lock().insert(pe, data); self.cnt.fetch_sub(1, Ordering::SeqCst); + if self.cnt.load(Ordering::SeqCst) == 0 { + if let Some(waker) = self.waker.lock().take() { + waker.wake(); + } + } } //#[tracing::instrument(skip_all)] fn update_counters(&self) { @@ -348,7 +369,7 @@ impl LamellarMultiRequest for LamellarMultiRequestHandle { pub(crate) struct LamellarLocalRequestHandleInner { // pub(crate) ready: AtomicBool, pub(crate) ready: (Mutex, Condvar), - // pub(crate) ready_cv: Condvar, + pub(crate) waker: Mutex>, pub(crate) data: Cell>, //we only issue a single request, which the runtime will update, but the user also has a handle so we need a way to mutate pub(crate) team_outstanding_reqs: Arc, pub(crate) world_outstanding_reqs: Arc, @@ -397,6 +418,9 @@ impl LamellarRequestAddResult for LamellarLocalRequestHandleInner { // self.ready.store(true, Ordering::SeqCst); *self.ready.0.lock() = true; self.ready.1.notify_one(); + if let Some(waker) = self.waker.lock().take() { + waker.wake(); + } } //#[tracing::instrument(skip_all)] fn update_counters(&self) { @@ -446,4 +470,14 @@ impl LamellarRequest for LamellarLocalRequestHandle { } self.process_result(self.inner.data.replace(None).expect("result should 
exist")) } + + fn ready(&self) -> bool { + let ready = *self.inner.ready.0.lock(); + // println!("ready: {}", ready); + ready + } + + fn set_waker(&mut self, waker: Waker) { + *self.inner.waker.lock() = Some(waker); + } } diff --git a/src/lamellar_task_group.rs b/src/lamellar_task_group.rs index dd6bf6f3..5190aa6a 100644 --- a/src/lamellar_task_group.rs +++ b/src/lamellar_task_group.rs @@ -20,12 +20,14 @@ use std::marker::PhantomData; use std::pin::Pin; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; +use std::task::Waker; use std::time::Instant; #[derive(Debug)] pub(crate) struct TaskGroupRequestHandleInner { cnt: Arc, data: Mutex>, // + wakers: Mutex>, team_outstanding_reqs: Arc, world_outstanding_reqs: Arc, tg_outstanding_reqs: Option>, @@ -52,6 +54,9 @@ impl LamellarRequestAddResult for TaskGroupRequestHandleInner { } fn add_result(&self, _pe: usize, sub_id: usize, data: InternalResult) { self.data.lock().insert(sub_id, data); + if let Some(waker) = self.wakers.lock().remove(&sub_id) { + waker.wake(); + } } fn update_counters(&self) { let _team_reqs = self.team_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); @@ -128,6 +133,14 @@ impl LamellarRequest for TaskGroupRequestHandle { } self.process_result(res.expect("result should exist")) } + + fn ready(&self) -> bool { + self.inner.data.lock().contains_key(&self.sub_id) + } + + fn set_waker(&mut self, waker: std::task::Waker) { + self.inner.wakers.lock().insert(self.sub_id, waker); + } } #[derive(Debug)] @@ -135,6 +148,7 @@ pub(crate) struct TaskGroupMultiRequestHandleInner { cnt: Arc, arch: Arc, data: Mutex>>, //> + wakers: Mutex>, team_outstanding_reqs: Arc, world_outstanding_reqs: Arc, tg_outstanding_reqs: Option>, @@ -165,6 +179,10 @@ impl LamellarRequestAddResult for TaskGroupMultiRequestHandleInner { map.entry(sub_id) .or_insert_with(|| HashMap::new()) .insert(pe, data); + + if let Some(waker) = self.wakers.lock().remove(&sub_id) { + waker.wake(); + } } fn update_counters(&self) { let _team_reqs = self.team_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); @@ -286,6 +304,7 @@ impl LamellarMultiRequest for TaskGroupMultiRequestHandle { pub(crate) struct TaskGroupLocalRequestHandleInner { cnt: Arc, data: Mutex>, // + wakers: Mutex>, team_outstanding_reqs: Arc, world_outstanding_reqs: Arc, tg_outstanding_reqs: Option>, @@ -316,6 +335,9 @@ impl LamellarRequestAddResult for TaskGroupLocalRequestHandleInner { InternalResult::Remote(_, _) => panic!("unexpected result type"), InternalResult::Unit => self.data.lock().insert(sub_id, Box::new(()) as LamellarAny), }; + if let Some(waker) = self.wakers.lock().remove(&sub_id) { + waker.wake(); + } } fn update_counters(&self) { let _team_reqs = self.team_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); @@ -358,6 +380,14 @@ impl LamellarRequest for TaskGroupLocalRequestHandle { } self.process_result(res.unwrap()) } + + fn ready(&self) -> bool { + self.inner.data.lock().contains_key(&self.sub_id) + } + + fn set_waker(&mut self, waker: futures::task::Waker) { + self.inner.wakers.lock().insert(self.sub_id, waker); + } } /// An abstraction for representing a set of active messages as single group. 
@@ -438,6 +468,10 @@ impl ActiveMessaging for LamellarTaskGroup { self.team.barrier(); } + fn async_barrier(&self) -> impl std::future::Future + Send { + self.team.async_barrier() + } + //#[tracing::instrument(skip_all)] fn exec_am_all(&self, am: F) -> Pin> + Send>> where @@ -491,6 +525,7 @@ impl LamellarTaskGroup { let req = Arc::new(TaskGroupRequestHandleInner { cnt: cnt.clone(), data: Mutex::new(HashMap::new()), + wakers: Mutex::new(HashMap::new()), team_outstanding_reqs: team.team_counters.outstanding_reqs.clone(), world_outstanding_reqs: team.world_counters.outstanding_reqs.clone(), tg_outstanding_reqs: Some(counters.outstanding_reqs.clone()), @@ -501,6 +536,7 @@ impl LamellarTaskGroup { cnt: cnt.clone(), arch: team.arch.clone(), data: Mutex::new(HashMap::new()), + wakers: Mutex::new(HashMap::new()), team_outstanding_reqs: team.team_counters.outstanding_reqs.clone(), world_outstanding_reqs: team.world_counters.outstanding_reqs.clone(), tg_outstanding_reqs: Some(counters.outstanding_reqs.clone()), @@ -512,6 +548,7 @@ impl LamellarTaskGroup { let local_req = Arc::new(TaskGroupLocalRequestHandleInner { cnt: cnt.clone(), data: Mutex::new(HashMap::new()), + wakers: Mutex::new(HashMap::new()), team_outstanding_reqs: team.team_counters.outstanding_reqs.clone(), world_outstanding_reqs: team.world_counters.outstanding_reqs.clone(), tg_outstanding_reqs: Some(counters.outstanding_reqs.clone()), diff --git a/src/lamellar_team.rs b/src/lamellar_team.rs index b2e30533..401e6f29 100644 --- a/src/lamellar_team.rs +++ b/src/lamellar_team.rs @@ -379,6 +379,12 @@ impl LamellarTeam { self.team.barrier() } + pub fn async_barrier(&self) -> impl std::future::Future + Send + '_ { + assert!(self.panic.load(Ordering::SeqCst) == 0); + + self.team.async_barrier() + } + #[doc(hidden)] pub fn exec_am_group_pe( &self, @@ -485,6 +491,12 @@ impl ActiveMessaging for Arc { self.team.barrier(); } + fn async_barrier(&self) -> impl std::future::Future + Send { + assert!(self.panic.load(Ordering::SeqCst) == 0); + + self.team.async_barrier() + } + fn block_on(&self, f: F) -> F::Output { assert!(self.panic.load(Ordering::SeqCst) == 0); @@ -1299,7 +1311,7 @@ impl LamellarTeamRT { } //#[tracing::instrument(skip_all)] - fn wait_all(&self) { + pub(crate) fn wait_all(&self) { let mut temp_now = Instant::now(); while self.panic.load(Ordering::SeqCst) == 0 && (self.team_counters.outstanding_reqs.load(Ordering::SeqCst) > 0 @@ -1382,6 +1394,7 @@ impl LamellarTeamRT { cnt: AtomicUsize::new(self.num_pes), arch: self.arch.clone(), data: Mutex::new(HashMap::new()), + waker: Mutex::new(None), team_outstanding_reqs: self.team_counters.outstanding_reqs.clone(), world_outstanding_reqs: self.world_counters.outstanding_reqs.clone(), tg_outstanding_reqs: tg_outstanding_reqs.clone(), @@ -1453,6 +1466,7 @@ impl LamellarTeamRT { cnt: AtomicUsize::new(self.num_pes), arch: self.arch.clone(), data: Mutex::new(HashMap::new()), + waker: Mutex::new(None), team_outstanding_reqs: self.team_counters.outstanding_reqs.clone(), world_outstanding_reqs: self.world_counters.outstanding_reqs.clone(), tg_outstanding_reqs: tg_outstanding_reqs.clone(), @@ -1535,6 +1549,7 @@ impl LamellarTeamRT { let req = Arc::new(LamellarRequestHandleInner { ready: AtomicBool::new(false), data: Cell::new(None), + waker: Mutex::new(None), team_outstanding_reqs: self.team_counters.outstanding_reqs.clone(), world_outstanding_reqs: self.world_counters.outstanding_reqs.clone(), tg_outstanding_reqs: tg_outstanding_reqs.clone(), @@ -1608,6 +1623,7 @@ impl LamellarTeamRT { let req = 
Arc::new(LamellarRequestHandleInner { ready: AtomicBool::new(false), data: Cell::new(None), + waker: Mutex::new(None), team_outstanding_reqs: self.team_counters.outstanding_reqs.clone(), world_outstanding_reqs: self.world_counters.outstanding_reqs.clone(), tg_outstanding_reqs: tg_outstanding_reqs.clone(), @@ -1680,6 +1696,7 @@ impl LamellarTeamRT { let req = Arc::new(LamellarMultiRequestHandleInner { cnt: AtomicUsize::new(self.num_pes), arch: self.arch.clone(), + waker: Mutex::new(None), data: Mutex::new(HashMap::new()), team_outstanding_reqs: self.team_counters.outstanding_reqs.clone(), world_outstanding_reqs: self.world_counters.outstanding_reqs.clone(), @@ -1750,6 +1767,7 @@ impl LamellarTeamRT { let req = Arc::new(LamellarRequestHandleInner { ready: AtomicBool::new(false), data: Cell::new(None), + waker: Mutex::new(None), team_outstanding_reqs: self.team_counters.outstanding_reqs.clone(), world_outstanding_reqs: self.world_counters.outstanding_reqs.clone(), tg_outstanding_reqs: tg_outstanding_reqs.clone(), @@ -1886,6 +1904,7 @@ impl LamellarTeamRT { let req = Arc::new(LamellarLocalRequestHandleInner { ready: (Mutex::new(false), Condvar::new()), data: Cell::new(None), + waker: Mutex::new(None), team_outstanding_reqs: self.team_counters.outstanding_reqs.clone(), world_outstanding_reqs: self.world_counters.outstanding_reqs.clone(), tg_outstanding_reqs: tg_outstanding_reqs.clone(), diff --git a/src/lamellar_world.rs b/src/lamellar_world.rs index a8233733..eec53b60 100644 --- a/src/lamellar_world.rs +++ b/src/lamellar_world.rs @@ -75,6 +75,10 @@ impl ActiveMessaging for LamellarWorld { self.team.barrier(); } + fn async_barrier(&self) -> impl std::future::Future + Send { + self.team.async_barrier() + } + fn block_on(&self, f: F) -> F::Output where F: Future, diff --git a/src/scheduler.rs b/src/scheduler.rs index 72e422d5..5758d315 100755 --- a/src/scheduler.rs +++ b/src/scheduler.rs @@ -115,14 +115,14 @@ pub(crate) struct Scheduler { impl Scheduler { pub(crate) fn new( - executor: Executor, + executor: Arc, active_message_engine: RegisteredActiveMessages, am_stall_mark: Arc, status: Arc, panic: Arc, ) -> Self { Self { - executor: Arc::new(executor), + executor: executor, active_message_engine, num_ams: Arc::new(AtomicUsize::new(0)), max_ams: Arc::new(AtomicUsize::new(0)), @@ -186,7 +186,6 @@ impl Scheduler { let num_ams = self.num_ams.clone(); let max_ams = self.max_ams.clone(); let ame = self.active_message_engine.clone(); - let executor = self.executor.clone(); let am_future = async move { num_ams.fetch_add(1, Ordering::Relaxed); let _am_id = max_ams.fetch_add(1, Ordering::Relaxed); @@ -198,7 +197,7 @@ impl Scheduler { // ); if let Some(header) = data.deserialize_header() { let msg = header.msg; - ame.exec_msg(msg, data, lamellae, executor).await; + ame.exec_msg(msg, data, lamellae).await; } else { data.print(); panic!("should i be here?"); @@ -265,17 +264,20 @@ impl Scheduler { } pub(crate) fn exec_task(&self) { - if std::thread::current().id() == *crate::MAIN_THREAD { - self.executor.exec_task(); - } else { - std::thread::yield_now(); - } + // if std::thread::current().id() == *crate::MAIN_THREAD { + self.executor.exec_task(); + // } else { + // std::thread::yield_now(); + // } } pub(crate) fn block_on(&self, task: F) -> F::Output { if std::thread::current().id() != *crate::MAIN_THREAD { println!( - "trying to call block on within a worker thread {:?}", + "[LAMELLAR WARNING] trying to call block on within a worker thread {:?} this may result in deadlock. 
+ Typically this means you are running within an async context. If you have something like: + world.block_on(my_future) you can simply change to my_future.await. If this is not the case, + please file an issue on github.", std::backtrace::Backtrace::capture() ) } @@ -331,28 +333,40 @@ pub(crate) fn create_scheduler( ) -> Scheduler { let am_stall_mark = Arc::new(AtomicUsize::new(0)); let status = Arc::new(AtomicU8::new(SchedulerStatus::Active as u8)); - let executor = match executor { + let executor: Arc = Arc::new(match executor { ExecutorType::LamellarWorkStealing => { WorkStealing::new(num_workers, status.clone(), panic.clone()).into() } #[cfg(feature = "tokio-executor")] ExecutorType::Tokio => TokioRt::new(num_workers).into(), - }; + }); let batcher = match std::env::var("LAMELLAR_BATCHER") { Ok(n) => { let n = n.parse::().unwrap(); if n == 1 { - BatcherType::Simple(SimpleBatcher::new(num_pes, am_stall_mark.clone())) + BatcherType::Simple(SimpleBatcher::new( + num_pes, + am_stall_mark.clone(), + executor.clone(), + )) } else { - BatcherType::TeamAm(TeamAmBatcher::new(num_pes, am_stall_mark.clone())) + BatcherType::TeamAm(TeamAmBatcher::new( + num_pes, + am_stall_mark.clone(), + executor.clone(), + )) } } - Err(_) => BatcherType::TeamAm(TeamAmBatcher::new(num_pes, am_stall_mark.clone())), + Err(_) => BatcherType::TeamAm(TeamAmBatcher::new( + num_pes, + am_stall_mark.clone(), + executor.clone(), + )), }; Scheduler::new( - executor, - RegisteredActiveMessages::new(batcher), + executor.clone(), + RegisteredActiveMessages::new(batcher, executor), am_stall_mark, status, panic, diff --git a/src/scheduler/work_stealing.rs b/src/scheduler/work_stealing.rs index 9dbf1204..a5a6738c 100644 --- a/src/scheduler/work_stealing.rs +++ b/src/scheduler/work_stealing.rs @@ -155,7 +155,7 @@ impl LamellarExecutor for WorkStealing { .metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)) .spawn(move |_task_id| async move { task.await }, schedule); - runnable.schedule(); //try to run immediately + runnable.run(); //try to run immediately task.detach(); // }); } From e711b9cac22cd433dad0c0ee2d378cadb199d4c1 Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Mon, 18 Mar 2024 20:48:12 -0700 Subject: [PATCH 016/116] 1. replace Pin> with explicit handle structs that implement Future. 2. Implement Stream for lamellar iterators. 3. Create explicit blocking and async mmethods for various functions where appropriate. 4. Expose LocalLockArray and GlobalLockArray read_lock and write_lock methods. 5. Update LocalLockArray and GlobalLockArray to work with changes to async vs blocking functions. 
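Taken together with the example changes elsewhere in this patch (e.g. `blocking_read_local_data()` in `global_lock_array.rs` and `write_local_data().await` in the blocked GEMM kernel), the split between blocking and async entry points looks roughly like the sketch below. This is an illustrative composite assembled from the patch's own examples, not code taken from the patch; the exact guard types and any behavior beyond the methods shown in the surrounding hunks are assumptions.

```rust
use lamellar::array::prelude::*;

fn main() {
    let world = lamellar::LamellarWorldBuilder::new().build();
    let my_pe = world.my_pe();
    let array = GlobalLockArray::<usize>::new(&world, 100, Distribution::Block);

    // Synchronous context: the explicit blocking variant acquires the lock
    // without needing to be driven by an executor.
    let local = array.blocking_read_local_data();
    // ... inspect local data ...
    drop(local); // release the read lock so writers can proceed

    // Async context: the corresponding method returns a future to await,
    // so acquiring the write guard does not block a worker thread.
    let array2 = array.clone();
    world.block_on(async move {
        let mut local = array2.write_local_data().await;
        local[0] = my_pe; // local update through the write guard
    });
    world.barrier();
}
```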
--- Cargo.toml | 19 +- examples/active_message_examples/am_local.rs | 8 +- .../active_message_examples/recursive_am.rs | 2 +- .../array_consumer_schedules.rs | 3 +- examples/array_examples/onesided_iteration.rs | 5 +- examples/kernels/am_flops.rs | 2 +- examples/kernels/am_gemm.rs | 2 +- examples/kernels/cached_am_gemm.rs | 2 +- examples/kernels/dft_proxy.rs | 14 +- .../kernels/parallel_blocked_array_gemm.rs | 23 +- .../safe_parallel_blocked_array_gemm.rs | 207 ++-- impl/src/array_ops.rs | 6 +- impl/src/array_reduce.rs | 4 +- impl/src/gen_am_group.rs | 2 +- lamellar_run.sh | 2 +- run_examples.sh | 17 +- src/active_messaging.rs | 53 +- src/active_messaging/handle.rs | 428 ++++++++ src/active_messaging/prelude.rs | 5 +- src/array.rs | 563 +++++----- src/array/atomic.rs | 36 +- src/array/atomic/iteration.rs | 160 +-- src/array/atomic/rdma.rs | 6 +- src/array/generic_atomic.rs | 28 +- src/array/generic_atomic/iteration.rs | 78 +- src/array/generic_atomic/rdma.rs | 48 +- src/array/global_lock_atomic.rs | 229 +++- src/array/global_lock_atomic/iteration.rs | 80 +- src/array/global_lock_atomic/rdma.rs | 54 +- src/array/handle.rs | 95 ++ src/array/iterator/consumer.rs | 9 +- src/array/iterator/distributed_iterator.rs | 94 +- .../distributed_iterator/consumer/collect.rs | 123 ++- .../distributed_iterator/consumer/count.rs | 119 ++- .../distributed_iterator/consumer/for_each.rs | 60 +- .../distributed_iterator/consumer/reduce.rs | 155 ++- .../distributed_iterator/consumer/sum.rs | 120 ++- src/array/iterator/local_iterator.rs | 111 +- .../local_iterator/consumer/collect.rs | 155 +-- .../iterator/local_iterator/consumer/count.rs | 74 +- .../local_iterator/consumer/for_each.rs | 60 +- .../local_iterator/consumer/reduce.rs | 124 ++- .../iterator/local_iterator/consumer/sum.rs | 84 +- src/array/iterator/mod.rs | 16 +- src/array/iterator/one_sided_iterator.rs | 157 ++- .../iterator/one_sided_iterator/buffered.rs | 19 +- .../iterator/one_sided_iterator/chunks.rs | 40 +- src/array/iterator/one_sided_iterator/skip.rs | 2 +- .../iterator/one_sided_iterator/step_by.rs | 2 +- src/array/iterator/one_sided_iterator/zip.rs | 6 +- src/array/local_lock_atomic.rs | 195 +++- src/array/local_lock_atomic/iteration.rs | 80 +- src/array/local_lock_atomic/local_chunks.rs | 6 +- src/array/local_lock_atomic/rdma.rs | 48 +- src/array/native_atomic.rs | 28 +- src/array/native_atomic/iteration.rs | 79 +- src/array/native_atomic/rdma.rs | 48 +- src/array/operations.rs | 54 +- src/array/operations/arithmetic.rs | 1 + src/array/operations/read_only.rs | 2 +- src/array/prelude.rs | 15 +- src/array/read_only.rs | 26 +- src/array/read_only/iteration.rs | 81 +- src/array/read_only/rdma.rs | 8 +- src/array/unsafe.rs | 30 +- src/array/unsafe/iteration/consumer.rs | 55 +- src/array/unsafe/iteration/distributed.rs | 42 +- src/array/unsafe/iteration/local.rs | 96 +- src/array/unsafe/operations.rs | 136 ++- src/array/unsafe/rdma.rs | 74 +- src/darc.rs | 149 ++- src/darc/global_rw_darc.rs | 12 +- src/lamellae/rofi_lamellae.rs | 4 +- src/lamellae/shmem_lamellae.rs | 4 +- src/lamellar_request.rs | 990 ++++++++++-------- src/lamellar_task_group.rs | 425 ++++---- src/lamellar_team.rs | 154 +-- src/lamellar_world.rs | 12 +- src/lib.rs | 3 +- src/scheduler.rs | 2 +- src/scheduler/tokio_executor.rs | 10 +- src/scheduler/work_stealing.rs | 17 +- tests/array/arithmetic_ops/add_test.rs | 27 +- tests/array/arithmetic_ops/div_test.rs | 6 +- tests/array/arithmetic_ops/fetch_add_test.rs | 49 +- tests/array/arithmetic_ops/fetch_div_test.rs | 12 +- 
tests/array/arithmetic_ops/fetch_mul_test.rs | 12 +- tests/array/arithmetic_ops/fetch_rem_test.rs | 12 +- tests/array/arithmetic_ops/fetch_sub_test.rs | 18 +- tests/array/arithmetic_ops/mul_test.rs | 6 +- tests/array/arithmetic_ops/rem_test.rs | 6 +- tests/array/arithmetic_ops/sub_test.rs | 12 +- .../array/atomic_ops/compare_exchange_test.rs | 16 +- tests/array/atomic_ops/load_store_test.rs | 6 +- tests/array/atomic_ops/swap_test.rs | 12 +- tests/array/bitwise_ops/and_test.rs | 6 +- tests/array/bitwise_ops/fetch_and_test.rs | 12 +- tests/array/bitwise_ops/fetch_or_test.rs | 12 +- tests/array/bitwise_ops/fetch_xor_test.rs | 12 +- tests/array/bitwise_ops/or_test.rs | 6 +- tests/array/bitwise_ops/xor_test.rs | 6 +- tests/array/rdma/blocking_get_test.rs | 6 +- tests/array/rdma/get_test.rs | 6 +- tests/array/rdma/put_test.rs | 6 +- 104 files changed, 3914 insertions(+), 2949 deletions(-) create mode 100644 src/active_messaging/handle.rs create mode 100644 src/array/handle.rs diff --git a/Cargo.toml b/Cargo.toml index f75afd30..f521bbdc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,39 +18,34 @@ rofisys = { version ="0.3", optional = true } inventory = "0.3" serde = { version = "1.0.147", features = ["derive"] } serde_bytes = "0.11.7" +serde_with = "3.0.0" bincode = "1.3.3" anyhow = "1.0.66" -futures = "0.3.25" -futures-lite= "1.12.0" +futures-util = "0.3.30" +pin-project = "1.1.4" +pin-weak = "1.1.0" lazy_static = "1.4.0" crossbeam = "0.8.2" rand = "0.8.5" parking_lot = {version = "0.12.1", features = ["arc_lock", "send_guard", "serde"] } indexmap = "1.9.1" #lamellar_alloc core_affinity = "0.5.10" -#log = "0.4.19" -#simple_logger = "4.0.0" async-task = "4.3.0" async-trait = "0.1.58" async-std = "1.12.0" async-recursion = "1.0.0" -libc = { version = "0.2.137", optional = true } +async-lock = "2.8.0" enum_dispatch = "0.3.8" memoffset = "0.7.1" shared_memory = "0.12.4" -#raw_sync = "0.1.5" paste = "1.0.9" newtype_derive = "0.1.6" custom_derive = "0.1.7" glob = "0.3.0" thread_local = "1.1.4" -pin-project = "1.1.4" -#enum-as-inner = "0.5.1" -#itertools = "0.10.5" -serde_with = "3.0.0" -pin-weak = "1.1.0" -async-lock = "2.8.0" tokio = { version = "1.35.1", features = ["full"] , optional = true} +libc = { version = "0.2.137", optional = true } + [dev-dependencies] diff --git a/examples/active_message_examples/am_local.rs b/examples/active_message_examples/am_local.rs index 1a891dc6..c38e8b99 100644 --- a/examples/active_message_examples/am_local.rs +++ b/examples/active_message_examples/am_local.rs @@ -109,18 +109,18 @@ fn main() { println!("-----------------------------------"); // println!("---------------------------------------------------------------"); // println!("Testing local am no return"); - // let res = world.exec_am_pe(my_pe, am.clone()).get(); + // let res = world.exec_am_pe(my_pe, am.clone()).blocking_wait(); // assert_eq!(res, None); // println!("no return result: {:?}", res); // println!("-----------------------------------"); // println!("Testing remote am no return"); - // let res = world.exec_am_pe(num_pes - 1, am.clone()).get(); + // let res = world.exec_am_pe(num_pes - 1, am.clone()).blocking_wait(); // assert_eq!(res, None); // println!("no return result: {:?}", res); // println!("-----------------------------------"); // println!("Testing all am no return"); // println!("[{:?}] exec on all", my_pe); - // let res = world.exec_am_all(am.clone()).get(); + // let res = world.exec_am_all(am.clone()).blocking_wait(); // assert!(res.iter().all(|x| x.is_none())); // println!("no 
return result: {:?}", res); // println!("---------------------------------------------------------------"); @@ -128,7 +128,7 @@ fn main() { // println!("---------------------------------------------------------------"); // println!("Testing ring pattern am no return"); - // let res = world.exec_am_pe((my_pe + 1) % num_pes, am.clone()).get(); + // let res = world.exec_am_pe((my_pe + 1) % num_pes, am.clone()).blocking_wait(); // assert_eq!(res, None); // println!("no return result: {:?}", res); // println!("-----------------------------------"); diff --git a/examples/active_message_examples/recursive_am.rs b/examples/active_message_examples/recursive_am.rs index fc7c35b3..d6bdbc85 100644 --- a/examples/active_message_examples/recursive_am.rs +++ b/examples/active_message_examples/recursive_am.rs @@ -43,7 +43,7 @@ impl LamellarAM for RecursiveAM { orig: self.orig, }, ); - // let mut res = next.get().expect("error returning from am"); // this will cause deadlock + // let mut res = next.blocking_wait().expect("error returning from am"); // this will cause deadlock let mut res = next.await; res.push(hostname::get().unwrap().into_string().unwrap()); //append my host name to list returned from previous call res diff --git a/examples/array_examples/array_consumer_schedules.rs b/examples/array_examples/array_consumer_schedules.rs index 2ad821a7..ae94fa65 100644 --- a/examples/array_examples/array_consumer_schedules.rs +++ b/examples/array_examples/array_consumer_schedules.rs @@ -96,7 +96,8 @@ fn sum_with_schedule( let result = array.block_on( array .local_iter() - .filter(|e| e.load() % 2 == 0) + .map(|e| e.load() ) + .filter(|e| e % 2 == 0) .sum_with_schedule(schedule), ); array.barrier(); diff --git a/examples/array_examples/onesided_iteration.rs b/examples/array_examples/onesided_iteration.rs index a572da4a..eb5efbc3 100644 --- a/examples/array_examples/onesided_iteration.rs +++ b/examples/array_examples/onesided_iteration.rs @@ -28,18 +28,19 @@ fn main() { // we do not currently provide a mutable one sided iterator. if my_pe == 0 { + println!("Here"); for elem in block_array.onesided_iter().into_iter() { //we can convert from a oneside iterator into a rust iterator print!("{:?} ", elem); } println!(""); - + println!("Here2"); for elem in cyclic_array.onesided_iter().into_iter() { print!("{:?} ", elem); } println!(""); } - + println!("Here3"); println!("--------------------------------------------------------"); // The lamellar array iterator used above is lazy, meaning that it only accesses and returns a value as its used, diff --git a/examples/kernels/am_flops.rs b/examples/kernels/am_flops.rs index 8425a1a1..7783f39b 100644 --- a/examples/kernels/am_flops.rs +++ b/examples/kernels/am_flops.rs @@ -150,7 +150,7 @@ fn main() { // let cur_t = timer.elapsed().as_secs_f64(); // let tot_flop: usize = reqs // .iter() - // .map(|r| r.get().iter().map(|r| r.unwrap()).sum::()) + // .map(|r| r.blocking_wait().iter().map(|r| r.unwrap()).sum::()) // .sum(); // let task_granularity = ((cur_t * 24f64) / num_tasks as f64) * 1000.0f64; // if my_pe == 0 { diff --git a/examples/kernels/am_gemm.rs b/examples/kernels/am_gemm.rs index e9298d65..84eececb 100644 --- a/examples/kernels/am_gemm.rs +++ b/examples/kernels/am_gemm.rs @@ -10,7 +10,7 @@ /// matrices use row-wise distribution (i.e. 
all elements of a row are local to a pe, /// conversely this means elements of a column are distributed across pes) ///---------------------------------------------------------------------------------- -use futures::future; +use futures_util::{Future,future}; use lamellar::active_messaging::prelude::*; use lamellar::memregion::prelude::*; use lazy_static::lazy_static; diff --git a/examples/kernels/cached_am_gemm.rs b/examples/kernels/cached_am_gemm.rs index 4ad121c2..b0c9c7d5 100644 --- a/examples/kernels/cached_am_gemm.rs +++ b/examples/kernels/cached_am_gemm.rs @@ -255,7 +255,7 @@ fn main() { tasks += 1; } // for req in reqs { - // req.get(); + // req.blocking_wait(); // } } diff --git a/examples/kernels/dft_proxy.rs b/examples/kernels/dft_proxy.rs index a54e8411..ac761824 100644 --- a/examples/kernels/dft_proxy.rs +++ b/examples/kernels/dft_proxy.rs @@ -1,5 +1,5 @@ -// use futures::FutureExt; -use futures::StreamExt; +// use futures_util::FutureExt; +use futures_util::StreamExt; use lamellar::active_messaging::prelude::*; /// ------------Lamellar Bandwidth: DFT Proxy ------------------------- /// This example is inspired from peforming a naive DFT @@ -187,7 +187,7 @@ fn dft_lamellar_am_group( let timer = Instant::now(); - let mut pe_groups = futures::stream::FuturesOrdered::new(); + let mut pe_groups = futures_util::stream::FuturesOrdered::new(); for pe in 0..num_pes { let mut local_sum_group = typed_am_group!(LocalSumAM2, world); for k in 0..local_len { @@ -246,7 +246,7 @@ fn dft_lamellar_am_group_static( let timer = Instant::now(); - let mut pe_groups = futures::stream::FuturesOrdered::new(); + let mut pe_groups = futures_util::stream::FuturesOrdered::new(); for pe in 0..num_pes { let mut local_sum_group = typed_am_group!(LocalSumAM2Static, world); for k in 0..local_len { @@ -773,7 +773,7 @@ fn main() { // println!( // "{:?} array sum: {:?} time: {:?}", // my_pe, - // full_spectrum_array.sum().get(), + // full_spectrum_array.sum().blocking_wait(), // time // ); // } @@ -794,7 +794,7 @@ fn main() { // println!( // "{:?} array sum: {:?} time: {:?}", // my_pe, - // full_spectrum_array.sum().get(), + // full_spectrum_array.sum().blocking_wait(), // time // ); // } @@ -842,7 +842,7 @@ fn main() { // println!( // "{:?} array sum: {:?} time: {:?}", // my_pe, - // full_spectrum_array.sum().get(), + // full_spectrum_array.sum().blocking_wait(), // time // ); // } diff --git a/examples/kernels/parallel_blocked_array_gemm.rs b/examples/kernels/parallel_blocked_array_gemm.rs index 416950ce..1c8382a2 100644 --- a/examples/kernels/parallel_blocked_array_gemm.rs +++ b/examples/kernels/parallel_blocked_array_gemm.rs @@ -1,4 +1,4 @@ -use futures::stream::StreamExt; +use futures_util::stream::StreamExt; use lamellar::array::prelude::*; /// ----------------Lamellar Parallel Blocked Array GEMM--------------------------------------------------- /// This performs a distributed GEMM by partitioning the global matrices (stored in LamellarArrya) @@ -96,12 +96,10 @@ fn main() { let c = c_clone.clone(); async move { //iterate over the submatrix cols of b, use dist_iter() so that we can launch transfers in parallel - // for j_blk in 0..p_blks { // iterate over submatrix rows of b let j_blk = block.j; let k_blk = block.k; // println!("j_blk: {}, k_blk: {}", j_blk, k_blk); - // let b = b_clone.clone(); let b_block = b .onesided_iter() // OneSidedIterator (each pe will iterate through entirety of b) .chunks(blocksize) //chunks columns by blocksize -- manages efficent transfer and placement of data into a local 
memory region @@ -125,7 +123,6 @@ fn main() { for i_blk in 0..m_blks_pe { // iterate of the local submatrix rows of a - // let c = c_clone.clone(); let b_block_vec = b_block_vec.clone(); let a_vec = a .local_as_slice() @@ -136,22 +133,6 @@ fn main() { .flatten() .copied() //get values instead of references .collect::>(); - // a.dist_iter() //DistributedIterator (each pe will iterate through only its local data -- in parallel) - // .chunks(blocksize) //chunks rows by blocksize - // .skip(i_blk * m_blks * blocksize + k_blk) //skip previously visited submatrices - // .step_by(m_blks) //grab chunk from the next row in submatrix - // .take(blocksize) //we only need to take blocksize rows - // .chunks(blocksize) //currently a "hack" for Iterate::collect() - // .for_each(move |a_block| { - // //iterate over local submatrices is submatrix row "i_blk" - // //need to store the submatrix in a contiguous memory segment for use with the MatrixMultiply library - // let mut a_vec = vec![0.0; blocksize * blocksize]; - // for (i, row) in a_block.enumerate() { - // for (j, elem) in row.enumerate() { - // a_vec[i * blocksize + j] = *elem; - // } - // } - // println!("a_vec: {:?}", a_vec); // ------------------------------- let mut c_vec = vec![0.0; blocksize * blocksize]; // MatrixMultiple lib stores result in a contiguous memory segment unsafe { @@ -174,7 +155,6 @@ fn main() { } let c_slice = c.mut_local_data(); - // let _lock = LOCK.lock(); for row in 0..blocksize { let row_offset = (i_blk * blocksize + row) * n; @@ -188,7 +168,6 @@ fn main() { // c.add(row_offset+col_offset,c_vec[row*blocksize + col]); -- but some overheads are introduce from PGAS calculations performed by the runtime, and since its all local updates we can avoid them } } - //}); } } // } diff --git a/examples/kernels/safe_parallel_blocked_array_gemm.rs b/examples/kernels/safe_parallel_blocked_array_gemm.rs index a5ed9544..6c7b6054 100644 --- a/examples/kernels/safe_parallel_blocked_array_gemm.rs +++ b/examples/kernels/safe_parallel_blocked_array_gemm.rs @@ -9,6 +9,7 @@ use lamellar::array::prelude::*; /// to the C matrix are only performed locally, requiring no additional data transfer. 
///---------------------------------------------------------------------------------- use matrixmultiply::sgemm; +use futures_util::stream::StreamExt; fn main() { let args: Vec = std::env::args().collect(); @@ -93,132 +94,92 @@ fn main() { let a = a.clone(); let b = b.clone(); let c_clone = c.clone(); - let gemm = nblks_array.dist_iter().for_each(move |k_blk| { - // let a = a.clone(); - // let b = b.clone(); + let gemm = nblks_array.dist_iter().for_each_async(move |k_blk| { + let a = a.clone(); + let b = b.clone(); let c_clone = c_clone.clone(); - // println!("[{:?}] kblk {k_blk}", my_pe); - // async move { - //iterate over the submatrix cols of b, use dist_iter() so that we can launch transfers in parallel - let my_p_blks = (p_blks_pe * my_pe..p_blks).chain(0..p_blks_pe * my_pe); //start with the local block then proceed in round robin fashion (should hopefully help all PEs requesting data from the same PE at the same time) - for j_blk in my_p_blks { - // println!("\tjblk {j_blk}"); - // iterate over submatrix rows of b - - // let b_block = - // for (j, col) in b - let b_block: Vec = b - .onesided_iter() // OneSidedIterator (each pe will iterate through entirety of b) - .chunks(blocksize) //chunks columns by blocksize -- manages efficent transfer and placement of data into a local memory region - .skip(*k_blk * n_blks * blocksize + j_blk) // skip previously transfered submatrices - .step_by(n_blks) //grab chunk from next column in submatrix - .into_iter() // convert to normal rust iterator - .take(blocksize) // we only need to take blocksize columns - .fold(Vec::new(), |mut vec, x| { - vec.extend_from_slice(unsafe { x.as_slice().unwrap() }); - vec - }); - // .enumerate() - // { - // for (i, elem) in col.as_slice().unwrap().iter().enumerate() { - // b_block_vec[j * blocksize + i] = *elem - // } - // } - // .collect::>(); - // .await; //gather local memory regions containing each columns data - - //need to store the submatrix in a contiguous memory segment for use with the MatrixMultiply library - - // for (j, col) in b_block.iter().enumerate() { - // for (i, elem) in col.as_slice().unwrap().iter().enumerate() { - // b_block_vec[j * blocksize + i] = *elem - // } - // } - // println!("[{:?}] kblk {k_blk} jblk {j_blk}", my_pe); - // println!("b {b_block:?}"); - // let b_block_vec = Arc::new(b_block_vec); //we will be sharing this submatrix in multiple tasks - //-------------- - let a = a.clone(); - let c_clone = c_clone.clone(); - let _inner_gemm = m_blks_pe_array.local_iter().for_each_with_schedule( - Schedule::Chunk(m_blks_pe_array.len()), - move |i_blk| { - // for i_blk in 0..m_blks_pe { - // println!("\t\tiblk {i_blk}"); - // iterate of the local submatrix rows of a - - let c = c_clone.clone(); - let b_block_vec = b_block.clone(); - // a.dist_iter() //DistributedIterator (each pe will iterate through only its local data -- in parallel) - // println!(); - let a_vec: Vec = a - .local_as_slice() - .chunks(blocksize) //chunks rows by blocksize - .skip(i_blk * m_blks * blocksize + *k_blk) //skip previously visited submatrices - .step_by(m_blks) //grab chunk from the next row in submatrix - .take(blocksize) //we only need to take blocksize rows - .fold(Vec::new(), |mut vec, x| { - vec.extend(x); + let m_blks_pe_array = m_blks_pe_array.clone(); + async move{ + // println!("[{:?}] kblk {k_blk}", my_pe); + //iterate over the submatrix cols of b, use dist_iter() so that we can launch transfers in parallel + let my_p_blks = (p_blks_pe * my_pe..p_blks).chain(0..p_blks_pe * my_pe); //start with the 
local block then proceed in round robin fashion (should hopefully help all PEs requesting data from the same PE at the same time) + for j_blk in my_p_blks { + // println!("\tjblk {j_blk}"); + // iterate over submatrix rows of b + + let b_block: Vec = b + .onesided_iter() // OneSidedIterator (each pe will iterate through entirety of b) + .chunks(blocksize) //chunks columns by blocksize -- manages efficent transfer and placement of data into a local memory region + .skip(*k_blk * n_blks * blocksize + j_blk) // skip previously transfered submatrices + .step_by(n_blks) //grab chunk from next column in submatrix + .into_stream() // convert to normal rust iterator + .take(blocksize) // we only need to take blocksize columns + .fold(Vec::new(), |mut vec, x| { + vec.extend_from_slice(unsafe { x.as_slice().unwrap() }); + async move{ vec - }); - // .collect::>(); - // .collect::>(); - // println!("a {a_vec:?}"); - // .iter() - // .chunks(blocksize) //currently a "hack" for Iterate::collect() - // .for_each(move |a_block| { - // println!("{a_block:?}"); - //iterate over local submatrices is submatrix row "i_blk" - // //need to store the submatrix in a contiguous memory segment for use with the MatrixMultiply library - // let mut a_vec = vec![0.0; blocksize * blocksize]; - // for (i, row) in a_block.enumerate() { - // for (j, elem) in row.enumerate() { - // a_vec[i * blocksize + j] = *elem; - // } - // } - // //------------------------------- - let mut c_vec = vec![0.0; blocksize * blocksize]; // MatrixMultiple lib stores result in a contiguous memory segment - unsafe { - sgemm( - blocksize, - blocksize, - blocksize, - 1.0, - a_vec.as_ptr(), - blocksize as isize, - 1, - b_block_vec.as_ptr(), - 1, - blocksize as isize, - 0.0, - c_vec.as_mut_ptr(), - blocksize as isize, - 1, - ); - } - - let mut c_slice = c.blocking_write_local_data(); //this locks the array - - for row in 0..blocksize { - let row_offset = (i_blk * blocksize + row) * n; - for col in 0..blocksize { - let col_offset = j_blk * blocksize + col; - c_slice[row_offset + col_offset] += c_vec[row * blocksize + col]; - //we know all updates to c are local so directly update the raw data - // we could use the array.add interface by calculating the global index: let g_i_blk = i_blk + my_pe *m_blks_pe; and replacing it in row_offset - // c.add(row_offset+col_offset,c_vec[row*blocksize + col]); -- but some overheads are introduce from PGAS calculations performed by the runtime, and since its all local updates we can avoid them } - } - // println!("[{:?}] kblk {k_blk} jblk {j_blk} iblk {i_blk}", my_pe); - // }); - }, - ); - c.block_on(_inner_gemm); //inner_gemm.await; - // println!( - // "[{:?} {:?}] kblk {k_blk} jblk {j_blk} done", - // my_pe, - // std::thread::current().id() - // ); + }).await; + //-------------- + let a = a.clone(); + let c_clone = c_clone.clone(); + let _inner_gemm = m_blks_pe_array.local_iter().for_each_async_with_schedule( + Schedule::Chunk(m_blks_pe_array.len()), + move |i_blk| { + // println!("\t\tiblk {i_blk}"); + // iterate of the local submatrix rows of a + + let c = c_clone.clone(); + let b_block_vec = b_block.clone(); + let a_vec: Vec = a + .local_as_slice() + .chunks(blocksize) //chunks rows by blocksize + .skip(i_blk * m_blks * blocksize + *k_blk) //skip previously visited submatrices + .step_by(m_blks) //grab chunk from the next row in submatrix + .take(blocksize) //we only need to take blocksize rows + .fold(Vec::new(), |mut vec, x| { + vec.extend(x); + vec + }); + + let mut c_vec = vec![0.0; blocksize * blocksize]; 
// MatrixMultiple lib stores result in a contiguous memory segment + unsafe { + sgemm( + blocksize, + blocksize, + blocksize, + 1.0, + a_vec.as_ptr(), + blocksize as isize, + 1, + b_block_vec.as_ptr(), + 1, + blocksize as isize, + 0.0, + c_vec.as_mut_ptr(), + blocksize as isize, + 1, + ); + } + async move { + let mut c_slice = c.write_local_data().await; //this locks the array + + for row in 0..blocksize { + let row_offset = (i_blk * blocksize + row) * n; + for col in 0..blocksize { + let col_offset = j_blk * blocksize + col; + c_slice[row_offset + col_offset] += c_vec[row * blocksize + col]; + //we know all updates to c are local so directly update the raw data + // we could use the array.add interface by calculating the global index: let g_i_blk = i_blk + my_pe *m_blks_pe; and replacing it in row_offset + // c.add(row_offset+col_offset,c_vec[row*blocksize + col]); -- but some overheads are introduce from PGAS calculations performed by the runtime, and since its all local updates we can avoid them + } + } + } + // println!("[{:?}] kblk {k_blk} jblk {j_blk} iblk {i_blk}", my_pe); + // }); + }, + ); + } } // println!( // "[{:?} {:?}] kblk {k_blk} done", diff --git a/impl/src/array_ops.rs b/impl/src/array_ops.rs index f930d59b..c701473e 100644 --- a/impl/src/array_ops.rs +++ b/impl/src/array_ops.rs @@ -326,6 +326,7 @@ fn create_buf_ops( quote! { #val }, //lhs quote! {slice[index].store(val, Ordering::SeqCst)}, //assign quote! { + // println!("old value: {:?}, index: {:?}",slice[index].load(Ordering::SeqCst),index); res.push(slice[index].fetch_add(val, Ordering::SeqCst)); }, //fetch_add quote! { @@ -552,7 +553,7 @@ fn create_buf_ops( } else if array_type == "LocalLockArray" { ( quote! {}, //no explicit lock since the slice handle is a lock guard - quote! {let mut slice = self.data.write_local_data().await;}, //this is the lock + quote! {let mut slice = self.data.write_local_data().await; }, //this is the lock ) } else if array_type == "GlobalLockArray" { ( @@ -928,6 +929,7 @@ fn create_buf_ops( match self.index_size{ 1 => { let indices = unsafe {std::slice::from_raw_parts(self.indices.as_ptr() as *const u8, self.indices.len()/std::mem::size_of::())}; + // println!("Indices: {:?}",indices); match self.op { #single_val_multi_idx_match_stmts } @@ -1277,12 +1279,14 @@ fn create_buf_ops( #[#am(AmGroup(false))] impl LamellarAM for #single_val_multi_idx_am_buf_fetch_name{ //eventually we can return fetchs here too... 
async fn exec(&self) -> Vec<#typeident>{ + // println!("in single val multi idx exec"); #slice let val = self.val; let mut res = Vec::new(); match self.index_size{ 1 => { let indices = unsafe {std::slice::from_raw_parts(self.indices.as_ptr() as *const u8, self.indices.len()/std::mem::size_of::())}; + // println!("indices: {:?}", indices); match self.op { #single_val_multi_idx_fetch_match_stmts } diff --git a/impl/src/array_reduce.rs b/impl/src/array_reduce.rs index ee7629e6..aecb6aa7 100644 --- a/impl/src/array_reduce.rs +++ b/impl/src/array_reduce.rs @@ -97,8 +97,8 @@ fn create_reduction( let mid_pe = (self.start_pe + self.end_pe)/2; let op = #op; let timer = std::time::Instant::now(); - let left = __lamellar_team.exec_am_pe( self.start_pe, #reduction_name { data: self.data.clone(), start_pe: self.start_pe, end_pe: mid_pe});//.into_future(); - let right = __lamellar_team.exec_am_pe( mid_pe+1, #reduction_name { data: self.data.clone(), start_pe: mid_pe+1, end_pe: self.end_pe});//.into_future(); + let left = __lamellar_team.exec_am_pe( self.start_pe, #reduction_name { data: self.data.clone(), start_pe: self.start_pe, end_pe: mid_pe});//; + let right = __lamellar_team.exec_am_pe( mid_pe+1, #reduction_name { data: self.data.clone(), start_pe: mid_pe+1, end_pe: self.end_pe});//; let res = op(left.await,right.await); // println!("[{:?}] {:?} {:?}",__lamellar_current_pe,res,timer.elapsed().as_secs_f64()); diff --git a/impl/src/gen_am_group.rs b/impl/src/gen_am_group.rs index 479a11e1..c57ae4ad 100644 --- a/impl/src/gen_am_group.rs +++ b/impl/src/gen_am_group.rs @@ -373,7 +373,7 @@ fn impl_am_group_user( } // println!("{} pending reqs", self.pending_reqs.len()); - let results = #lamellar::futures::future::join_all(self.pending_reqs.drain(..).map(|req| async { req.into_result().await })).await; + let results = #lamellar::futures_util::future::join_all(self.pending_reqs.drain(..).map(|req| async { req.into_result().await })).await; let num_pes = self.team.num_pes(); #typed_am_group_result_type } diff --git a/lamellar_run.sh b/lamellar_run.sh index 21b28f31..1bb295f0 100755 --- a/lamellar_run.sh +++ b/lamellar_run.sh @@ -34,7 +34,7 @@ for pe in $(seq 0 $ENDPE); do echo "more threads ${E_CORE} than cores ${NPROC} " exit fi - LAMELLAE_BACKEND="shmem" LAMELLAR_MEM_SIZE=$((1*1024*1024*1024)) LAMELLAR_THREADS=$((THREADS)) LAMELLAR_NUM_PES=$NUMPES LAMELLAR_PE_ID=$pe LAMELLAR_JOB_ID=$JOBID $bin "${@:2}" & + LAMELLAE_BACKEND="shmem" LAMELLAR_MEM_SIZE=$((1*1024*1024*1024)) LAMELLAR_THREADS=$((THREADS)) LAMELLAR_NUM_PES=$NUMPES LAMELLAR_PE_ID=$pe LAMELLAR_JOB_ID=$JOBID $bin "${@:2}" &>> ${pe}_out & S_CORE=$(($E_CORE )) E_CORE=$(($S_CORE + $THREADS)) done diff --git a/run_examples.sh b/run_examples.sh index c4fae52e..2147f82f 100755 --- a/run_examples.sh +++ b/run_examples.sh @@ -72,7 +72,7 @@ results_dir=${output_dir}/rofiverbs_lamellae/${local_results_dir} rm -r ${results_dir} rm -r rofiverbs_lamellae -mkdir -p rofiverbs_lamellae +# mkdir -p rofiverbs_lamellae mkdir -p ${results_dir} ln -s ${output_dir}/rofiverbs_lamellae rofiverbs_lamellae @@ -94,6 +94,8 @@ for toolchain in stable; do #nightly; do cd ${mode} for dir in `ls $root/examples`; do + # for dir in kernels; do + # if [ $dir == "array_examples" ]; then mkdir -p $dir cd $dir @@ -119,12 +121,15 @@ for toolchain in stable; do #nightly; do # done fi cd .. 
- cur_tasks=`squeue -u frie869 | grep " R " | wc -l` - while [ $cur_tasks -gt 3 ]; do - cur_tasks=`squeue -u frie869 | grep " R " | wc -l` + sleep 2 + cur_tasks=`squeue -u frie869 | wc -l` + running_tasks=`squeue -u frie869 | grep " R " | wc -l` + while [ $((cur_tasks+running_tasks)) -gt 6 ]; do + cur_tasks=`squeue -u frie869 | wc -l` + running_tasks=`squeue -u frie869 | grep " R " | wc -l` sleep 5 - done - + done + # fi done cd .. wait diff --git a/src/active_messaging.rs b/src/active_messaging.rs index 1ab9dfc9..fcf1d7a9 100644 --- a/src/active_messaging.rs +++ b/src/active_messaging.rs @@ -639,9 +639,9 @@ use crate::lamellar_request::{InternalResult, LamellarRequestResult}; use crate::lamellar_team::{LamellarTeam, LamellarTeamRT}; use crate::memregion::one_sided::NetMemRegionHandle; use crate::scheduler::{Executor, LamellarExecutor, ReqId}; -// use log::trace; + use async_trait::async_trait; -use futures::Future; +use futures_util::Future; use parking_lot::Mutex; use std::collections::HashMap; use std::pin::Pin; @@ -658,6 +658,9 @@ pub use registered_active_message::RegisteredAm; pub(crate) mod batching; +pub(crate) mod handle; +pub use handle::*; + pub(crate) const BATCH_AM_SIZE: usize = 100_000; /// This macro is used to setup the attributed type so that it can be used within remote active messages. @@ -940,6 +943,9 @@ impl AMCounters { /// The interface for launching, executing, and managing Lamellar Active Messages . pub trait ActiveMessaging { + type SinglePeAmHandle; + type MultiAmHandle; + type LocalAmHandle; #[doc(alias("One-sided", "onesided"))] /// launch and execute an active message on every PE (including originating PE). /// @@ -982,7 +988,7 @@ pub trait ActiveMessaging { /// assert_eq!(i,results[i]); /// } ///``` - fn exec_am_all(&self, am: F) -> Pin> + Send>> + fn exec_am_all(&self, am: F) -> Self::MultiAmHandle where F: RemoteActiveMessage + LamellarAM + Serde + AmDist; @@ -1026,7 +1032,7 @@ pub trait ActiveMessaging { /// let result = world.block_on(request); //block until am has executed /// assert_eq!(world.num_pes()-1,result); ///``` - fn exec_am_pe(&self, pe: usize, am: F) -> Pin + Send>> + fn exec_am_pe(&self, pe: usize, am: F) -> Self::SinglePeAmHandle where F: RemoteActiveMessage + LamellarAM + Serde + AmDist; @@ -1073,7 +1079,7 @@ pub trait ActiveMessaging { /// let result = world.block_on(request); //block until am has executed /// assert_eq!(world.my_pe(),result); ///``` - fn exec_am_local(&self, am: F) -> Pin + Send>> + fn exec_am_local(&self, am: F) -> Self::LocalAmHandle where F: LamellarActiveMessage + LocalAM + 'static; @@ -1213,40 +1219,3 @@ pub(crate) trait ActiveMessageEngine { req.add_result(pe, req_id.sub_id, data); } } - -// #[derive(Debug)] -// pub(crate) enum ActiveMessageEngineType { -// RegisteredActiveMessages(RegisteredActiveMessages), -// } - -// #[async_trait] -// impl ActiveMessageEngine for ActiveMessageEngineType { -// async fn process_msg( -// self, -// am: Am, -// executor: Arc, -// stall_mark: usize, -// immediate: bool, -// ) { -// match self { -// ActiveMessageEngineType::RegisteredActiveMessages(remote_am) => { -// remote_am -// .process_msg(am, executor, stall_mark, immediate) -// .await; -// } -// } -// } -// async fn exec_msg( -// self, -// msg: Msg, -// ser_data: SerializedData, -// lamellae: Arc, -// executor: Arc, -// ) { -// match self { -// ActiveMessageEngineType::RegisteredActiveMessages(remote_am) => { -// remote_am.exec_msg(msg, ser_data, lamellae, executor).await; -// } -// } -// } -// } diff --git 
a/src/active_messaging/handle.rs b/src/active_messaging/handle.rs
new file mode 100644
index 00000000..5468c652
--- /dev/null
+++ b/src/active_messaging/handle.rs
@@ -0,0 +1,428 @@
+use std::{
+    cell::Cell,
+    collections::HashMap,
+    pin::Pin,
+    sync::{
+        atomic::{AtomicBool, AtomicU8, AtomicUsize, Ordering},
+        Arc,
+    },
+    task::{Context, Poll, Waker},
+};
+
+use futures_util::Future;
+use parking_lot::Mutex;
+use pin_project::{pin_project, pinned_drop};
+
+use crate::{
+    lamellae::Des,
+    lamellar_request::{InternalResult, LamellarRequest, LamellarRequestAddResult},
+    memregion::one_sided::MemRegionHandleInner,
+    scheduler::Scheduler,
+    Darc, LamellarArchRT,
+};
+
+use super::{AmDist, DarcSerde, RemotePtr};
+
+pub(crate) struct AmHandleInner {
+    pub(crate) ready: AtomicBool,
+    pub(crate) waker: Mutex<Option<Waker>>,
+    pub(crate) data: Cell<Option<InternalResult>>, // we only issue a single request, which the runtime will update, but the user also has a handle so we need a way to mutate
+    pub(crate) team_outstanding_reqs: Arc<AtomicUsize>,
+    pub(crate) world_outstanding_reqs: Arc<AtomicUsize>,
+    pub(crate) tg_outstanding_reqs: Option<Arc<AtomicUsize>>,
+    pub(crate) scheduler: Arc<Scheduler>,
+    pub(crate) user_handle: AtomicU8,
+}
+
+impl std::fmt::Debug for AmHandleInner {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "AmHandleInner {{ ready: {:?}, team_outstanding_reqs: {:?} world_outstanding_reqs {:?} tg_outstanding_reqs {:?} user_handle{:?} }}", self.ready.load(Ordering::Relaxed), self.team_outstanding_reqs.load(Ordering::Relaxed), self.world_outstanding_reqs.load(Ordering::Relaxed), self.tg_outstanding_reqs.as_ref().map(|x| x.load(Ordering::Relaxed)), self.user_handle.load(Ordering::Relaxed))
+    }
+}
+
+// we use the ready bool to protect access to the data field
+unsafe impl Sync for AmHandleInner {}
+
+impl LamellarRequestAddResult for AmHandleInner {
+    fn user_held(&self) -> bool {
+        self.user_handle.load(Ordering::SeqCst) > 0
+    }
+    fn add_result(&self, _pe: usize, _sub_id: usize, data: InternalResult) {
+        // for a single request this is only called one time by a single runtime thread so use of the cell is safe
+        self.data.set(Some(data));
+        self.ready.store(true, Ordering::SeqCst);
+        if let Some(waker) = self.waker.lock().take() {
+            waker.wake();
+        }
+    }
+    fn update_counters(&self) {
+        let _team_reqs = self.team_outstanding_reqs.fetch_sub(1, Ordering::SeqCst);
+        let _world_req = self.world_outstanding_reqs.fetch_sub(1, Ordering::SeqCst);
+        if let Some(tg_outstanding_reqs) = self.tg_outstanding_reqs.clone() {
+            tg_outstanding_reqs.fetch_sub(1, Ordering::SeqCst);
+        }
+    }
+}
+
+#[derive(Debug)]
+#[pin_project(PinnedDrop)]
+pub struct AmHandle<T> {
+    pub(crate) inner: Arc<AmHandleInner>,
+    pub(crate) _phantom: std::marker::PhantomData<T>,
+}
+
+#[pinned_drop]
+impl<T> PinnedDrop for AmHandle<T> {
+    fn drop(self: Pin<&mut Self>) {
+        self.inner.user_handle.fetch_sub(1, Ordering::SeqCst);
+    }
+}
+
+impl<T: AmDist> AmHandle<T> {
+    fn process_result(&self, data: InternalResult) -> T {
+        match data {
+            InternalResult::Local(x) => {
+                if let Ok(result) = x.downcast::<T>() {
+                    *result
+                } else {
+                    panic!("unexpected local result of type ");
+                }
+            }
+            InternalResult::Remote(x, darcs) => {
+                if let Ok(result) = x.deserialize_data::<T>() {
+                    // we need to appropriately set the reference counts if the returned data contains any Darcs
+                    // we "cheat" in that we don't actually care what the Darc wraps (hence the cast to ()) we just care
+                    // that the reference count is updated.
+                    for darc in darcs {
+                        match darc {
+                            RemotePtr::NetworkDarc(darc) => {
+                                let temp: Darc<()> = darc.into();
+                                temp.des(Ok(0));
+                                temp.inc_local_cnt(1); // we drop temp decreasing local count, but need to account for the actual real darc (and we unfortunately cannot enforce the T: DarcSerde bound, or at least I haven't figured out how to yet)
+                            }
+                            RemotePtr::NetMemRegionHandle(mr) => {
+                                let temp: Arc<MemRegionHandleInner> = mr.into();
+                                temp.local_ref.fetch_add(2, Ordering::SeqCst); // Need to increase by two, 1 for temp, 1 for result
+                            }
+                        }
+                    }
+
+                    result
+                } else {
+                    panic!("unexpected remote result of type ");
+                }
+            }
+            InternalResult::Unit => {
+                if let Ok(result) = (Box::new(()) as Box<dyn std::any::Any>).downcast::<T>() {
+                    *result
+                } else {
+                    panic!("unexpected unit result of type ");
+                }
+            }
+        }
+    }
+}
+
+impl<T: AmDist> LamellarRequest for AmHandle<T> {
+    fn blocking_wait(self) -> T {
+        while !self.inner.ready.load(Ordering::SeqCst) {
+            self.inner.scheduler.exec_task();
+        }
+        self.process_result(self.inner.data.replace(None).expect("result should exist"))
+    }
+
+    fn ready_or_set_waker(&mut self, waker: &Waker) -> bool {
+        let mut cur_waker = self.inner.waker.lock();
+        if self.inner.ready.load(Ordering::SeqCst) {
+            true
+        } else {
+            match &mut *cur_waker {
+                Some(cur_waker) => {
+                    if !cur_waker.will_wake(waker) {
+                        println!("WARNING: overwriting waker {:?}", cur_waker);
+                        cur_waker.wake_by_ref();
+                    }
+                    cur_waker.clone_from(waker);
+                }
+                None => {
+                    *cur_waker = Some(waker.clone());
+                }
+            }
+            false
+        }
+    }
+
+    fn val(&self) -> Self::Output {
+        self.process_result(self.inner.data.replace(None).expect("result should exist"))
+    }
+}
+
+impl<T: AmDist> Future for AmHandle<T> {
+    type Output = T;
+    fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
+        let mut this = self.as_mut();
+        if this.ready_or_set_waker(cx.waker()) {
+            Poll::Ready(
+                this.process_result(this.inner.data.replace(None).expect("result should exist")),
+            )
+        } else {
+            Poll::Pending
+        }
+    }
+}
+
+#[derive(Debug)]
+#[pin_project(PinnedDrop)]
+pub struct LocalAmHandle<T> {
+    pub(crate) inner: Arc<AmHandleInner>,
+    pub(crate) _phantom: std::marker::PhantomData<T>,
+}
+
+#[pinned_drop]
+impl<T> PinnedDrop for LocalAmHandle<T> {
+    fn drop(self: Pin<&mut Self>) {
+        self.inner.user_handle.fetch_sub(1, Ordering::SeqCst);
+    }
+}
+
+impl<T: 'static> LocalAmHandle<T> {
+    fn process_result(&self, data: InternalResult) -> T {
+        match data {
+            InternalResult::Local(x) => {
+                if let Ok(result) = x.downcast::<T>() {
+                    *result
+                } else {
+                    panic!("unexpected local result of type ");
+                }
+            }
+            InternalResult::Remote(_x, _darcs) => {
+                panic!("unexpected remote result of type within local am handle");
+            }
+            InternalResult::Unit => {
+                if let Ok(result) = (Box::new(()) as Box<dyn std::any::Any>).downcast::<T>() {
+                    *result
+                } else {
+                    panic!("unexpected unit result of type ");
+                }
+            }
+        }
+    }
+}
+
+impl<T: AmDist> From<LocalAmHandle<T>> for AmHandle<T> {
+    fn from(x: LocalAmHandle<T>) -> Self {
+        x.inner.user_handle.fetch_add(1, Ordering::SeqCst);
+        Self {
+            inner: x.inner.clone(),
+            _phantom: std::marker::PhantomData,
+        }
+    }
+}
+
+impl<T: 'static> LamellarRequest for LocalAmHandle<T> {
+    fn blocking_wait(self) -> T {
+        while !self.inner.ready.load(Ordering::SeqCst) {
+            self.inner.scheduler.exec_task();
+        }
+        let data = self.inner.data.replace(None).expect("result should exist");
+        self.process_result(data)
+    }
+
+    fn ready_or_set_waker(&mut self, waker: &Waker) -> bool {
+        let mut cur_waker = self.inner.waker.lock();
+        if self.inner.ready.load(Ordering::SeqCst) {
+            true
+        } else {
+            match &mut *cur_waker {
+                Some(cur_waker) => {
+                    if !cur_waker.will_wake(waker) {
+                        println!("WARNING: overwriting waker {:?}", cur_waker);
+                        cur_waker.wake_by_ref();
+                    }
+                    cur_waker.clone_from(waker);
+                }
+                None => {
+                    *cur_waker = Some(waker.clone());
+                }
+            }
+            false
+        }
+    }
+
+    fn val(&self) -> Self::Output {
+        let data = self.inner.data.replace(None).expect("result should exist");
+        self.process_result(data)
+    }
+}
+
+impl<T: 'static> Future for LocalAmHandle<T> {
+    type Output = T;
+    fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
+        let mut this = self.as_mut();
+        if this.ready_or_set_waker(cx.waker()) {
+            Poll::Ready(
+                this.process_result(this.inner.data.replace(None).expect("result should exist")),
+            )
+        } else {
+            Poll::Pending
+        }
+    }
+}
+
+#[derive(Debug)]
+pub(crate) struct MultiAmHandleInner {
+    pub(crate) cnt: AtomicUsize,
+    pub(crate) arch: Arc<LamellarArchRT>,
+    pub(crate) data: Mutex<HashMap<usize, InternalResult>>,
+    pub(crate) waker: Mutex<Option<Waker>>,
+    pub(crate) team_outstanding_reqs: Arc<AtomicUsize>,
+    pub(crate) world_outstanding_reqs: Arc<AtomicUsize>,
+    pub(crate) tg_outstanding_reqs: Option<Arc<AtomicUsize>>,
+    pub(crate) scheduler: Arc<Scheduler>,
+    pub(crate) user_handle: AtomicU8, // we can use this flag to optimize what happens when the request returns
+}
+
+#[doc(hidden)]
+#[derive(Debug)]
+#[pin_project(PinnedDrop)]
+pub struct MultiAmHandle<T> {
+    pub(crate) inner: Arc<MultiAmHandleInner>,
+    pub(crate) _phantom: std::marker::PhantomData<T>,
+}
+
+#[pinned_drop]
+impl<T> PinnedDrop for MultiAmHandle<T> {
+    fn drop(self: Pin<&mut Self>) {
+        self.inner.user_handle.fetch_sub(1, Ordering::SeqCst);
+    }
+}
+
+impl LamellarRequestAddResult for MultiAmHandleInner {
+    fn user_held(&self) -> bool {
+        self.user_handle.load(Ordering::SeqCst) > 0
+    }
+    fn add_result(&self, pe: usize, _sub_id: usize, data: InternalResult) {
+        let pe = self.arch.team_pe(pe).expect("pe does not exist on team");
+        self.data.lock().insert(pe, data);
+        self.cnt.fetch_sub(1, Ordering::SeqCst);
+        if self.cnt.load(Ordering::SeqCst) == 0 {
+            if let Some(waker) = self.waker.lock().take() {
+                waker.wake();
+            }
+        }
+    }
+    fn update_counters(&self) {
+        let _team_reqs = self.team_outstanding_reqs.fetch_sub(1, Ordering::SeqCst);
+        let _world_req = self.world_outstanding_reqs.fetch_sub(1, Ordering::SeqCst);
+        if let Some(tg_outstanding_reqs) = self.tg_outstanding_reqs.clone() {
+            tg_outstanding_reqs.fetch_sub(1, Ordering::SeqCst);
+        }
+    }
+}
+
+impl<T: AmDist> MultiAmHandle<T> {
+    fn process_result(&self, data: InternalResult) -> T {
+        match data {
+            InternalResult::Local(x) => {
+                if let Ok(result) = x.downcast::<T>() {
+                    *result
+                } else {
+                    panic!("unexpected local result of type ");
+                }
+            }
+            InternalResult::Remote(x, darcs) => {
+                if let Ok(result) = x.deserialize_data::<T>() {
+                    // we need to appropriately set the reference counts if the returned data contains any Darcs
+                    // we "cheat" in that we don't actually care what the Darc wraps (hence the cast to ()) we just care
+                    // that the reference count is updated.
+                    for darc in darcs {
+                        match darc {
+                            RemotePtr::NetworkDarc(darc) => {
+                                let temp: Darc<()> = darc.into();
+                                temp.des(Ok(0));
+                                temp.inc_local_cnt(1); // we drop temp decreasing local count, but need to account for the actual real darc (and we unfortunately cannot enforce the T: DarcSerde bound, or at least I haven't figured out how to yet)
+                            }
+                            RemotePtr::NetMemRegionHandle(mr) => {
+                                let temp: Arc<MemRegionHandleInner> = mr.into();
+                                temp.local_ref.fetch_add(2, Ordering::SeqCst); // Need to increase by two, 1 for temp, 1 for result
+                            }
+                        }
+                    }
+                    result
+                } else {
+                    panic!("unexpected remote result of type ");
+                }
+            }
+            InternalResult::Unit => {
+                if let Ok(result) = (Box::new(()) as Box<dyn std::any::Any>).downcast::<T>() {
+                    *result
+                } else {
+                    panic!("unexpected unit result of type ");
+                }
+            }
+        }
+    }
+}
+
+impl<T: AmDist> LamellarRequest for MultiAmHandle<T> {
+    fn blocking_wait(self) -> Self::Output {
+        while self.inner.cnt.load(Ordering::SeqCst) > 0 {
+            self.inner.scheduler.exec_task();
+        }
+        let mut res = vec![];
+        let mut data = self.inner.data.lock();
+        // println!("data len{:?}", data.len());
+        for pe in 0..data.len() {
+            res.push(self.process_result(data.remove(&pe).expect("result should exist")));
+        }
+        res
+    }
+
+    fn ready_or_set_waker(&mut self, waker: &Waker) -> bool {
+        let mut cur_waker = self.inner.waker.lock();
+        if self.inner.cnt.load(Ordering::SeqCst) == 0 {
+            true
+        } else {
+            match &mut *cur_waker {
+                Some(cur_waker) => {
+                    if !cur_waker.will_wake(waker) {
+                        println!("WARNING: overwriting waker {:?}", cur_waker);
+                        cur_waker.wake_by_ref();
+                    }
+                    cur_waker.clone_from(waker);
+                }
+                None => {
+                    *cur_waker = Some(waker.clone());
+                }
+            }
+            false
+        }
+    }
+
+    fn val(&self) -> Self::Output {
+        let mut res = vec![];
+        let mut data = self.inner.data.lock();
+        for pe in 0..data.len() {
+            res.push(self.process_result(data.remove(&pe).expect("result should exist")));
+        }
+        res
+    }
+}
+
+impl<T: AmDist> Future for MultiAmHandle<T> {
+    type Output = Vec<T>;
+    fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
+        let mut this = self.as_mut();
+        if this.ready_or_set_waker(cx.waker()) {
+            let mut res = vec![];
+            let mut data = this.inner.data.lock();
+            // println!("data len{:?}", data.len());
+            for pe in 0..data.len() {
+                res.push(this.process_result(data.remove(&pe).expect("result should exist")));
+            }
+            Poll::Ready(res)
+        } else {
+            Poll::Pending
+        }
+    }
+}
diff --git a/src/active_messaging/prelude.rs b/src/active_messaging/prelude.rs
index 28d46901..daaf60d9 100644
--- a/src/active_messaging/prelude.rs
+++ b/src/active_messaging/prelude.rs
@@ -5,11 +5,12 @@
 // };
 // pub use crate::active_messaging::{ActiveMessaging, LamellarAM, LocalAM};
 pub use crate::active_messaging::{
-    am, local_am, typed_am_group, ActiveMessaging, AmData, AmGroupData, AmLocalData, LamellarSerde,
+    am, local_am, typed_am_group, ActiveMessaging, AmData, AmGroupData, AmHandle, AmLocalData,
+    LamellarSerde,
 };
 
 pub use crate::async_trait;
-pub use crate::futures::StreamExt;
+// pub use crate::futures_util::StreamExt;
 pub use crate::inventory;
 pub use crate::lamellar_arch::*;
 pub use crate::lamellar_team::LamellarTeam;
diff --git a/src/array.rs b/src/array.rs
index ab1c23a0..55a454b0 100644
--- a/src/array.rs
+++ b/src/array.rs
@@ -64,7 +64,6 @@
 //! let vec = array.local_data().to_vec();
 //!
``` use crate::lamellar_env::LamellarEnv; -use crate::lamellar_request::LamellarRequest; use crate::memregion::{ one_sided::OneSidedMemoryRegion, shared::SharedMemoryRegion, @@ -76,8 +75,8 @@ use crate::{active_messaging::*, LamellarTeam, LamellarTeamRT}; // use crate::Darc; use async_trait::async_trait; use enum_dispatch::enum_dispatch; -use futures_lite::Future; -use parking_lot::Mutex; +use futures_util::Future; +// use parking_lot::Mutex; use std::collections::HashMap; use std::marker::PhantomData; use std::pin::Pin; @@ -170,6 +169,9 @@ pub use iterator::one_sided_iterator::OneSidedIterator; pub(crate) mod operations; pub use operations::*; +pub(crate) mod handle; +pub use handle::*; + pub(crate) type ReduceGen = fn(LamellarByteArray, usize) -> LamellarArcAm; lazy_static! { @@ -194,26 +196,26 @@ pub struct ReduceKey { } crate::inventory::collect!(ReduceKey); -lamellar_impl::generate_reductions_for_type_rt!(true, u8, usize); -lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, usize); -lamellar_impl::generate_reductions_for_type_rt!(false, f32); -lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32); -impl Dist for bool {} +// lamellar_impl::generate_reductions_for_type_rt!(true, u8, usize); +// lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, usize); +// lamellar_impl::generate_reductions_for_type_rt!(false, f32); +// lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32); +// impl Dist for bool {} -// lamellar_impl::generate_reductions_for_type_rt!(true, u8, u16, u32, u64, usize); -// lamellar_impl::generate_reductions_for_type_rt!(false, u128); -// lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, u16, u32, u64, usize); -// lamellar_impl::generate_ops_for_type_rt!(true, false, true, u128); +lamellar_impl::generate_reductions_for_type_rt!(true, u8, u16, u32, u64, usize); +lamellar_impl::generate_reductions_for_type_rt!(false, u128); +lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, u16, u32, u64, usize); +lamellar_impl::generate_ops_for_type_rt!(true, false, true, u128); -// lamellar_impl::generate_reductions_for_type_rt!(true, i8, i16, i32, i64, isize); -// lamellar_impl::generate_reductions_for_type_rt!(false, i128); -// lamellar_impl::generate_ops_for_type_rt!(true, true, true, i8, i16, i32, i64, isize); -// lamellar_impl::generate_ops_for_type_rt!(true, false, true, i128); +lamellar_impl::generate_reductions_for_type_rt!(true, i8, i16, i32, i64, isize); +lamellar_impl::generate_reductions_for_type_rt!(false, i128); +lamellar_impl::generate_ops_for_type_rt!(true, true, true, i8, i16, i32, i64, isize); +lamellar_impl::generate_ops_for_type_rt!(true, false, true, i128); -// lamellar_impl::generate_reductions_for_type_rt!(false, f32, f64); -// lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32, f64); +lamellar_impl::generate_reductions_for_type_rt!(false, f32, f64); +lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32, f64); -// lamellar_impl::generate_ops_for_bool_rt!(); +lamellar_impl::generate_ops_for_bool_rt!(); impl Dist for Option {} impl ArrayOps for Option {} @@ -256,76 +258,6 @@ pub enum ArrayRdmaCmd { GetAm, } -#[doc(hidden)] -#[async_trait] -pub trait LamellarArrayRequest: Sync + Send { - type Output; - async fn into_future(mut self: Box) -> Self::Output; - fn wait(self: Box) -> Self::Output; - fn ready(&self) -> bool; - fn set_waker(&mut self, waker: futures::task::Waker); -} - -struct ArrayRdmaHandle { - reqs: Vec>>, -} -#[async_trait] -impl LamellarArrayRequest for 
ArrayRdmaHandle { - type Output = (); - async fn into_future(mut self: Box) -> Self::Output { - for req in self.reqs.drain(0..) { - req.into_future().await; - } - () - } - fn wait(mut self: Box) -> Self::Output { - for req in self.reqs.drain(0..) { - req.get(); - } - () - } - fn ready(&self) -> bool { - self.reqs.iter().all(|req| { - // println!("req: {:?}", req.ready()); - req.ready() - }) - } - fn set_waker(&mut self, waker: futures::task::Waker) { - for req in self.reqs.iter_mut() { - req.set_waker(waker.clone()); - } - } -} - -struct ArrayRdmaAtHandle { - reqs: Vec>>, - buf: OneSidedMemoryRegion, -} -#[async_trait] -impl LamellarArrayRequest for ArrayRdmaAtHandle { - type Output = T; - async fn into_future(mut self: Box) -> Self::Output { - for req in self.reqs.drain(0..) { - req.into_future().await; - } - unsafe { self.buf.as_slice().expect("Data should exist on PE")[0] } - } - fn wait(mut self: Box) -> Self::Output { - for req in self.reqs.drain(0..) { - req.get(); - } - unsafe { self.buf.as_slice().expect("Data should exist on PE")[0] } - } - fn ready(&self) -> bool { - self.reqs.iter().all(|req| req.ready()) - } - fn set_waker(&mut self, waker: futures::task::Waker) { - for req in self.reqs.iter_mut() { - req.set_waker(waker.clone()); - } - } -} - /// Registered memory regions that can be used as input to various LamellarArray RDMA operations. // #[enum_dispatch(RegisteredMemoryRegion, SubRegion, TeamFrom,MemoryRegionRDMA,AsBase)] #[derive(Clone, Debug)] @@ -539,7 +471,7 @@ impl TeamTryFrom<(&Vec, Distribution)> for Vec { // } // } -#[async_trait] +// #[async_trait] impl AsyncTeamFrom<(Vec, Distribution)> for Vec { async fn team_from(input: (Vec, Distribution), _team: &Pin>) -> Self { input.0 @@ -596,12 +528,12 @@ pub trait TeamFrom { fn team_from(val: T, team: &Pin>) -> Self; } -#[async_trait] +// #[async_trait] /// Provides the same abstraction as the `From` trait in the standard language, but with a `team` parameter so that lamellar memory regions can be allocated /// and to be used within an async context pub trait AsyncTeamFrom: TeamFrom { /// Converts to this type from the input type - async fn team_from(val: T, team: &Pin>) -> Self; + fn team_from(val: T, team: &Pin>) -> impl Future + Send; } /// Provides the same abstraction as the `TryFrom` trait in the standard language, but with a `team` parameter so that lamellar memory regions can be allocated @@ -774,114 +706,115 @@ impl crate::active_messaging::DarcSerde for LamellarWriteArra } } -impl LamellarArrayReduce for LamellarReadArray { - fn reduce(&self, reduction: &str) -> Pin + Send>> { - match self { - LamellarReadArray::UnsafeArray(array) => unsafe { array.reduce(reduction) }, - LamellarReadArray::AtomicArray(array) => array.reduce(reduction), - LamellarReadArray::LocalLockArray(array) => array.reduce(reduction), - LamellarReadArray::GlobalLockArray(array) => array.reduce(reduction), - LamellarReadArray::ReadOnlyArray(array) => array.reduce(reduction), - } - } -} +// impl LamellarArrayReduce for LamellarReadArray { +// fn reduce(&self, reduction: &str) -> AmHandle { +// match self { +// LamellarReadArray::UnsafeArray(array) => unsafe { array.reduce(reduction) }, +// LamellarReadArray::AtomicArray(array) => array.reduce(reduction), +// LamellarReadArray::LocalLockArray(array) => array.blocking_reduce(reduction), +// LamellarReadArray::GlobalLockArray(array) => array.reduce(reduction), +// LamellarReadArray::ReadOnlyArray(array) => array.reduce(reduction), +// } +// } +// } -impl LamellarArrayArithmeticReduce - for 
LamellarReadArray -{ - fn sum(&self) -> Pin + Send>> { - match self { - LamellarReadArray::UnsafeArray(array) => unsafe { array.sum() }, - LamellarReadArray::AtomicArray(array) => array.sum(), - LamellarReadArray::LocalLockArray(array) => array.sum(), - LamellarReadArray::GlobalLockArray(array) => array.sum(), - LamellarReadArray::ReadOnlyArray(array) => array.sum(), - } - } - fn prod(&self) -> Pin + Send>> { - match self { - LamellarReadArray::UnsafeArray(array) => unsafe { array.prod() }, - LamellarReadArray::AtomicArray(array) => array.prod(), - LamellarReadArray::LocalLockArray(array) => array.prod(), - LamellarReadArray::GlobalLockArray(array) => array.prod(), - LamellarReadArray::ReadOnlyArray(array) => array.prod(), - } - } -} -impl LamellarArrayCompareReduce - for LamellarReadArray -{ - fn max(&self) -> Pin + Send>> { - match self { - LamellarReadArray::UnsafeArray(array) => unsafe { array.max() }, - LamellarReadArray::AtomicArray(array) => array.max(), - LamellarReadArray::LocalLockArray(array) => array.max(), - LamellarReadArray::GlobalLockArray(array) => array.max(), - LamellarReadArray::ReadOnlyArray(array) => array.max(), - } - } - fn min(&self) -> Pin + Send>> { - match self { - LamellarReadArray::UnsafeArray(array) => unsafe { array.min() }, - LamellarReadArray::AtomicArray(array) => array.min(), - LamellarReadArray::LocalLockArray(array) => array.min(), - LamellarReadArray::GlobalLockArray(array) => array.min(), - LamellarReadArray::ReadOnlyArray(array) => array.min(), - } - } -} +// impl LamellarArrayArithmeticReduce +// for LamellarReadArray +// { +// fn sum(&self) -> AmHandle { +// match self { +// LamellarReadArray::UnsafeArray(array) => unsafe { array.sum() }, +// LamellarReadArray::AtomicArray(array) => array.sum(), +// LamellarReadArray::LocalLockArray(array) => array.sum(), +// LamellarReadArray::GlobalLockArray(array) => array.sum(), +// LamellarReadArray::ReadOnlyArray(array) => array.sum(), +// } +// } +// fn prod(&self) -> AmHandle { +// match self { +// LamellarReadArray::UnsafeArray(array) => unsafe { array.prod() }, +// LamellarReadArray::AtomicArray(array) => array.prod(), +// LamellarReadArray::LocalLockArray(array) => array.prod(), +// LamellarReadArray::GlobalLockArray(array) => array.prod(), +// LamellarReadArray::ReadOnlyArray(array) => array.prod(), +// } +// } +// } -impl LamellarArrayReduce for LamellarWriteArray { - fn reduce(&self, reduction: &str) -> Pin + Send>> { - match self { - LamellarWriteArray::UnsafeArray(array) => unsafe { array.reduce(reduction) }, - LamellarWriteArray::AtomicArray(array) => array.reduce(reduction), - LamellarWriteArray::LocalLockArray(array) => array.reduce(reduction), - LamellarWriteArray::GlobalLockArray(array) => array.reduce(reduction), - } - } -} -impl LamellarArrayArithmeticReduce - for LamellarWriteArray -{ - fn sum(&self) -> Pin + Send>> { - match self { - LamellarWriteArray::UnsafeArray(array) => unsafe { array.sum() }, - LamellarWriteArray::AtomicArray(array) => array.sum(), - LamellarWriteArray::LocalLockArray(array) => array.sum(), - LamellarWriteArray::GlobalLockArray(array) => array.sum(), - } - } - fn prod(&self) -> Pin + Send>> { - match self { - LamellarWriteArray::UnsafeArray(array) => unsafe { array.prod() }, - LamellarWriteArray::AtomicArray(array) => array.prod(), - LamellarWriteArray::LocalLockArray(array) => array.prod(), - LamellarWriteArray::GlobalLockArray(array) => array.prod(), - } - } -} +// impl LamellarArrayCompareReduce +// for LamellarReadArray +// { +// fn max(&self) -> AmHandle { +// 
match self { +// LamellarReadArray::UnsafeArray(array) => unsafe { array.max() }, +// LamellarReadArray::AtomicArray(array) => array.max(), +// LamellarReadArray::LocalLockArray(array) => array.max(), +// LamellarReadArray::GlobalLockArray(array) => array.max(), +// LamellarReadArray::ReadOnlyArray(array) => array.max(), +// } +// } +// fn min(&self) -> AmHandle { +// match self { +// LamellarReadArray::UnsafeArray(array) => unsafe { array.min() }, +// LamellarReadArray::AtomicArray(array) => array.min(), +// LamellarReadArray::LocalLockArray(array) => array.min(), +// LamellarReadArray::GlobalLockArray(array) => array.min(), +// LamellarReadArray::ReadOnlyArray(array) => array.min(), +// } +// } +// } -impl LamellarArrayCompareReduce - for LamellarWriteArray -{ - fn max(&self) -> Pin + Send>> { - match self { - LamellarWriteArray::UnsafeArray(array) => unsafe { array.max() }, - LamellarWriteArray::AtomicArray(array) => array.max(), - LamellarWriteArray::LocalLockArray(array) => array.max(), - LamellarWriteArray::GlobalLockArray(array) => array.max(), - } - } - fn min(&self) -> Pin + Send>> { - match self { - LamellarWriteArray::UnsafeArray(array) => unsafe { array.min() }, - LamellarWriteArray::AtomicArray(array) => array.min(), - LamellarWriteArray::LocalLockArray(array) => array.min(), - LamellarWriteArray::GlobalLockArray(array) => array.min(), - } - } -} +// impl LamellarArrayReduce for LamellarWriteArray { +// fn reduce(&self, reduction: &str) -> AmHandle { +// match self { +// LamellarWriteArray::UnsafeArray(array) => unsafe { array.reduce(reduction) }, +// LamellarWriteArray::AtomicArray(array) => array.reduce(reduction), +// LamellarWriteArray::LocalLockArray(array) => array.reduce(reduction), +// LamellarWriteArray::GlobalLockArray(array) => array.reduce(reduction), +// } +// } +// } +// impl LamellarArrayArithmeticReduce +// for LamellarWriteArray +// { +// fn sum(&self) -> AmHandle { +// match self { +// LamellarWriteArray::UnsafeArray(array) => unsafe { array.sum() }, +// LamellarWriteArray::AtomicArray(array) => array.sum(), +// LamellarWriteArray::LocalLockArray(array) => array.sum(), +// LamellarWriteArray::GlobalLockArray(array) => array.sum(), +// } +// } +// fn prod(&self) -> AmHandle { +// match self { +// LamellarWriteArray::UnsafeArray(array) => unsafe { array.prod() }, +// LamellarWriteArray::AtomicArray(array) => array.prod(), +// LamellarWriteArray::LocalLockArray(array) => array.prod(), +// LamellarWriteArray::GlobalLockArray(array) => array.prod(), +// } +// } +// } + +// impl LamellarArrayCompareReduce +// for LamellarWriteArray +// { +// fn max(&self) -> AmHandle { +// match self { +// LamellarWriteArray::UnsafeArray(array) => unsafe { array.max() }, +// LamellarWriteArray::AtomicArray(array) => array.max(), +// LamellarWriteArray::LocalLockArray(array) => array.max(), +// LamellarWriteArray::GlobalLockArray(array) => array.max(), +// } +// } +// fn min(&self) -> AmHandle { +// match self { +// LamellarWriteArray::UnsafeArray(array) => unsafe { array.min() }, +// LamellarWriteArray::AtomicArray(array) => array.min(), +// LamellarWriteArray::LocalLockArray(array) => array.min(), +// LamellarWriteArray::GlobalLockArray(array) => array.min(), +// } +// } +// } pub(crate) mod private { use crate::active_messaging::*; @@ -890,7 +823,6 @@ pub(crate) mod private { /*NativeAtomicArray, GenericAtomicArray,*/ LamellarReadArray, LamellarWriteArray, LocalLockArray, ReadOnlyArray, UnsafeArray, }; - use crate::lamellar_request::{LamellarMultiRequest, LamellarRequest}; use 
crate::memregion::Dist; use crate::LamellarTeamRT; use enum_dispatch::enum_dispatch; @@ -914,31 +846,27 @@ pub(crate) mod private { pub(crate) trait ArrayExecAm { fn team(&self) -> Pin>; fn team_counters(&self) -> Arc; - fn exec_am_local(&self, am: F) -> Box> + fn exec_am_local(&self, am: F) -> LocalAmHandle where F: LamellarActiveMessage + LocalAM + 'static, { self.team().exec_am_local_tg(am, Some(self.team_counters())) } - fn exec_am_pe(&self, pe: usize, am: F) -> Box> + fn exec_am_pe(&self, pe: usize, am: F) -> AmHandle where F: RemoteActiveMessage + LamellarAM + AmDist, { self.team() .exec_am_pe_tg(pe, am, Some(self.team_counters())) } - fn exec_arc_am_pe( - &self, - pe: usize, - am: LamellarArcAm, - ) -> Box> + fn exec_arc_am_pe(&self, pe: usize, am: LamellarArcAm) -> AmHandle where F: AmDist, { self.team() .exec_arc_am_pe(pe, am, Some(self.team_counters())) } - fn exec_am_all(&self, am: F) -> Box> + fn exec_am_all(&self, am: F) -> MultiAmHandle where F: RemoteActiveMessage + LamellarAM + AmDist, { @@ -1372,7 +1300,7 @@ pub trait LamellarArrayGet: LamellarArrayInternalGet { &self, index: usize, dst: U, - ) -> Pin + Send>>; + ) -> ArrayRdmaHandle; #[doc(alias("One-sided", "onesided"))] /// Retrieves the element in this array located at the specified `index` @@ -1420,7 +1348,7 @@ pub trait LamellarArrayGet: LamellarArrayInternalGet { /// PE2: array[9] = 3 /// PE3: array[0] = 0 ///``` - fn at(&self, index: usize) -> Pin + Send>>; + fn at(&self, index: usize) -> ArrayRdmaAtHandle; } #[doc(hidden)] @@ -1430,10 +1358,10 @@ pub trait LamellarArrayInternalGet: LamellarArray { &self, index: usize, dst: U, - ) -> Box>; + ) -> ArrayRdmaHandle; // blocking call that gets the value stored and the provided index - unsafe fn internal_at(&self, index: usize) -> Box>; + unsafe fn internal_at(&self, index: usize) -> ArrayRdmaAtHandle; } /// Interface defining low level APIs for copying data from a buffer or local variable into this array @@ -1516,7 +1444,7 @@ pub trait LamellarArrayPut: LamellarArrayInternalPut { &self, index: usize, src: U, - ) -> Pin + Send>>; + ) -> ArrayRdmaHandle; } #[doc(hidden)] @@ -1527,7 +1455,7 @@ pub trait LamellarArrayInternalPut: LamellarArray { &self, index: usize, src: U, - ) -> Box>; + ) -> ArrayRdmaHandle; } /// An interfacing allowing for conveiniently printing the data contained within a lamellar array @@ -1693,6 +1621,7 @@ pub trait LamellarArrayReduce: LamellarArrayInternalGet where T: Dist + AmDist + 'static, { + type Handle; #[doc(alias("One-sided", "onesided"))] /// Perform a reduction on the entire distributed array, returning the value to the calling PE. /// @@ -1720,116 +1649,116 @@ where /// let sum = array.block_on(array.reduce("sum")); // equivalent to calling array.sum() /// assert_eq!(array.len()*num_pes,sum); ///``` - fn reduce(&self, reduction: &str) -> Pin + Send>>; + fn reduce(&self, reduction: &str) -> Self::Handle; } /// Interface for common arithmetic based reductions -pub trait LamellarArrayArithmeticReduce: LamellarArrayReduce -where - T: Dist + AmDist + ElementArithmeticOps + 'static, -{ - #[doc(alias("One-sided", "onesided"))] - /// Perform a sum reduction on the entire distributed array, returning the value to the calling PE. - /// - /// This equivalent to `reduce("sum")`. - /// - /// # One-sided Operation - /// The calling PE is responsible for launching `Sum` active messages on the other PEs associated with the array. 
- /// the returned sum reduction result is only available on the calling PE - /// - /// # Examples - /// ``` - /// use lamellar::array::prelude::*; - /// use rand::Rng; - /// let world = LamellarWorldBuilder::new().build(); - /// let num_pes = world.num_pes(); - /// let array = AtomicArray::::new(&world,1000000,Distribution::Block); - /// let array_clone = array.clone(); - /// let req = array.local_iter().for_each(move |_| { - /// let index = rand::thread_rng().gen_range(0..array_clone.len()); - /// array_clone.add(index,1); //randomly at one to an element in the array. - /// }); - /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE - /// let sum = array.block_on(array.sum()); - /// assert_eq!(array.len()*num_pes,sum); - ///``` - fn sum(&self) -> Pin + Send>>; - - #[doc(alias("One-sided", "onesided"))] - /// Perform a production reduction on the entire distributed array, returning the value to the calling PE. - /// - /// This equivalent to `reduce("prod")`. - /// - /// # One-sided Operation - /// The calling PE is responsible for launching `Prod` active messages on the other PEs associated with the array. - /// the returned prod reduction result is only available on the calling PE - /// - /// # Examples - /// ``` - /// use lamellar::array::prelude::*; - /// let world = LamellarWorldBuilder::new().build(); - /// let num_pes = world.num_pes(); - /// let array = AtomicArray::::new(&world,10,Distribution::Block); - /// let req = array.dist_iter().enumerate().for_each(move |(i,elem)| { - /// elem.store(i+1); - /// }); - /// array.wait_all(); - /// array.barrier(); - /// let prod = array.block_on(array.prod()); - /// assert_eq!((1..=array.len()).product::(),prod); - ///``` - fn prod(&self) -> Pin + Send>>; -} +// pub trait LamellarArrayArithmeticReduce: LamellarArrayReduce +// where +// T: Dist + AmDist + ElementArithmeticOps + 'static, +// { +// #[doc(alias("One-sided", "onesided"))] +// /// Perform a sum reduction on the entire distributed array, returning the value to the calling PE. +// /// +// /// This equivalent to `reduce("sum")`. +// /// +// /// # One-sided Operation +// /// The calling PE is responsible for launching `Sum` active messages on the other PEs associated with the array. +// /// the returned sum reduction result is only available on the calling PE +// /// +// /// # Examples +// /// ``` +// /// use lamellar::array::prelude::*; +// /// use rand::Rng; +// /// let world = LamellarWorldBuilder::new().build(); +// /// let num_pes = world.num_pes(); +// /// let array = AtomicArray::::new(&world,1000000,Distribution::Block); +// /// let array_clone = array.clone(); +// /// let req = array.local_iter().for_each(move |_| { +// /// let index = rand::thread_rng().gen_range(0..array_clone.len()); +// /// array_clone.add(index,1); //randomly at one to an element in the array. +// /// }); +// /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE +// /// let sum = array.block_on(array.sum()); +// /// assert_eq!(array.len()*num_pes,sum); +// ///``` +// fn sum(&self) -> Self::Handle; + +// #[doc(alias("One-sided", "onesided"))] +// /// Perform a production reduction on the entire distributed array, returning the value to the calling PE. +// /// +// /// This equivalent to `reduce("prod")`. +// /// +// /// # One-sided Operation +// /// The calling PE is responsible for launching `Prod` active messages on the other PEs associated with the array. 
+// /// the returned prod reduction result is only available on the calling PE +// /// +// /// # Examples +// /// ``` +// /// use lamellar::array::prelude::*; +// /// let world = LamellarWorldBuilder::new().build(); +// /// let num_pes = world.num_pes(); +// /// let array = AtomicArray::::new(&world,10,Distribution::Block); +// /// let req = array.dist_iter().enumerate().for_each(move |(i,elem)| { +// /// elem.store(i+1); +// /// }); +// /// array.wait_all(); +// /// array.barrier(); +// /// let prod = array.block_on(array.prod()); +// /// assert_eq!((1..=array.len()).product::(),prod); +// ///``` +// fn prod(&self) -> Self::Handle; +// } /// Interface for common compare based reductions -pub trait LamellarArrayCompareReduce: LamellarArrayReduce -where - T: Dist + AmDist + ElementComparePartialEqOps + 'static, -{ - #[doc(alias("One-sided", "onesided"))] - /// Find the max element in the entire destributed array, returning to the calling PE - /// - /// This equivalent to `reduce("max")`. - /// - /// # One-sided Operation - /// The calling PE is responsible for launching `Max` active messages on the other PEs associated with the array. - /// the returned max reduction result is only available on the calling PE - /// - /// # Examples - /// ``` - /// use lamellar::array::prelude::*; - /// let world = LamellarWorldBuilder::new().build(); - /// let num_pes = world.num_pes(); - /// let array = AtomicArray::::new(&world,10,Distribution::Block); - /// let req = array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2)); - /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE - /// let max = array.block_on(array.max()); - /// assert_eq!((array.len()-1)*2,max); - ///``` - fn max(&self) -> Pin + Send>>; - - #[doc(alias("One-sided", "onesided"))] - /// Find the min element in the entire destributed array, returning to the calling PE - /// - /// This equivalent to `reduce("min")`. - /// - /// # One-sided Operation - /// The calling PE is responsible for launching `Min` active messages on the other PEs associated with the array. - /// the returned min reduction result is only available on the calling PE - /// - /// # Examples - /// ``` - /// use lamellar::array::prelude::*; - /// let world = LamellarWorldBuilder::new().build(); - /// let num_pes = world.num_pes(); - /// let array = AtomicArray::::new(&world,10,Distribution::Block); - /// let req = array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2)); - /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE - /// let min = array.block_on(array.min()); - /// assert_eq!(0,min); - ///``` - fn min(&self) -> Pin + Send>>; -} +// pub trait LamellarArrayCompareReduce: LamellarArrayReduce +// where +// T: Dist + AmDist + ElementComparePartialEqOps + 'static, +// { +// #[doc(alias("One-sided", "onesided"))] +// /// Find the max element in the entire destributed array, returning to the calling PE +// /// +// /// This equivalent to `reduce("max")`. +// /// +// /// # One-sided Operation +// /// The calling PE is responsible for launching `Max` active messages on the other PEs associated with the array. 
+// /// the returned max reduction result is only available on the calling PE +// /// +// /// # Examples +// /// ``` +// /// use lamellar::array::prelude::*; +// /// let world = LamellarWorldBuilder::new().build(); +// /// let num_pes = world.num_pes(); +// /// let array = AtomicArray::::new(&world,10,Distribution::Block); +// /// let req = array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2)); +// /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE +// /// let max = array.block_on(array.max()); +// /// assert_eq!((array.len()-1)*2,max); +// ///``` +// fn max(&self) -> Self::Handle; + +// #[doc(alias("One-sided", "onesided"))] +// /// Find the min element in the entire destributed array, returning to the calling PE +// /// +// /// This equivalent to `reduce("min")`. +// /// +// /// # One-sided Operation +// /// The calling PE is responsible for launching `Min` active messages on the other PEs associated with the array. +// /// the returned min reduction result is only available on the calling PE +// /// +// /// # Examples +// /// ``` +// /// use lamellar::array::prelude::*; +// /// let world = LamellarWorldBuilder::new().build(); +// /// let num_pes = world.num_pes(); +// /// let array = AtomicArray::::new(&world,10,Distribution::Block); +// /// let req = array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2)); +// /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE +// /// let min = array.block_on(array.min()); +// /// assert_eq!(0,min); +// ///``` +// fn min(&self) -> Self::Handle; +// } /// This procedural macro is used to enable the execution of user defined reductions on LamellarArrays. /// diff --git a/src/array/atomic.rs b/src/array/atomic.rs index 3d1f863f..f7734768 100644 --- a/src/array/atomic.rs +++ b/src/array/atomic.rs @@ -12,7 +12,7 @@ use crate::memregion::Dist; use std::any::TypeId; use std::collections::HashSet; // use std::sync::atomic::Ordering; -// use std::sync::Arc; +use std::sync::Arc; lazy_static! { pub(crate) static ref NATIVE_ATOMICS: HashSet = { @@ -499,18 +499,6 @@ impl std::fmt::Debug for AtomicElement { } } -impl std::iter::Sum for AtomicElement { - fn sum(iter: I) -> Self - where - I: Iterator, - { - LocalGenericAtomicElement { - val: Mutex::new(iter.map(|e| e.load()).sum()), - } - .into() - } -} - ///A safe abstraction of a distributed array, providing read/write access protect by atomic elements /// /// If the type of the Array is an integer type (U8, usize, i32, i16, etc.) the array will use the appropriate Atomic* type underneath. 
@@ -1084,7 +1072,7 @@ impl TeamFrom<(Vec, Distribution)> for AtomicArray { } } -#[async_trait] +// #[async_trait] impl AsyncTeamFrom<(Vec, Distribution)> for AtomicArray { async fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { let array: UnsafeArray = AsyncTeamInto::team_into(input, team).await; @@ -1179,8 +1167,8 @@ impl From for AtomicArray { } } -impl LamellarArrayReduce for AtomicArray { - fn reduce(&self, reduction: &str) -> Pin + Send>> { +impl AtomicArray { + pub fn reduce(&self, reduction: &str) -> AmHandle { match self { AtomicArray::NativeAtomicArray(array) => array.reduce(reduction), AtomicArray::GenericAtomicArray(array) => array.reduce(reduction), @@ -1188,32 +1176,28 @@ impl LamellarArrayReduce for AtomicArray { } } -impl LamellarArrayArithmeticReduce - for AtomicArray -{ - fn sum(&self) -> Pin + Send>> { +impl AtomicArray { + pub fn sum(&self) -> AmHandle { match self { AtomicArray::NativeAtomicArray(array) => array.sum(), AtomicArray::GenericAtomicArray(array) => array.sum(), } } - fn prod(&self) -> Pin + Send>> { + pub fn prod(&self) -> AmHandle { match self { AtomicArray::NativeAtomicArray(array) => array.prod(), AtomicArray::GenericAtomicArray(array) => array.prod(), } } } -impl LamellarArrayCompareReduce - for AtomicArray -{ - fn max(&self) -> Pin + Send>> { +impl AtomicArray { + pub fn max(&self) -> AmHandle { match self { AtomicArray::NativeAtomicArray(array) => array.max(), AtomicArray::GenericAtomicArray(array) => array.max(), } } - fn min(&self) -> Pin + Send>> { + pub fn min(&self) -> AmHandle { match self { AtomicArray::NativeAtomicArray(array) => array.min(), AtomicArray::GenericAtomicArray(array) => array.min(), diff --git a/src/array/atomic/iteration.rs b/src/array/atomic/iteration.rs index bdcc014f..222b5813 100644 --- a/src/array/atomic/iteration.rs +++ b/src/array/atomic/iteration.rs @@ -1,9 +1,6 @@ use crate::array::atomic::*; - -use crate::array::iterator::distributed_iterator::{ - DistIteratorLauncher, DistributedIterator, IndexedDistributedIterator, -}; -use crate::array::iterator::local_iterator::{IndexedLocalIterator, LocalIterator}; +use crate::array::iterator::distributed_iterator::*; +use crate::array::iterator::local_iterator::*; use crate::array::iterator::one_sided_iterator::OneSidedIter; use crate::array::iterator::{private::*, LamellarArrayIterators, LamellarArrayMutIterators}; use crate::array::*; @@ -213,156 +210,3 @@ impl LamellarArrayMutIterators for AtomicArray { AtomicLocalIter::new(self.clone(), 0, 0) } } - -// impl DistIteratorLauncher for AtomicArray { -// fn global_index_from_local(&self, index: usize, chunk_size: usize) -> Option { -// self.data.global_index_from_local(index, chunk_size) -// } - -// fn subarray_index_from_local(&self, index: usize, chunk_size: usize) -> Option { -// self.data.subarray_index_from_local(index, chunk_size) -// } - -// fn for_each(&self, iter: &I, op: F) -> Pin + Send>> -// where -// I: DistributedIterator + 'static, -// F: Fn(I::Item) + SyncSend + Clone + 'static, -// { -// self.data.for_each(iter, op) -// } -// fn for_each_with_schedule( -// &self, -// sched: Schedule, -// iter: &I, -// op: F, -// ) -> Pin + Send>> -// where -// I: DistributedIterator + 'static, -// F: Fn(I::Item) + SyncSend + Clone + 'static, -// { -// self.data.for_each_with_schedule(sched, iter, op) -// } -// fn for_each_async(&self, iter: &I, op: F) -> Pin + Send>> -// where -// I: DistributedIterator + 'static, -// F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, -// Fut: Future + Send + 'static, -// { -// 
self.data.for_each_async(iter, op) -// } -// fn for_each_async_with_schedule( -// &self, -// sched: Schedule, -// iter: &I, -// op: F, -// ) -> Pin + Send>> -// where -// I: DistributedIterator + 'static, -// F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, -// Fut: Future + Send + 'static, -// { -// self.data.for_each_async_with_schedule(sched, iter, op) -// } - -// fn collect(&self, iter: &I, d: Distribution) -> Pin + Send>> -// where -// I: DistributedIterator + 'static, -// I::Item: Dist + ArrayOps, -// A: for<'a> TeamFrom<(&'a Vec,Distribution)> + SyncSend + Clone + 'static, -// { -// self.data.collect(iter, d) -// } -// fn collect_async( -// &self, -// iter: &I, -// d: Distribution, -// ) -> Pin + Send>> -// where -// I: DistributedIterator + 'static, -// I::Item: Future + Send + 'static, -// B: Dist + ArrayOps, -// A: From> + SyncSend + Clone + 'static, -// { -// self.data.collect_async(iter, d) -// } -// fn team(&self) -> Pin> { -// self.data.team().clone() -// } -// } - -// impl LocalIteratorLauncher for AtomicArray { -// fn local_global_index_from_local(&self, index: usize, chunk_size: usize) -> Option { -// self.data.local_global_index_from_local(index, chunk_size) -// } - -// fn local_subarray_index_from_local(&self, index: usize, chunk_size: usize) -> Option { -// self.data.local_subarray_index_from_local(index, chunk_size) -// } - -// fn for_each(&self, iter: &I, op: F) -> Pin + Send>> -// where -// I: LocalIterator + 'static, -// F: Fn(I::Item) + SyncSend + Clone + 'static, -// { -// self.data.for_each(iter, op) -// } -// fn for_each_with_schedule( -// &self, -// sched: Schedule, -// iter: &I, -// op: F, -// ) -> Pin + Send>> -// where -// I: LocalIterator + 'static, -// F: Fn(I::Item) + SyncSend + Clone + 'static, -// { -// self.data.for_each_with_schedule(sched, iter, op) -// } -// fn for_each_async(&self, iter: &I, op: F) -> Pin + Send>> -// where -// I: LocalIterator + 'static, -// F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, -// Fut: Future + Send + 'static, -// { -// self.data.for_each_async(iter, op) -// } -// fn for_each_async_with_schedule( -// &self, -// sched: Schedule, -// iter: &I, -// op: F, -// ) -> Pin + Send>> -// where -// I: LocalIterator + 'static, -// F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, -// Fut: Future + Send + 'static, -// { -// self.data.for_each_async_with_schedule(sched, iter, op) -// } - -// // fn collect(&self, iter: &I, d: Distribution) -> Pin + Send>> -// // where -// // I: LocalIterator + 'static, -// // I::Item: Dist + ArrayOps, -// // A: for<'a> TeamFrom<(&'a Vec,Distribution)> + SyncSend + Clone + 'static, -// // { -// // self.data.collect(iter, d) -// // } -// // fn collect_async( -// // &self, -// // iter: &I, -// // d: Distribution, -// // ) -> Pin + Send>> -// // where -// // I: LocalIterator + 'static, -// // I::Item: Future + Send + 'static, -// // B: Dist + ArrayOps, -// // A: From> + SyncSend + Clone + 'static, -// // { -// // self.data.collect_async(iter, d) -// // } - -// fn team(&self) -> Pin> { -// self.data.team().clone() -// } -// } diff --git a/src/array/atomic/rdma.rs b/src/array/atomic/rdma.rs index 8744224b..b4a0ff81 100644 --- a/src/array/atomic/rdma.rs +++ b/src/array/atomic/rdma.rs @@ -45,13 +45,13 @@ impl LamellarArrayGet for AtomicArray { &self, index: usize, buf: U, - ) -> Pin + Send>> { + ) -> ArrayRdmaHandle { match self { AtomicArray::NativeAtomicArray(array) => array.get(index, buf), AtomicArray::GenericAtomicArray(array) => array.get(index, buf), } } - fn at(&self, index: usize) -> Pin + 
Send>> { + fn at(&self, index: usize) -> ArrayRdmaAtHandle { match self { AtomicArray::NativeAtomicArray(array) => array.at(index), AtomicArray::GenericAtomicArray(array) => array.at(index), @@ -64,7 +64,7 @@ impl LamellarArrayPut for AtomicArray { &self, index: usize, buf: U, - ) -> Pin + Send>> { + ) -> ArrayRdmaHandle { match self { AtomicArray::NativeAtomicArray(array) => array.put(index, buf), AtomicArray::GenericAtomicArray(array) => array.put(index, buf), diff --git a/src/array/generic_atomic.rs b/src/array/generic_atomic.rs index a5a7c099..d2ffbef4 100644 --- a/src/array/generic_atomic.rs +++ b/src/array/generic_atomic.rs @@ -567,7 +567,7 @@ impl GenericAtomicArray { self.locks[index].lock() } - pub fn async_barrier(&self) -> impl std::future::Future + Send + '_ { + pub fn async_barrier(&self) -> impl std::future::Future + Send + '_ { self.array.async_barrier() } } @@ -589,7 +589,7 @@ impl TeamFrom<(Vec, Distribution)> for GenericAtomicArray } } -#[async_trait] +// #[async_trait] impl AsyncTeamFrom<(Vec, Distribution)> for GenericAtomicArray { async fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { let array: UnsafeArray = AsyncTeamInto::team_into(input, team).await; @@ -824,30 +824,24 @@ impl ArrayPrint for GenericAtomicArray { } } -impl LamellarArrayReduce for GenericAtomicArray { - fn reduce(&self, op: &str) -> Pin + Send>> { - self.array - .reduce_data(op, self.clone().into()) - .into_future() +impl GenericAtomicArray { + pub fn reduce(&self, op: &str) -> AmHandle { + self.array.reduce_data(op, self.clone().into()) } } -impl LamellarArrayArithmeticReduce - for GenericAtomicArray -{ - fn sum(&self) -> Pin + Send>> { +impl GenericAtomicArray { + pub fn sum(&self) -> AmHandle { self.reduce("sum") } - fn prod(&self) -> Pin + Send>> { + pub fn prod(&self) -> AmHandle { self.reduce("prod") } } -impl LamellarArrayCompareReduce - for GenericAtomicArray -{ - fn max(&self) -> Pin + Send>> { +impl GenericAtomicArray { + pub fn max(&self) -> AmHandle { self.reduce("max") } - fn min(&self) -> Pin + Send>> { + pub fn min(&self) -> AmHandle { self.reduce("min") } } diff --git a/src/array/generic_atomic/iteration.rs b/src/array/generic_atomic/iteration.rs index 20563370..ded8bf2d 100644 --- a/src/array/generic_atomic/iteration.rs +++ b/src/array/generic_atomic/iteration.rs @@ -1,8 +1,6 @@ use crate::array::generic_atomic::*; -use crate::array::iterator::distributed_iterator::{ - DistIteratorLauncher, DistributedIterator, IndexedDistributedIterator, -}; -use crate::array::iterator::local_iterator::{LocalIterator, LocalIteratorLauncher}; +use crate::array::iterator::distributed_iterator::*; +use crate::array::iterator::local_iterator::*; use crate::array::iterator::one_sided_iterator::OneSidedIter; use crate::array::iterator::{ private::*, LamellarArrayIterators, LamellarArrayMutIterators, Schedule, @@ -225,7 +223,7 @@ impl DistIteratorLauncher for GenericAtomicArray { // self.array.subarray_pe_and_offset_for_global_index(index, chunk_size) // } - fn for_each(&self, iter: &I, op: F) -> Pin + Send>> + fn for_each(&self, iter: &I, op: F) -> DistIterForEachHandle where I: DistributedIterator + 'static, F: Fn(I::Item) + SyncSend + Clone + 'static, @@ -237,14 +235,14 @@ impl DistIteratorLauncher for GenericAtomicArray { sched: Schedule, iter: &I, op: F, - ) -> Pin + Send>> + ) -> DistIterForEachHandle where I: DistributedIterator + 'static, F: Fn(I::Item) + SyncSend + Clone + 'static, { DistIteratorLauncher::for_each_with_schedule(&self.array, sched, iter, op) } - fn 
for_each_async(&self, iter: &I, op: F) -> Pin + Send>> + fn for_each_async(&self, iter: &I, op: F) -> DistIterForEachHandle where I: DistributedIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, @@ -257,7 +255,7 @@ impl DistIteratorLauncher for GenericAtomicArray { sched: Schedule, iter: &I, op: F, - ) -> Pin + Send>> + ) -> DistIterForEachHandle where I: DistributedIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, @@ -266,7 +264,7 @@ impl DistIteratorLauncher for GenericAtomicArray { DistIteratorLauncher::for_each_async_with_schedule(&self.array, sched, iter, op) } - fn reduce(&self, iter: &I, op: F) -> Pin> + Send>> + fn reduce(&self, iter: &I, op: F) -> DistIterReduceHandle where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, @@ -280,7 +278,7 @@ impl DistIteratorLauncher for GenericAtomicArray { sched: Schedule, iter: &I, op: F, - ) -> Pin> + Send>> + ) -> DistIterReduceHandle where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, @@ -289,7 +287,7 @@ impl DistIteratorLauncher for GenericAtomicArray { DistIteratorLauncher::reduce_with_schedule(&self.array, sched, iter, op) } - fn collect(&self, iter: &I, d: Distribution) -> Pin + Send>> + fn collect(&self, iter: &I, d: Distribution) -> DistIterCollectHandle where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, @@ -303,7 +301,7 @@ impl DistIteratorLauncher for GenericAtomicArray { sched: Schedule, iter: &I, d: Distribution, - ) -> Pin + Send>> + ) -> DistIterCollectHandle where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, @@ -311,11 +309,7 @@ impl DistIteratorLauncher for GenericAtomicArray { { DistIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) } - fn collect_async( - &self, - iter: &I, - d: Distribution, - ) -> Pin + Send>> + fn collect_async(&self, iter: &I, d: Distribution) -> DistIterCollectHandle where I: DistributedIterator, I::Item: Future + Send + 'static, @@ -330,7 +324,7 @@ impl DistIteratorLauncher for GenericAtomicArray { sched: Schedule, iter: &I, d: Distribution, - ) -> Pin + Send>> + ) -> DistIterCollectHandle where I: DistributedIterator, I::Item: Future + Send + 'static, @@ -340,25 +334,21 @@ impl DistIteratorLauncher for GenericAtomicArray { DistIteratorLauncher::collect_async_with_schedule(&self.array, sched, iter, d) } - fn count(&self, iter: &I) -> Pin + Send>> + fn count(&self, iter: &I) -> DistIterCountHandle where I: DistributedIterator + 'static, { DistIteratorLauncher::count(&self.array, iter) } - fn count_with_schedule( - &self, - sched: Schedule, - iter: &I, - ) -> Pin + Send>> + fn count_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterCountHandle where I: DistributedIterator + 'static, { DistIteratorLauncher::count_with_schedule(&self.array, sched, iter) } - fn sum(&self, iter: &I) -> Pin + Send>> + fn sum(&self, iter: &I) -> DistIterSumHandle where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps + std::iter::Sum, @@ -366,11 +356,7 @@ impl DistIteratorLauncher for GenericAtomicArray { DistIteratorLauncher::sum(&self.array, iter) } - fn sum_with_schedule( - &self, - sched: Schedule, - iter: &I, - ) -> Pin + Send>> + fn sum_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterSumHandle where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps + std::iter::Sum, @@ -393,7 +379,7 @@ impl LocalIteratorLauncher for GenericAtomicArray { .local_subarray_index_from_local(index, chunk_size) } - fn for_each(&self, iter: &I, op: F) -> Pin + Send>> + fn for_each(&self, iter: 
&I, op: F) -> LocalIterForEachHandle where I: LocalIterator + 'static, F: Fn(I::Item) + SyncSend + Clone + 'static, @@ -405,14 +391,14 @@ impl LocalIteratorLauncher for GenericAtomicArray { sched: Schedule, iter: &I, op: F, - ) -> Pin + Send>> + ) -> LocalIterForEachHandle where I: LocalIterator + 'static, F: Fn(I::Item) + SyncSend + Clone + 'static, { LocalIteratorLauncher::for_each_with_schedule(&self.array, sched, iter, op) } - fn for_each_async(&self, iter: &I, op: F) -> Pin + Send>> + fn for_each_async(&self, iter: &I, op: F) -> LocalIterForEachHandle where I: LocalIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, @@ -425,7 +411,7 @@ impl LocalIteratorLauncher for GenericAtomicArray { sched: Schedule, iter: &I, op: F, - ) -> Pin + Send>> + ) -> LocalIterForEachHandle where I: LocalIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, @@ -434,7 +420,7 @@ impl LocalIteratorLauncher for GenericAtomicArray { LocalIteratorLauncher::for_each_async_with_schedule(&self.array, sched, iter, op) } - fn reduce(&self, iter: &I, op: F) -> Pin> + Send>> + fn reduce(&self, iter: &I, op: F) -> LocalIterReduceHandle where I: LocalIterator + 'static, I::Item: SyncSend, @@ -448,7 +434,7 @@ impl LocalIteratorLauncher for GenericAtomicArray { sched: Schedule, iter: &I, op: F, - ) -> Pin> + Send>> + ) -> LocalIterReduceHandle where I: LocalIterator + 'static, I::Item: SyncSend, @@ -477,7 +463,7 @@ impl LocalIteratorLauncher for GenericAtomicArray { // self.array.reduce_async_with_schedule(sched, iter, op) // } - fn collect(&self, iter: &I, d: Distribution) -> Pin + Send>> + fn collect(&self, iter: &I, d: Distribution) -> LocalIterCollectHandle where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, @@ -491,7 +477,7 @@ impl LocalIteratorLauncher for GenericAtomicArray { sched: Schedule, iter: &I, d: Distribution, - ) -> Pin + Send>> + ) -> LocalIterCollectHandle where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, @@ -529,25 +515,21 @@ impl LocalIteratorLauncher for GenericAtomicArray { // self.array.collect_async_with_schedule(sched, iter, d) // } - fn count(&self, iter: &I) -> Pin + Send>> + fn count(&self, iter: &I) -> LocalIterCountHandle where I: LocalIterator + 'static, { LocalIteratorLauncher::count(&self.array, iter) } - fn count_with_schedule( - &self, - sched: Schedule, - iter: &I, - ) -> Pin + Send>> + fn count_with_schedule(&self, sched: Schedule, iter: &I) -> LocalIterCountHandle where I: LocalIterator + 'static, { LocalIteratorLauncher::count_with_schedule(&self.array, sched, iter) } - fn sum(&self, iter: &I) -> Pin + Send>> + fn sum(&self, iter: &I) -> LocalIterSumHandle where I: LocalIterator + 'static, I::Item: SyncSend + std::iter::Sum, @@ -555,11 +537,7 @@ impl LocalIteratorLauncher for GenericAtomicArray { LocalIteratorLauncher::sum(&self.array, iter) } - fn sum_with_schedule( - &self, - sched: Schedule, - iter: &I, - ) -> Pin + Send>> + fn sum_with_schedule(&self, sched: Schedule, iter: &I) -> LocalIterSumHandle where I: LocalIterator + 'static, I::Item: SyncSend + std::iter::Sum, diff --git a/src/array/generic_atomic/rdma.rs b/src/array/generic_atomic/rdma.rs index 0f11fa99..842db995 100644 --- a/src/array/generic_atomic/rdma.rs +++ b/src/array/generic_atomic/rdma.rs @@ -1,3 +1,5 @@ +use std::collections::VecDeque; + use crate::array::generic_atomic::*; use crate::array::private::ArrayExecAm; use crate::array::LamellarWrite; @@ -9,25 +11,27 @@ impl LamellarArrayInternalGet for GenericAtomicArray { &self, index: usize, buf: U, - ) -> 
Box> { + ) -> ArrayRdmaHandle { let req = self.exec_am_local(InitGetAm { array: self.clone(), index: index, buf: buf.into(), }); - Box::new(ArrayRdmaHandle { reqs: vec![req] }) + ArrayRdmaHandle { + reqs: VecDeque::from([req.into()]), + } } - unsafe fn internal_at(&self, index: usize) -> Box> { + unsafe fn internal_at(&self, index: usize) -> ArrayRdmaAtHandle { let buf: OneSidedMemoryRegion = self.array.team_rt().alloc_one_sided_mem_region(1); let req = self.exec_am_local(InitGetAm { array: self.clone(), index: index, buf: buf.clone().into(), }); - Box::new(ArrayRdmaAtHandle { - reqs: vec![req], + ArrayRdmaAtHandle { + req: Some(req), buf: buf, - }) + } } } @@ -36,14 +40,16 @@ impl LamellarArrayGet for GenericAtomicArray { &self, index: usize, buf: U, - ) -> Pin + Send>> { + ) -> ArrayRdmaHandle { match buf.team_try_into(&self.array.team_rt()) { - Ok(buf) => self.internal_get(index, buf).into_future(), - Err(_) => Box::pin(async move { () }), + Ok(buf) => self.internal_get(index, buf), + Err(_) => ArrayRdmaHandle { + reqs: VecDeque::new(), + }, } } - fn at(&self, index: usize) -> Pin + Send>> { - unsafe { self.internal_at(index).into_future() } + fn at(&self, index: usize) -> ArrayRdmaAtHandle { + unsafe { self.internal_at(index) } } } @@ -52,13 +58,15 @@ impl LamellarArrayInternalPut for GenericAtomicArray { &self, index: usize, buf: U, - ) -> Box> { + ) -> ArrayRdmaHandle { let req = self.exec_am_local(InitPutAm { array: self.clone(), index: index, buf: buf.into(), }); - Box::new(ArrayRdmaHandle { reqs: vec![req] }) + ArrayRdmaHandle { + reqs: VecDeque::from([req.into()]), + } } } @@ -67,10 +75,12 @@ impl LamellarArrayPut for GenericAtomicArray { &self, index: usize, buf: U, - ) -> Pin + Send>> { + ) -> ArrayRdmaHandle { match buf.team_try_into(&self.array.team_rt()) { - Ok(buf) => self.internal_put(index, buf).into_future(), - Err(_) => Box::pin(async move { () }), + Ok(buf) => self.internal_put(index, buf), + Err(_) => ArrayRdmaHandle { + reqs: VecDeque::new(), + }, } } } @@ -102,7 +112,7 @@ impl LamellarAm for InitGetAm { start_index: self.index, len: self.buf.len(), }; - reqs.push(self.array.exec_am_pe(pe, remote_am).into_future()); + reqs.push(self.array.exec_am_pe(pe, remote_am)); } unsafe { match self.array.array.inner.distribution { @@ -233,7 +243,7 @@ impl LamellarAm for InitPutAm { [cur_index..(cur_index + u8_buf_len)] .to_vec(), }; - reqs.push(self.array.exec_am_pe(pe, remote_am).into_future()); + reqs.push(self.array.exec_am_pe(pe, remote_am)); cur_index += u8_buf_len; } else { panic!("this should not be possible"); @@ -286,7 +296,7 @@ impl LamellarAm for InitPutAm { len: self.buf.len(), data: vec, }; - reqs.push(self.array.exec_am_pe(pe, remote_am).into_future()); + reqs.push(self.array.exec_am_pe(pe, remote_am)); } } } diff --git a/src/array/global_lock_atomic.rs b/src/array/global_lock_atomic.rs index e9cc9662..8aa7b58a 100644 --- a/src/array/global_lock_atomic.rs +++ b/src/array/global_lock_atomic.rs @@ -8,9 +8,14 @@ use crate::darc::global_rw_darc::{ GlobalRwDarc, GlobalRwDarcCollectiveWriteGuard, GlobalRwDarcReadGuard, GlobalRwDarcWriteGuard, }; use crate::darc::DarcMode; +use crate::lamellar_request::LamellarRequest; use crate::lamellar_team::{IntoLamellarTeam, LamellarTeamRT}; use crate::memregion::Dist; + +use pin_project::pin_project; + use std::ops::{Deref, DerefMut}; +use std::task::{Context, Poll, Waker}; /// A safe abstraction of a distributed array, providing read/write access protected by locks. 
/// @@ -72,7 +77,9 @@ impl GlobalLockByteArrayWeak { #[derive(Debug)] pub struct GlobalLockMutLocalData { pub(crate) array: GlobalLockArray, - _lock_guard: GlobalRwDarcWriteGuard<()>, + start_index: usize, + end_index: usize, + lock_guard: GlobalRwDarcWriteGuard<()>, } // impl Drop for GlobalLockMutLocalData{ @@ -84,12 +91,12 @@ pub struct GlobalLockMutLocalData { impl Deref for GlobalLockMutLocalData { type Target = [T]; fn deref(&self) -> &Self::Target { - unsafe { self.array.array.local_as_mut_slice() } + unsafe { &self.array.array.local_as_mut_slice()[self.start_index..self.end_index] } } } impl DerefMut for GlobalLockMutLocalData { fn deref_mut(&mut self) -> &mut Self::Target { - unsafe { self.array.array.local_as_mut_slice() } + unsafe { &mut self.array.array.local_as_mut_slice()[self.start_index..self.end_index] } } } @@ -104,7 +111,9 @@ impl DerefMut for GlobalLockMutLocalData { #[derive(Debug)] pub struct GlobalLockCollectiveMutLocalData { pub(crate) array: GlobalLockArray, - _lock_guard: GlobalRwDarcCollectiveWriteGuard<()>, + start_index: usize, + end_index: usize, + lock_guard: GlobalRwDarcCollectiveWriteGuard<()>, } // impl Drop for GlobalLockCollectiveMutLocalData{ @@ -116,12 +125,12 @@ pub struct GlobalLockCollectiveMutLocalData { impl Deref for GlobalLockCollectiveMutLocalData { type Target = [T]; fn deref(&self) -> &Self::Target { - unsafe { self.array.array.local_as_mut_slice() } + unsafe { &self.array.array.local_as_mut_slice()[self.start_index..self.end_index] } } } impl DerefMut for GlobalLockCollectiveMutLocalData { fn deref_mut(&mut self) -> &mut Self::Target { - unsafe { self.array.array.local_as_mut_slice() } + unsafe { &mut self.array.array.local_as_mut_slice()[self.start_index..self.end_index] } } } @@ -135,7 +144,9 @@ impl DerefMut for GlobalLockCollectiveMutLocalData { /// When the instance is dropped the lock is released. pub struct GlobalLockLocalData { pub(crate) array: GlobalLockArray, - lock: GlobalRwDarc<()>, + // lock: GlobalRwDarc<()>, + start_index: usize, + end_index: usize, lock_guard: GlobalRwDarcReadGuard<()>, } @@ -149,12 +160,30 @@ impl Clone for GlobalLockLocalData { fn clone(&self) -> Self { GlobalLockLocalData { array: self.array.clone(), - lock: self.lock.clone(), + start_index: self.start_index, + end_index: self.end_index, + // lock: self.lock.clone(), lock_guard: self.lock_guard.clone(), } } } +impl Deref for GlobalLockLocalData { + type Target = [T]; + fn deref(&self) -> &Self::Target { + unsafe { &self.array.array.local_as_slice()[self.start_index..self.end_index] } + } +} + +// impl Drop for GlobalLockLocalData { +// fn drop(&mut self) { +// println!( +// "release GlobalLockLocalData lock! 
{:?} ", +// std::thread::current().id(), +// ); +// } +// } + impl GlobalLockLocalData { /// Convert into a smaller sub range of the local data, the original read lock is transfered to the new sub data to mainitain safety guarantees /// @@ -173,8 +202,10 @@ impl GlobalLockLocalData { ///``` pub fn into_sub_data(self, start: usize, end: usize) -> GlobalLockLocalData { GlobalLockLocalData { - array: self.array.sub_array(start..end), - lock: self.lock, + array: self.array.clone(), + start_index: start, + end_index: end, + // lock: self.lock, lock_guard: self.lock_guard, } } @@ -185,7 +216,8 @@ impl serde::Serialize for GlobalLockLocalData { where S: serde::Serializer, { - unsafe { self.array.array.local_as_mut_slice() }.serialize(serializer) + unsafe { &self.array.array.local_as_mut_slice()[self.start_index..self.end_index] } + .serialize(serializer) } } @@ -211,17 +243,54 @@ impl<'a, T: Dist> IntoIterator for &'a GlobalLockLocalData { type IntoIter = GlobalLockLocalDataIter<'a, T>; fn into_iter(self) -> Self::IntoIter { GlobalLockLocalDataIter { - data: unsafe { self.array.array.local_as_mut_slice() }, + data: unsafe { + &self.array.array.local_as_mut_slice()[self.start_index..self.end_index] + }, index: 0, } } } -impl Deref for GlobalLockLocalData { - type Target = [T]; +#[derive(Clone)] +pub struct GlobalLockReadGuard { + pub(crate) array: GlobalLockArray, + lock_guard: GlobalRwDarcReadGuard<()>, +} - fn deref(&self) -> &Self::Target { - unsafe { self.array.array.local_as_mut_slice() } +impl GlobalLockReadGuard { + pub fn local_data(&self) -> GlobalLockLocalData { + GlobalLockLocalData { + array: self.array.clone(), + start_index: 0, + end_index: self.array.num_elems_local(), + // lock: self.lock.clone(), + lock_guard: self.lock_guard.clone(), + } + } +} + +pub struct GlobalLockWriteGuard { + pub(crate) array: GlobalLockArray, + lock_guard: GlobalRwDarcWriteGuard<()>, +} + +impl From> for GlobalLockWriteGuard { + fn from(data: GlobalLockMutLocalData) -> Self { + GlobalLockWriteGuard { + array: data.array, + lock_guard: data.lock_guard, + } + } +} + +impl GlobalLockWriteGuard { + pub fn local_data(self) -> GlobalLockMutLocalData { + GlobalLockMutLocalData { + array: self.array.clone(), + start_index: 0, + end_index: self.array.num_elems_local(), + lock_guard: self.lock_guard, + } } } @@ -276,6 +345,40 @@ impl GlobalLockArray { } } + pub fn blocking_read_lock(&self) -> GlobalLockReadGuard { + let self_clone: GlobalLockArray = self.clone(); + self.block_on(async move { + GlobalLockReadGuard { + array: self_clone.clone(), + lock_guard: self_clone.lock.read().await, + } + }) + } + + pub async fn read_lock(&self) -> GlobalLockReadGuard { + GlobalLockReadGuard { + array: self.clone(), + lock_guard: self.lock.read().await, + } + } + + pub fn blocking_write_lock(&self) -> GlobalLockWriteGuard { + let self_clone: GlobalLockArray = self.clone(); + self.block_on(async move { + GlobalLockWriteGuard { + array: self_clone.clone(), + lock_guard: self_clone.lock.write().await, + } + }) + } + + pub async fn write_lock(&self) -> GlobalLockWriteGuard { + GlobalLockWriteGuard { + array: self.clone(), + lock_guard: self.lock.write().await, + } + } + #[doc(alias("One-sided", "onesided"))] /// Return the calling PE's local data as a [GlobalLockLocalData], which allows safe immutable access to local elements. 
/// @@ -302,7 +405,9 @@ impl GlobalLockArray { self.block_on(async move { GlobalLockLocalData { array: self_clone.clone(), - lock: self_clone.lock.clone(), + start_index: 0, + end_index: self_clone.array.num_elems_local(), + // lock: self_clone.lock.clone(), lock_guard: self_clone.lock.read().await, } }) @@ -333,7 +438,9 @@ impl GlobalLockArray { pub async fn read_local_data(&self) -> GlobalLockLocalData { GlobalLockLocalData { array: self.clone(), - lock: self.lock.clone(), + start_index: 0, + end_index: self.array.num_elems_local(), + // lock: self.lock.clone(), lock_guard: self.lock.read().await, } } @@ -364,8 +471,10 @@ impl GlobalLockArray { self.block_on(async move { let lock = self_clone.lock.write().await; let data = GlobalLockMutLocalData { - array: self_clone, - _lock_guard: lock, + array: self_clone.clone(), + start_index: 0, + end_index: self_clone.array.num_elems_local(), + lock_guard: lock, }; // println!("got lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); data @@ -398,7 +507,9 @@ impl GlobalLockArray { let lock = self.lock.write().await; let data = GlobalLockMutLocalData { array: self.clone(), - _lock_guard: lock, + start_index: 0, + end_index: self.array.num_elems_local(), + lock_guard: lock, }; // println!("got lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); data @@ -428,8 +539,10 @@ impl GlobalLockArray { self.block_on(async move { let lock = self_clone.lock.collective_write().await; let data = GlobalLockCollectiveMutLocalData { - array: self_clone, - _lock_guard: lock, + array: self_clone.clone(), + start_index: 0, + end_index: self_clone.array.num_elems_local(), + lock_guard: lock, }; // println!("got lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); data @@ -462,7 +575,9 @@ impl GlobalLockArray { let lock = self.lock.collective_write().await; let data = GlobalLockCollectiveMutLocalData { array: self.clone(), - _lock_guard: lock, + start_index: 0, + end_index: self.array.num_elems_local(), + lock_guard: lock, }; // println!("got lock! 
{:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); data @@ -608,7 +723,7 @@ impl GlobalLockArray { /// read_only_array.print(); /// println!("{slice:?}"); ///``` - pub fn into_local_lock(self) -> LocalLockArray { + pub fn into_local_lock(self) -> GlobalLockArray { // println!("GlobalLock into_read_only"); self.array.into() } @@ -673,7 +788,7 @@ impl TeamFrom<(Vec, Distribution)> for GlobalLockArray } } -#[async_trait] +// #[async_trait] impl AsyncTeamFrom<(Vec, Distribution)> for GlobalLockArray { async fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { let array: UnsafeArray = AsyncTeamInto::team_into(input, team).await; @@ -912,56 +1027,56 @@ impl ArrayPrint for GlobalLockArray { } #[doc(hidden)] +#[pin_project] pub struct GlobalLockArrayReduceHandle { - req: Box>, - _lock_guard: GlobalRwDarcReadGuard<()>, + req: AmHandle, + lock_guard: GlobalRwDarcReadGuard<()>, } -#[async_trait] impl LamellarRequest for GlobalLockArrayReduceHandle { - type Output = T; - async fn into_future(mut self: Box) -> Self::Output { - self.req.into_future().await + fn blocking_wait(self) -> Self::Output { + self.req.blocking_wait() } - fn get(&self) -> Self::Output { - self.req.get() + fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { + self.req.ready_or_set_waker(waker) } - fn ready(&self) -> bool { - self.req.ready() + fn val(&self) -> Self::Output { + self.req.val() } - fn set_waker(&mut self, waker: futures::task::Waker) { - self.req.set_waker(waker) +} + +impl Future for GlobalLockArrayReduceHandle { + type Output = T; + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let this = self.project(); + match this.req.ready_or_set_waker(cx.waker()) { + true => Poll::Ready(this.req.val()), + false => Poll::Pending, + } } } -impl LamellarArrayReduce for GlobalLockArray { - fn reduce(&self, op: &str) -> Pin + Send>> { - let lock: GlobalRwDarc<()> = self.lock.clone(); - let lock = self.array.block_on(async move { lock.read().await }); - Box::new(GlobalLockArrayReduceHandle { - req: self.array.reduce_data(op, self.clone().into()), - _lock_guard: lock, - }) - .into_future() +impl GlobalLockReadGuard { + pub fn reduce(self, op: &str) -> GlobalLockArrayReduceHandle { + GlobalLockArrayReduceHandle { + req: self.array.array.reduce_data(op, self.array.clone().into()), + lock_guard: self.lock_guard.clone(), + } } } -impl LamellarArrayArithmeticReduce - for GlobalLockArray -{ - fn sum(&self) -> Pin + Send>> { +impl GlobalLockReadGuard { + pub fn sum(self) -> GlobalLockArrayReduceHandle { self.reduce("sum") } - fn prod(&self) -> Pin + Send>> { + pub fn prod(self) -> GlobalLockArrayReduceHandle { self.reduce("prod") } } -impl LamellarArrayCompareReduce - for GlobalLockArray -{ - fn max(&self) -> Pin + Send>> { +impl GlobalLockReadGuard { + pub fn max(self) -> GlobalLockArrayReduceHandle { self.reduce("max") } - fn min(&self) -> Pin + Send>> { + pub fn min(self) -> GlobalLockArrayReduceHandle { self.reduce("min") } } diff --git a/src/array/global_lock_atomic/iteration.rs b/src/array/global_lock_atomic/iteration.rs index b6adbd47..81354f24 100644 --- a/src/array/global_lock_atomic/iteration.rs +++ b/src/array/global_lock_atomic/iteration.rs @@ -1,10 +1,6 @@ use crate::array::global_lock_atomic::*; -use crate::array::iterator::distributed_iterator::{ - DistIteratorLauncher, DistributedIterator, IndexedDistributedIterator, -}; -use crate::array::iterator::local_iterator::{ - IndexedLocalIterator, LocalIterator, LocalIteratorLauncher, -}; 
+use crate::array::iterator::distributed_iterator::*; +use crate::array::iterator::local_iterator::*; use crate::array::iterator::one_sided_iterator::OneSidedIter; use crate::array::iterator::{ private::*, LamellarArrayIterators, LamellarArrayMutIterators, Schedule, @@ -433,7 +429,7 @@ impl DistIteratorLauncher for GlobalLockArray { // self.array.subarray_pe_and_offset_for_global_index(index, chunk_size) // } - fn for_each(&self, iter: &I, op: F) -> Pin + Send>> + fn for_each(&self, iter: &I, op: F) -> DistIterForEachHandle where I: DistributedIterator + 'static, F: Fn(I::Item) + SyncSend + Clone + 'static, @@ -445,14 +441,14 @@ impl DistIteratorLauncher for GlobalLockArray { sched: Schedule, iter: &I, op: F, - ) -> Pin + Send>> + ) -> DistIterForEachHandle where I: DistributedIterator + 'static, F: Fn(I::Item) + SyncSend + Clone + 'static, { DistIteratorLauncher::for_each_with_schedule(&self.array, sched, iter, op) } - fn for_each_async(&self, iter: &I, op: F) -> Pin + Send>> + fn for_each_async(&self, iter: &I, op: F) -> DistIterForEachHandle where I: DistributedIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, @@ -465,7 +461,7 @@ impl DistIteratorLauncher for GlobalLockArray { sched: Schedule, iter: &I, op: F, - ) -> Pin + Send>> + ) -> DistIterForEachHandle where I: DistributedIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, @@ -474,7 +470,7 @@ impl DistIteratorLauncher for GlobalLockArray { DistIteratorLauncher::for_each_async_with_schedule(&self.array, sched, iter, op) } - fn reduce(&self, iter: &I, op: F) -> Pin> + Send>> + fn reduce(&self, iter: &I, op: F) -> DistIterReduceHandle where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, @@ -488,7 +484,7 @@ impl DistIteratorLauncher for GlobalLockArray { sched: Schedule, iter: &I, op: F, - ) -> Pin> + Send>> + ) -> DistIterReduceHandle where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, @@ -497,7 +493,7 @@ impl DistIteratorLauncher for GlobalLockArray { DistIteratorLauncher::reduce_with_schedule(&self.array, sched, iter, op) } - fn collect(&self, iter: &I, d: Distribution) -> Pin + Send>> + fn collect(&self, iter: &I, d: Distribution) -> DistIterCollectHandle where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, @@ -511,7 +507,7 @@ impl DistIteratorLauncher for GlobalLockArray { sched: Schedule, iter: &I, d: Distribution, - ) -> Pin + Send>> + ) -> DistIterCollectHandle where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, @@ -519,11 +515,7 @@ impl DistIteratorLauncher for GlobalLockArray { { DistIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) } - fn collect_async( - &self, - iter: &I, - d: Distribution, - ) -> Pin + Send>> + fn collect_async(&self, iter: &I, d: Distribution) -> DistIterCollectHandle where I: DistributedIterator, I::Item: Future + Send + 'static, @@ -538,7 +530,7 @@ impl DistIteratorLauncher for GlobalLockArray { sched: Schedule, iter: &I, d: Distribution, - ) -> Pin + Send>> + ) -> DistIterCollectHandle where I: DistributedIterator, I::Item: Future + Send + 'static, @@ -548,25 +540,21 @@ impl DistIteratorLauncher for GlobalLockArray { DistIteratorLauncher::collect_async_with_schedule(&self.array, sched, iter, d) } - fn count(&self, iter: &I) -> Pin + Send>> + fn count(&self, iter: &I) -> DistIterCountHandle where I: DistributedIterator + 'static, { DistIteratorLauncher::count(&self.array, iter) } - fn count_with_schedule( - &self, - sched: Schedule, - iter: &I, - ) -> Pin + Send>> + fn 
count_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterCountHandle where I: DistributedIterator + 'static, { DistIteratorLauncher::count_with_schedule(&self.array, sched, iter) } - fn sum(&self, iter: &I) -> Pin + Send>> + fn sum(&self, iter: &I) -> DistIterSumHandle where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps + std::iter::Sum, @@ -574,11 +562,7 @@ impl DistIteratorLauncher for GlobalLockArray { DistIteratorLauncher::sum(&self.array, iter) } - fn sum_with_schedule( - &self, - sched: Schedule, - iter: &I, - ) -> Pin + Send>> + fn sum_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterSumHandle where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps + std::iter::Sum, @@ -601,7 +585,7 @@ impl LocalIteratorLauncher for GlobalLockArray { .local_subarray_index_from_local(index, chunk_size) } - fn for_each(&self, iter: &I, op: F) -> Pin + Send>> + fn for_each(&self, iter: &I, op: F) -> LocalIterForEachHandle where I: LocalIterator + 'static, F: Fn(I::Item) + SyncSend + Clone + 'static, @@ -613,14 +597,14 @@ impl LocalIteratorLauncher for GlobalLockArray { sched: Schedule, iter: &I, op: F, - ) -> Pin + Send>> + ) -> LocalIterForEachHandle where I: LocalIterator + 'static, F: Fn(I::Item) + SyncSend + Clone + 'static, { LocalIteratorLauncher::for_each_with_schedule(&self.array, sched, iter, op) } - fn for_each_async(&self, iter: &I, op: F) -> Pin + Send>> + fn for_each_async(&self, iter: &I, op: F) -> LocalIterForEachHandle where I: LocalIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, @@ -633,7 +617,7 @@ impl LocalIteratorLauncher for GlobalLockArray { sched: Schedule, iter: &I, op: F, - ) -> Pin + Send>> + ) -> LocalIterForEachHandle where I: LocalIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, @@ -642,7 +626,7 @@ impl LocalIteratorLauncher for GlobalLockArray { LocalIteratorLauncher::for_each_async_with_schedule(&self.array, sched, iter, op) } - fn reduce(&self, iter: &I, op: F) -> Pin> + Send>> + fn reduce(&self, iter: &I, op: F) -> LocalIterReduceHandle where I: LocalIterator + 'static, I::Item: SyncSend, @@ -656,7 +640,7 @@ impl LocalIteratorLauncher for GlobalLockArray { sched: Schedule, iter: &I, op: F, - ) -> Pin> + Send>> + ) -> LocalIterReduceHandle where I: LocalIterator + 'static, I::Item: SyncSend, @@ -685,7 +669,7 @@ impl LocalIteratorLauncher for GlobalLockArray { // self.array.reduce_async_with_schedule(sched, iter, op) // } - fn collect(&self, iter: &I, d: Distribution) -> Pin + Send>> + fn collect(&self, iter: &I, d: Distribution) -> LocalIterCollectHandle where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, @@ -699,7 +683,7 @@ impl LocalIteratorLauncher for GlobalLockArray { sched: Schedule, iter: &I, d: Distribution, - ) -> Pin + Send>> + ) -> LocalIterCollectHandle where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, @@ -737,25 +721,21 @@ impl LocalIteratorLauncher for GlobalLockArray { // self.array.collect_async_with_schedule(sched, iter, d) // } - fn count(&self, iter: &I) -> Pin + Send>> + fn count(&self, iter: &I) -> LocalIterCountHandle where I: LocalIterator + 'static, { LocalIteratorLauncher::count(&self.array, iter) } - fn count_with_schedule( - &self, - sched: Schedule, - iter: &I, - ) -> Pin + Send>> + fn count_with_schedule(&self, sched: Schedule, iter: &I) -> LocalIterCountHandle where I: LocalIterator + 'static, { LocalIteratorLauncher::count_with_schedule(&self.array, sched, iter) } - fn sum(&self, iter: &I) -> Pin + Send>> + fn sum(&self, iter: &I) -> 
LocalIterSumHandle where I: LocalIterator + 'static, I::Item: SyncSend + std::iter::Sum, @@ -763,11 +743,7 @@ impl LocalIteratorLauncher for GlobalLockArray { LocalIteratorLauncher::sum(&self.array, iter) } - fn sum_with_schedule( - &self, - sched: Schedule, - iter: &I, - ) -> Pin + Send>> + fn sum_with_schedule(&self, sched: Schedule, iter: &I) -> LocalIterSumHandle where I: LocalIterator + 'static, I::Item: SyncSend + std::iter::Sum, diff --git a/src/array/global_lock_atomic/rdma.rs b/src/array/global_lock_atomic/rdma.rs index 94d01b73..2d257770 100644 --- a/src/array/global_lock_atomic/rdma.rs +++ b/src/array/global_lock_atomic/rdma.rs @@ -3,16 +3,14 @@ use crate::array::private::{ArrayExecAm, LamellarArrayPrivate}; use crate::array::{ ArrayRdmaAtHandle, ArrayRdmaHandle, Distribution, LamellarArrayGet, LamellarArrayInternalGet, LamellarArrayInternalPut, LamellarArrayPut, LamellarArrayRdmaInput, LamellarArrayRdmaOutput, - LamellarArrayRequest, LamellarEnv, LamellarRead, LamellarWrite, TeamTryInto, + LamellarEnv, LamellarRead, LamellarWrite, TeamTryInto, }; use crate::memregion::{ AsBase, Dist, LamellarMemoryRegion, OneSidedMemoryRegion, RTMemoryRegionRDMA, RegisteredMemoryRegion, SubRegion, }; -use futures::Future; -use std::collections::HashMap; -use std::pin::Pin; +use std::collections::{HashMap, VecDeque}; impl LamellarArrayInternalGet for GlobalLockArray { // fn iget> + LamellarWrite>(&self, index: usize, buf: U) { @@ -22,25 +20,27 @@ impl LamellarArrayInternalGet for GlobalLockArray { &self, index: usize, buf: U, - ) -> Box> { + ) -> ArrayRdmaHandle { let req = self.exec_am_local(InitGetAm { array: self.clone(), index: index, buf: buf.into(), }); - Box::new(ArrayRdmaHandle { reqs: vec![req] }) + ArrayRdmaHandle { + reqs: VecDeque::from([req.into()]), + } } - unsafe fn internal_at(&self, index: usize) -> Box> { + unsafe fn internal_at(&self, index: usize) -> ArrayRdmaAtHandle { let buf: OneSidedMemoryRegion = self.array.team_rt().alloc_one_sided_mem_region(1); let req = self.exec_am_local(InitGetAm { array: self.clone(), index: index, buf: buf.clone().into(), }); - Box::new(ArrayRdmaAtHandle { - reqs: vec![req], + ArrayRdmaAtHandle { + req: Some(req), buf: buf, - }) + } } } @@ -49,14 +49,16 @@ impl LamellarArrayGet for GlobalLockArray { &self, index: usize, buf: U, - ) -> Pin + Send>> { + ) -> ArrayRdmaHandle { match buf.team_try_into(&self.array.team_rt()) { - Ok(buf) => self.internal_get(index, buf).into_future(), - Err(_) => Box::pin(async move { () }), + Ok(buf) => self.internal_get(index, buf), + Err(_) => ArrayRdmaHandle { + reqs: VecDeque::new(), + }, } } - fn at(&self, index: usize) -> Pin + Send>> { - unsafe { self.internal_at(index).into_future() } + fn at(&self, index: usize) -> ArrayRdmaAtHandle { + unsafe { self.internal_at(index) } } } @@ -65,13 +67,15 @@ impl LamellarArrayInternalPut for GlobalLockArray { &self, index: usize, buf: U, - ) -> Box> { + ) -> ArrayRdmaHandle { let req = self.exec_am_local(InitPutAm { array: self.clone(), index: index, buf: buf.into(), }); - Box::new(ArrayRdmaHandle { reqs: vec![req] }) + ArrayRdmaHandle { + reqs: VecDeque::from([req.into()]), + } } } @@ -80,10 +84,12 @@ impl LamellarArrayPut for GlobalLockArray { &self, index: usize, buf: U, - ) -> Pin + Send>> { + ) -> ArrayRdmaHandle { match buf.team_try_into(&self.array.team_rt()) { - Ok(buf) => self.internal_put(index, buf).into_future(), - Err(_) => Box::pin(async move { () }), + Ok(buf) => self.internal_put(index, buf), + Err(_) => ArrayRdmaHandle { + reqs: VecDeque::new(), + }, } } } 
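
For reference, a minimal usage sketch of the guard-based GlobalLockArray API introduced above (blocking_read_lock / blocking_write_lock, guard-scoped local_data, and guard-based reductions). This is not part of the patch: the function name is hypothetical, and it assumes a world and array constructed as in the examples earlier in this patch, with the standard reductions registered for the element type.

```
// hypothetical usage sketch (not part of this patch)
use lamellar::array::prelude::*;

fn global_lock_guard_sketch() {
    let world = lamellar::LamellarWorldBuilder::new().build();
    let array = GlobalLockArray::<usize>::new(&world, 100, Distribution::Block);

    // Acquire the global read lock once; the guard hands out local data views
    // without re-acquiring the lock.
    let read_guard = array.blocking_read_lock();
    let local = read_guard.local_data();
    println!("local len: {}", local.len());
    drop(local); // release this clone of the read lock before writing below

    // Reductions now hang off the read guard; the returned handle implements
    // Future and can be driven to completion with block_on.
    let sum = array.block_on(read_guard.sum()); // consumes the guard
    println!("sum: {sum:?}");

    // Write access goes through the write guard.
    let mut local = array.blocking_write_lock().local_data();
    for elem in local.iter_mut() {
        *elem += 1;
    }
}
```
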
@@ -115,7 +121,7 @@ impl LamellarAm for InitGetAm { start_index: self.index, len: self.buf.len(), }; - reqs.push(self.array.exec_am_pe(pe, remote_am).into_future()); + reqs.push(self.array.exec_am_pe(pe, remote_am)); } unsafe { match self.array.array.inner.distribution { @@ -220,7 +226,7 @@ impl LamellarAm for InitPutAm { .into(), pe: self.array.my_pe(), }; - reqs.push(self.array.exec_am_pe(pe, remote_am).into_future()); + reqs.push(self.array.exec_am_pe(pe, remote_am)); } else { let remote_am = GlobalLockRemoteSmallPutAm { array: self.array.clone().into(), //inner of the indices we need to place data into @@ -230,7 +236,7 @@ impl LamellarAm for InitPutAm { [cur_index..(cur_index + u8_buf_len)] .to_vec(), }; - reqs.push(self.array.exec_am_pe(pe, remote_am).into_future()); + reqs.push(self.array.exec_am_pe(pe, remote_am)); } cur_index += u8_buf_len; } else { @@ -285,7 +291,7 @@ impl LamellarAm for InitPutAm { len: self.buf.len(), data: vec, }; - reqs.push(self.array.exec_am_pe(pe, remote_am).into_future()); + reqs.push(self.array.exec_am_pe(pe, remote_am)); } } } diff --git a/src/array/handle.rs b/src/array/handle.rs new file mode 100644 index 00000000..4b7f2d6b --- /dev/null +++ b/src/array/handle.rs @@ -0,0 +1,95 @@ +use std::{ + collections::VecDeque, + future::Future, + pin::Pin, + task::{Context, Poll, Waker}, +}; + +use pin_project::pin_project; + +use crate::{ + active_messaging::{AmHandle, LocalAmHandle}, + lamellar_request::LamellarRequest, + Dist, OneSidedMemoryRegion, RegisteredMemoryRegion, +}; + +pub struct ArrayRdmaHandle { + pub(crate) reqs: VecDeque>, +} + +impl LamellarRequest for ArrayRdmaHandle { + fn blocking_wait(mut self) -> Self::Output { + for req in self.reqs.drain(0..) { + req.blocking_wait(); + } + () + } + fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { + let mut ready = true; + for req in self.reqs.iter_mut() { + ready &= req.ready_or_set_waker(waker); + } + ready + } + fn val(&self) -> Self::Output { + for req in self.reqs.iter() { + req.val(); + } + } +} + +impl Future for ArrayRdmaHandle { + type Output = (); + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + while let Some(mut req) = self.reqs.pop_front() { + if !req.ready_or_set_waker(cx.waker()) { + self.reqs.push_front(req); + return Poll::Pending; + } + } + Poll::Ready(()) + } +} + +#[pin_project] +pub struct ArrayRdmaAtHandle { + pub(crate) req: Option>, + pub(crate) buf: OneSidedMemoryRegion, +} + +impl LamellarRequest for ArrayRdmaAtHandle { + fn blocking_wait(self) -> Self::Output { + match self.req { + Some(req) => req.blocking_wait(), + None => {}, //this means we did a blocking_get (With respect to RDMA) on either Unsafe or ReadOnlyArray so data is here + } + unsafe { self.buf.as_slice().expect("Data should exist on PE")[0] } + } + fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { + if let Some(req) = &mut self.req { + req.ready_or_set_waker(waker) + } else { + true + } + } + fn val(&self) -> Self::Output { + unsafe { self.buf.as_slice().expect("Data should exist on PE")[0] } + } +} + +impl Future for ArrayRdmaAtHandle { + type Output = T; + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let mut this = self.project(); + match &mut this.req { + Some(req) => { + if !req.ready_or_set_waker(cx.waker()) { + return Poll::Pending; + } + + } + None => {},//this means we did a blocking_get (With respect to RDMA) on either Unsafe or ReadOnlyArray so data is here + } + Poll::Ready(unsafe { this.buf.as_slice().expect("Data should exist on PE")[0] }) + } +} 
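
For reference, a minimal consumption sketch for the new RDMA handles defined in src/array/handle.rs above. This is not part of the patch: the function name is hypothetical, and it assumes a GlobalLockArray built as in the earlier examples; the same pattern applies to the other array types whose get/put/at now return these handles instead of boxed requests.

```
// hypothetical usage sketch (not part of this patch)
use lamellar::array::prelude::*;

fn rdma_handle_sketch() {
    let world = lamellar::LamellarWorldBuilder::new().build();
    let array = GlobalLockArray::<usize>::new(&world, 100, Distribution::Block);

    // ArrayRdmaAtHandle implements Future, so it can be awaited directly...
    let array_clone = array.clone();
    world.block_on(async move {
        let val = array_clone.at(7).await;
        println!("element 7 = {val}");
    });

    // ...or driven to completion from synchronous code via block_on.
    let val = world.block_on(array.at(7));
    println!("element 7 (blocking) = {val}");
}
```
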
diff --git a/src/array/iterator/consumer.rs b/src/array/iterator/consumer.rs index 7018bc46..35589d04 100644 --- a/src/array/iterator/consumer.rs +++ b/src/array/iterator/consumer.rs @@ -3,13 +3,13 @@ //! use crate::active_messaging::{LamellarArcLocalAm, SyncSend}; -use crate::array::iterator::IterRequest; -use crate::lamellar_request::LamellarRequest; +use crate::lamellar_task_group::TaskGroupLocalAmHandle; use crate::lamellar_team::LamellarTeamRT; use parking_lot::Mutex; use rand::prelude::SliceRandom; use rand::thread_rng; +use std::collections::VecDeque; use std::pin::Pin; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; @@ -183,14 +183,15 @@ pub(crate) trait IterConsumer: SyncSend { type AmOutput; type Output; type Item; + type Handle; fn init(&self, start: usize, cnt: usize) -> Self; fn next(&mut self) -> Option; fn into_am(&self, schedule: IterSchedule) -> LamellarArcLocalAm; fn create_handle( self, team: Pin>, - reqs: Vec>>, - ) -> Box>; + reqs: VecDeque>, + ) -> Self::Handle; fn max_elems(&self, in_elems: usize) -> usize; } diff --git a/src/array/iterator/distributed_iterator.rs b/src/array/iterator/distributed_iterator.rs index 5c1cdd39..3f06b7f2 100644 --- a/src/array/iterator/distributed_iterator.rs +++ b/src/array/iterator/distributed_iterator.rs @@ -47,7 +47,7 @@ use crate::LamellarTeamRT; use crate::active_messaging::SyncSend; use enum_dispatch::enum_dispatch; -use futures::Future; +use futures_util::Future; use std::marker::PhantomData; use std::pin::Pin; use std::sync::Arc; @@ -69,12 +69,12 @@ use std::sync::Arc; // type Output = (); // async fn into_future(mut self: Box) -> Self::Output { // for req in self.reqs.drain(..) { -// req.into_future().await; +// req.await; // } // } // fn wait(mut self: Box) -> Self::Output { // for req in self.reqs.drain(..) { -// req.get(); +// req.blocking_wait(); // } // } // } @@ -130,7 +130,7 @@ use std::sync::Arc; // async fn into_future(mut self: Box) -> Self::Output { // let mut local_vals = vec![]; // for req in self.reqs.drain(0..) { -// let v = req.into_future().await; +// let v = req.await; // local_vals.extend(v); // } // self.create_array(&local_vals) @@ -138,7 +138,7 @@ use std::sync::Arc; // fn wait(mut self: Box) -> Self::Output { // let mut local_vals = vec![]; // for req in self.reqs.drain(0..) 
{ -// let v = req.get(); +// let v = req.blocking_wait(); // local_vals.extend(v); // } // self.create_array(&local_vals) @@ -148,7 +148,7 @@ use std::sync::Arc; #[doc(hidden)] #[enum_dispatch] pub trait DistIteratorLauncher { - fn for_each(&self, iter: &I, op: F) -> Pin + Send>> + fn for_each(&self, iter: &I, op: F) -> DistIterForEachHandle where I: DistributedIterator + 'static, F: Fn(I::Item) + SyncSend + Clone + 'static; @@ -158,16 +158,12 @@ pub trait DistIteratorLauncher { sched: Schedule, iter: &I, op: F, - ) -> Pin + Send>> + ) -> DistIterForEachHandle where I: DistributedIterator + 'static, F: Fn(I::Item) + SyncSend + Clone + 'static; - fn for_each_async( - &self, - iter: &I, - op: F, - ) -> Pin + Send>> + fn for_each_async(&self, iter: &I, op: F) -> DistIterForEachHandle where I: DistributedIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, @@ -178,17 +174,13 @@ pub trait DistIteratorLauncher { sched: Schedule, iter: &I, op: F, - ) -> Pin + Send>> + ) -> DistIterForEachHandle where I: DistributedIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, Fut: Future + Send + 'static; - fn reduce( - &self, - iter: &I, - op: F, - ) -> Pin> + Send>> + fn reduce(&self, iter: &I, op: F) -> DistIterReduceHandle where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, @@ -199,13 +191,13 @@ pub trait DistIteratorLauncher { sched: Schedule, iter: &I, op: F, - ) -> Pin> + Send>> + ) -> DistIterReduceHandle where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static; - fn collect(&self, iter: &I, d: Distribution) -> Pin + Send>> + fn collect(&self, iter: &I, d: Distribution) -> DistIterCollectHandle where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, @@ -216,17 +208,13 @@ pub trait DistIteratorLauncher { sched: Schedule, iter: &I, d: Distribution, - ) -> Pin + Send>> + ) -> DistIterCollectHandle where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static; - fn collect_async( - &self, - iter: &I, - d: Distribution, - ) -> Pin + Send>> + fn collect_async(&self, iter: &I, d: Distribution) -> DistIterCollectHandle where I: DistributedIterator, I::Item: Future + Send + 'static, @@ -238,35 +226,27 @@ pub trait DistIteratorLauncher { sched: Schedule, iter: &I, d: Distribution, - ) -> Pin + Send>> + ) -> DistIterCollectHandle where I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static; - fn count(&self, iter: &I) -> Pin + Send>> + fn count(&self, iter: &I) -> DistIterCountHandle where I: DistributedIterator + 'static; - fn count_with_schedule( - &self, - sched: Schedule, - iter: &I, - ) -> Pin + Send>> + fn count_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterCountHandle where I: DistributedIterator + 'static; - fn sum(&self, iter: &I) -> Pin + Send>> + fn sum(&self, iter: &I) -> DistIterSumHandle where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps + std::iter::Sum; - fn sum_with_schedule( - &self, - sched: Schedule, - iter: &I, - ) -> Pin + Send>> + fn sum_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterSumHandle where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps + std::iter::Sum; @@ -475,7 +455,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// .for_each(move |elem| println!("{:?} {elem}",std::thread::current().id())) /// ); ///``` - fn 
for_each(&self, op: F) -> Pin + Send>> + fn for_each(&self, op: F) -> DistIterForEachHandle where F: Fn(Self::Item) + SyncSend + Clone + 'static, { @@ -512,7 +492,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// fut.await; /// } ///``` - fn for_each_async(&self, op: F) -> Pin + Send>> + fn for_each_async(&self, op: F) -> DistIterForEachHandle where F: Fn(Self::Item) -> Fut + SyncSend + Clone + 'static, Fut: Future + Send + 'static, @@ -537,11 +517,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// array.dist_iter().for_each_with_schedule(Schedule::WorkStealing, |elem| println!("{:?} {elem}",std::thread::current().id())); /// array.wait_all(); ///``` - fn for_each_with_schedule( - &self, - sched: Schedule, - op: F, - ) -> Pin + Send>> + fn for_each_with_schedule(&self, sched: Schedule, op: F) -> DistIterForEachHandle where F: Fn(Self::Item) + SyncSend + Clone + 'static, { @@ -573,11 +549,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// }); /// array.wait_all(); ///``` - fn for_each_async_with_schedule( - &self, - sched: Schedule, - op: F, - ) -> Pin + Send>> + fn for_each_async_with_schedule(&self, sched: Schedule, op: F) -> DistIterForEachHandle where F: Fn(Self::Item) -> Fut + SyncSend + Clone + 'static, Fut: Future + Send + 'static, @@ -601,7 +573,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// let req = array.dist_iter().reduce(|acc,elem| acc+elem); /// let sum = array.block_on(req); //wait on the collect request to get the new array ///``` - fn reduce(&self, op: F) -> Pin> + Send>> + fn reduce(&self, op: F) -> DistIterReduceHandle where // &'static Self: LocalIterator + 'static, Self::Item: Dist + ArrayOps, @@ -624,11 +596,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// let req = array.dist_iter().reduce_with_schedule(Schedule::Static,|acc,elem| acc+elem); /// let sum = array.block_on(req); //wait on the collect request to get the new array ///``` - fn reduce_with_schedule( - &self, - sched: Schedule, - op: F, - ) -> Pin> + Send>> + fn reduce_with_schedule(&self, sched: Schedule, op: F) -> DistIterReduceHandle where // &'static Self: LocalIterator + 'static, Self::Item: Dist + ArrayOps, @@ -663,7 +631,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// .collect::>(Distribution::Block); /// let new_array = array.block_on(req); //wait on the collect request to get the new array ///``` - fn collect(&self, d: Distribution) -> Pin + Send>> + fn collect(&self, d: Distribution) -> DistIterCollectHandle where // &'static Self: DistributedIterator + 'static, Self::Item: Dist + ArrayOps, @@ -708,7 +676,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// .collect_async::,_>(Distribution::Cyclic); /// let _new_array = array.block_on(req); ///``` - fn collect_async(&self, d: Distribution) -> Pin + Send>> + fn collect_async(&self, d: Distribution) -> DistIterCollectHandle where // &'static Self: DistributedIterator + 'static, T: Dist + ArrayOps, @@ -732,7 +700,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// let req = array.dist_iter().filter(|elem| elem < 10).count(); /// let cnt = array.block_on(req); //wait on the collect request to get the new array ///``` - fn count(&self) -> Pin + Send>> { + fn count(&self) -> DistIterCountHandle { self.array().count(self) } @@ -750,7 +718,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// let req = array.dist_iter().filter(|elem| elem < 
10).count_with_schedule(Schedule::Dynamic); /// let cnt = array.block_on(req); //wait on the collect request to get the new array ///``` - fn count_with_schedule(&self, sched: Schedule) -> Pin + Send>> { + fn count_with_schedule(&self, sched: Schedule) -> DistIterCountHandle { self.array().count_with_schedule(sched, self) } @@ -772,7 +740,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// let req = array.dist_iter().sum(); /// let sum = array.block_on(req); //wait on the collect request to get the new array ///``` - fn sum(&self) -> Pin + Send>> + fn sum(&self) -> DistIterSumHandle where Self::Item: Dist + ArrayOps + std::iter::Sum, { @@ -797,7 +765,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// let req = array.dist_iter().sum_with_schedule(Schedule::Guided); /// let sum = array.block_on(req); //wait on the collect request to get the new array ///``` - fn sum_with_schedule(&self, sched: Schedule) -> Pin + Send>> + fn sum_with_schedule(&self, sched: Schedule) -> DistIterSumHandle where Self::Item: Dist + ArrayOps + std::iter::Sum, { diff --git a/src/array/iterator/distributed_iterator/consumer/collect.rs b/src/array/iterator/distributed_iterator/consumer/collect.rs index 85a6b800..156baa60 100644 --- a/src/array/iterator/distributed_iterator/consumer/collect.rs +++ b/src/array/iterator/distributed_iterator/consumer/collect.rs @@ -1,18 +1,21 @@ use crate::active_messaging::{LamellarArcLocalAm, SyncSend}; use crate::array::iterator::consumer::*; use crate::array::iterator::distributed_iterator::{DistributedIterator, Monotonic}; -use crate::array::iterator::{private::*, IterRequest}; +use crate::array::iterator::private::*; use crate::array::operations::ArrayOps; use crate::array::{AsyncTeamFrom, AsyncTeamInto, Distribution, TeamInto}; use crate::lamellar_request::LamellarRequest; +use crate::lamellar_task_group::TaskGroupLocalAmHandle; use crate::lamellar_team::LamellarTeamRT; use crate::memregion::Dist; -use async_trait::async_trait; use core::marker::PhantomData; -use futures::Future; +use futures_util::{ready, Future}; +use pin_project::pin_project; +use std::collections::VecDeque; use std::pin::Pin; use std::sync::Arc; +use std::task::{Context, Poll, Waker}; #[derive(Clone, Debug)] pub struct Collect { @@ -40,6 +43,7 @@ where type AmOutput = Vec<(usize, I::Item)>; type Output = A; type Item = (usize, I::Item); + type Handle = DistIterCollectHandle; fn init(&self, start: usize, cnt: usize) -> Self { Collect { iter: self.iter.init(start, cnt), @@ -59,14 +63,14 @@ where fn create_handle( self, team: Pin>, - reqs: Vec>>, - ) -> Box> { - Box::new(DistIterCollectHandle { + reqs: VecDeque>, + ) -> Self::Handle { + DistIterCollectHandle { reqs, distribution: self.distribution, team, - _phantom: self._phantom, - }) + state: State::ReqsPending(Vec::new()), + } } fn max_elems(&self, in_elems: usize) -> usize { self.iter.elems(in_elems) @@ -100,6 +104,7 @@ where type AmOutput = Vec<(usize, B)>; type Output = A; type Item = (usize, I::Item); + type Handle = DistIterCollectHandle; fn init(&self, start: usize, cnt: usize) -> Self { CollectAsync { iter: self.iter.init(start, cnt), @@ -119,14 +124,14 @@ where fn create_handle( self, team: Pin>, - reqs: Vec>>, - ) -> Box> { - Box::new(DistIterCollectHandle { + reqs: VecDeque>, + ) -> Self::Handle { + DistIterCollectHandle { reqs, distribution: self.distribution, team, - _phantom: PhantomData, - }) + state: State::ReqsPending(Vec::new()), + } } fn max_elems(&self, in_elems: usize) -> usize { 
self.iter.elems(in_elems) @@ -150,22 +155,32 @@ where } #[doc(hidden)] +#[pin_project] pub struct DistIterCollectHandle< T: Dist + ArrayOps, A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend, > { - pub(crate) reqs: Vec>>>, + pub(crate) reqs: VecDeque>>, pub(crate) distribution: Distribution, pub(crate) team: Pin>, - pub(crate) _phantom: PhantomData, + state: State, } -impl, Distribution)> + SyncSend> +enum State, Distribution)> + SyncSend> { + ReqsPending(Vec<(usize, T)>), + Collecting(Pin>>), +} + +impl, Distribution)> + SyncSend + 'static> DistIterCollectHandle { - async fn async_create_array(&self, local_vals: Vec) -> A { - let input = (local_vals, self.distribution); - let array: A = AsyncTeamInto::team_into(input, &self.team).await; + async fn async_create_array( + local_vals: Vec, + dist: Distribution, + team: Pin>, + ) -> A { + let input = (local_vals, dist); + let array: A = AsyncTeamInto::team_into(input, &team).await; array } @@ -175,26 +190,78 @@ impl, Distribution)> + SyncSend> array } } -#[async_trait] -impl, Distribution)> + SyncSend> IterRequest + +impl, Distribution)> + SyncSend + 'static> Future for DistIterCollectHandle { type Output = A; - async fn into_future(mut self: Box) -> Self::Output { + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let mut this = self.project(); + match &mut this.state { + State::ReqsPending(ref mut vals) => { + while let Some(mut req) = this.reqs.pop_front() { + if req.ready_or_set_waker(cx.waker()) { + vals.extend(req.val()); + } else { + //still need to wait on this req + this.reqs.push_front(req); + return Poll::Pending; + } + } + vals.sort_by(|a, b| a.0.cmp(&b.0)); + let local_vals = vals.into_iter().map(|v| v.1).collect(); + let mut collect = Box::pin(Self::async_create_array( + local_vals, + *this.distribution, + this.team.clone(), + )); + + match Future::poll(collect.as_mut(), cx) { + Poll::Ready(a) => { + return Poll::Ready(a); + } + Poll::Pending => { + *this.state = State::Collecting(collect); + return Poll::Pending; + } + } + } + State::Collecting(collect) => { + let a = ready!(Future::poll(collect.as_mut(), cx)); + Poll::Ready(a) + } + } + } +} + +impl, Distribution)> + SyncSend + 'static> + LamellarRequest for DistIterCollectHandle +{ + fn blocking_wait(mut self) -> Self::Output { + // let mut num_local_vals = 0; let mut temp_vals = vec![]; for req in self.reqs.drain(0..) { - let v = req.into_future().await; + let v = req.blocking_wait(); temp_vals.extend(v); } temp_vals.sort_by(|a, b| a.0.cmp(&b.0)); - let local_vals = temp_vals.into_iter().map(|v| v.1).collect::>(); - self.async_create_array(local_vals).await + let local_vals = temp_vals.into_iter().map(|v| v.1).collect(); + self.create_array(local_vals) } - fn wait(mut self: Box) -> Self::Output { + fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { + for req in self.reqs.iter_mut() { + if !req.ready_or_set_waker(waker) { + //only need to wait on the next unready req + return false; + } + } + true + } + fn val(&self) -> Self::Output { // let mut num_local_vals = 0; let mut temp_vals = vec![]; - for req in self.reqs.drain(0..) 
{ - let v = req.get(); + for req in self.reqs.iter() { + let v = req.val(); temp_vals.extend(v); } temp_vals.sort_by(|a, b| a.0.cmp(&b.0)); diff --git a/src/array/iterator/distributed_iterator/consumer/count.rs b/src/array/iterator/distributed_iterator/consumer/count.rs index 53d40f27..ed426ffa 100644 --- a/src/array/iterator/distributed_iterator/consumer/count.rs +++ b/src/array/iterator/distributed_iterator/consumer/count.rs @@ -1,17 +1,24 @@ use crate::active_messaging::LamellarArcLocalAm; use crate::array::iterator::consumer::*; use crate::array::iterator::distributed_iterator::DistributedIterator; -use crate::array::iterator::{private::*, IterRequest}; +use crate::array::iterator::private::*; + +use crate::darc::DarcMode; use crate::lamellar_request::LamellarRequest; +use crate::lamellar_task_group::TaskGroupLocalAmHandle; use crate::lamellar_team::LamellarTeamRT; use crate::Darc; use async_trait::async_trait; +use futures_util::{ready, Future}; +use pin_project::pin_project; +use std::collections::VecDeque; use std::pin::Pin; use std::sync::{ atomic::{AtomicUsize, Ordering}, Arc, }; +use std::task::{Context, Poll, Waker}; #[derive(Clone, Debug)] pub struct Count { @@ -33,6 +40,7 @@ where type AmOutput = usize; type Output = usize; type Item = I::Item; + type Handle = DistIterCountHandle; fn init(&self, start: usize, cnt: usize) -> Self { Count { iter: self.iter.init(start, cnt), @@ -50,9 +58,13 @@ where fn create_handle( self, team: Pin>, - reqs: Vec>>, - ) -> Box> { - Box::new(RemoteIterCountHandle { reqs, team }) + reqs: VecDeque>, + ) -> Self::Handle { + DistIterCountHandle { + reqs, + team, + state: State::ReqsPending(0), + } } fn max_elems(&self, in_elems: usize) -> usize { self.iter.elems(in_elems) @@ -60,9 +72,16 @@ where } #[doc(hidden)] -pub struct RemoteIterCountHandle { - pub(crate) reqs: Vec>>, +#[pin_project] +pub struct DistIterCountHandle { + pub(crate) reqs: VecDeque>, team: Pin>, + state: State, +} + +enum State { + ReqsPending(usize), + Counting(Pin>>), } #[lamellar_impl::AmDataRT] @@ -78,20 +97,21 @@ impl LamellarAm for UpdateCntAm { } } -impl RemoteIterCountHandle { - async fn async_reduce_remote_counts(self, local_cnt: usize, cnt: Darc) -> usize { - self.team - .exec_am_all(UpdateCntAm { - remote_cnt: local_cnt, - cnt: cnt.clone(), - }) - .into_future() - .await; - self.team.async_barrier().await; +impl DistIterCountHandle { + async fn async_reduce_remote_counts(local_cnt: usize, team: Pin>) -> usize { + let cnt = Darc::async_try_new(&team, AtomicUsize::new(0), DarcMode::Darc) + .await + .unwrap(); + team.exec_am_all(UpdateCntAm { + remote_cnt: local_cnt, + cnt: cnt.clone(), + }) + .await; + team.async_barrier().await; cnt.load(Ordering::SeqCst) } - fn reduce_remote_counts(self, local_cnt: usize, cnt: Darc) -> usize { + fn reduce_remote_counts(&self, local_cnt: usize, cnt: Darc) -> usize { self.team.exec_am_all(UpdateCntAm { remote_cnt: local_cnt, cnt: cnt.clone(), @@ -102,28 +122,69 @@ impl RemoteIterCountHandle { } } +impl Future for DistIterCountHandle { + type Output = usize; + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let mut this = self.project(); + match &mut this.state { + State::ReqsPending(cnt) => { + while let Some(mut req) = this.reqs.pop_front() { + if !req.ready_or_set_waker(cx.waker()) { + this.reqs.push_front(req); + return Poll::Pending; + } + *cnt += req.val(); + } + let mut global_cnt = + Box::pin(Self::async_reduce_remote_counts(*cnt, this.team.clone())); + match Future::poll(global_cnt.as_mut(), cx) { + 
Poll::Ready(count) => { + return Poll::Ready(count); + } + Poll::Pending => { + *this.state = State::Counting(global_cnt); + Poll::Pending + } + } + } + State::Counting(global_cnt) => { + let count = ready!(Future::poll(global_cnt.as_mut(), cx)); + Poll::Ready(count) + } + } + } +} + #[doc(hidden)] #[async_trait] -impl IterRequest for RemoteIterCountHandle { - type Output = usize; - async fn into_future(mut self: Box) -> Self::Output { - self.team.async_barrier().await; +impl LamellarRequest for DistIterCountHandle { + fn blocking_wait(mut self) -> Self::Output { + self.team.tasking_barrier(); let cnt = Darc::new(&self.team, AtomicUsize::new(0)).unwrap(); - // all the requests should have already been launched, and we are just awaiting the results - let count = futures::future::join_all(self.reqs.drain(..).map(|req| req.into_future())) - .await + let count = self + .reqs + .drain(..) + .map(|req| req.blocking_wait()) .into_iter() .sum::(); - // println!("count: {} {:?}", count, std::thread::current().id()); - self.async_reduce_remote_counts(count, cnt).await + self.reduce_remote_counts(count, cnt) } - fn wait(mut self: Box) -> Self::Output { + fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { + for req in self.reqs.iter_mut() { + if !req.ready_or_set_waker(waker) { + //only need to wait on the next unready req + return false; + } + } + true + } + fn val(&self) -> Self::Output { self.team.tasking_barrier(); let cnt = Darc::new(&self.team, AtomicUsize::new(0)).unwrap(); let count = self .reqs - .drain(..) - .map(|req| req.get()) + .iter() + .map(|req| req.val()) .into_iter() .sum::(); self.reduce_remote_counts(count, cnt) diff --git a/src/array/iterator/distributed_iterator/consumer/for_each.rs b/src/array/iterator/distributed_iterator/consumer/for_each.rs index a45bbff9..5de9679c 100644 --- a/src/array/iterator/distributed_iterator/consumer/for_each.rs +++ b/src/array/iterator/distributed_iterator/consumer/for_each.rs @@ -1,14 +1,16 @@ use crate::active_messaging::{LamellarArcLocalAm, SyncSend}; use crate::array::iterator::consumer::*; use crate::array::iterator::distributed_iterator::DistributedIterator; -use crate::array::iterator::{private::*, IterRequest}; +use crate::array::iterator::private::*; use crate::lamellar_request::LamellarRequest; +use crate::lamellar_task_group::TaskGroupLocalAmHandle; use crate::lamellar_team::LamellarTeamRT; -use async_trait::async_trait; -use futures::Future; +use futures_util::Future; +use std::collections::VecDeque; use std::pin::Pin; use std::sync::Arc; +use std::task::{Context, Poll, Waker}; #[derive(Clone, Debug)] pub struct ForEach @@ -41,6 +43,7 @@ where type AmOutput = (); type Output = (); type Item = I::Item; + type Handle = DistIterForEachHandle; fn init(&self, start: usize, cnt: usize) -> Self { ForEach { iter: self.iter.init(start, cnt), @@ -60,9 +63,9 @@ where fn create_handle( self, _team: Pin>, - reqs: Vec>>, - ) -> Box> { - Box::new(DistIterForEachHandle { reqs }) + reqs: VecDeque>, + ) -> Self::Handle { + DistIterForEachHandle { reqs } } fn max_elems(&self, in_elems: usize) -> usize { self.iter.elems(in_elems) @@ -104,6 +107,7 @@ where type AmOutput = (); type Output = (); type Item = I::Item; + type Handle = DistIterForEachHandle; fn init(&self, start: usize, cnt: usize) -> Self { ForEachAsync { iter: self.iter.init(start, cnt), @@ -124,9 +128,9 @@ where fn create_handle( self, _team: Pin>, - reqs: Vec>>, - ) -> Box> { - Box::new(DistIterForEachHandle { reqs }) + reqs: VecDeque>, + ) -> Self::Handle { + DistIterForEachHandle { 
reqs } } fn max_elems(&self, in_elems: usize) -> usize { self.iter.elems(in_elems) @@ -152,21 +156,41 @@ where #[doc(hidden)] pub struct DistIterForEachHandle { - pub(crate) reqs: Vec>>, + pub(crate) reqs: VecDeque>, } -#[doc(hidden)] -#[async_trait] -impl IterRequest for DistIterForEachHandle { +impl Future for DistIterForEachHandle { type Output = (); - async fn into_future(mut self: Box) -> Self::Output { - for req in self.reqs.drain(..) { - req.into_future().await; + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + while let Some(mut req) = self.reqs.pop_front() { + if !req.ready_or_set_waker(cx.waker()) { + self.reqs.push_front(req); + return Poll::Pending; + } } + Poll::Ready(()) } - fn wait(mut self: Box) -> Self::Output { +} + +#[doc(hidden)] +impl LamellarRequest for DistIterForEachHandle { + fn blocking_wait(mut self) -> Self::Output { for req in self.reqs.drain(..) { - req.get(); + req.blocking_wait(); + } + } + fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { + for req in self.reqs.iter_mut() { + if !req.ready_or_set_waker(waker) { + //only need to wait on the next unready req + return false; + } + } + true + } + fn val(&self) -> Self::Output { + for req in self.reqs.iter() { + req.val(); } } } diff --git a/src/array/iterator/distributed_iterator/consumer/reduce.rs b/src/array/iterator/distributed_iterator/consumer/reduce.rs index 94fd66f8..348f31f4 100644 --- a/src/array/iterator/distributed_iterator/consumer/reduce.rs +++ b/src/array/iterator/distributed_iterator/consumer/reduce.rs @@ -2,15 +2,19 @@ use crate::active_messaging::{LamellarArcLocalAm, SyncSend}; use crate::array::iterator::consumer::*; use crate::array::iterator::distributed_iterator::DistributedIterator; use crate::array::iterator::one_sided_iterator::OneSidedIterator; -use crate::array::iterator::{private::*, IterRequest}; +use crate::array::iterator::private::*; use crate::array::{ArrayOps, Distribution, UnsafeArray}; use crate::lamellar_request::LamellarRequest; +use crate::lamellar_task_group::TaskGroupLocalAmHandle; use crate::lamellar_team::LamellarTeamRT; use crate::Dist; -use async_trait::async_trait; +use futures_util::{ready, Future, StreamExt}; +use pin_project::pin_project; +use std::collections::VecDeque; use std::pin::Pin; use std::sync::Arc; +use std::task::{Context, Poll, Waker}; #[derive(Clone, Debug)] pub struct Reduce { @@ -36,6 +40,7 @@ where type AmOutput = Option; type Output = Option; type Item = I::Item; + type Handle = DistIterReduceHandle; fn init(&self, start: usize, cnt: usize) -> Self { Reduce { iter: self.iter.init(start, cnt), @@ -55,13 +60,14 @@ where fn create_handle( self, team: Pin>, - reqs: Vec>>, - ) -> Box> { - Box::new(RemoteIterReduceHandle { + reqs: VecDeque>, + ) -> Self::Handle { + DistIterReduceHandle { op: self.op, reqs, team, - }) + state: State::ReqsPending(None), + } } fn max_elems(&self, in_elems: usize) -> usize { self.iter.elems(in_elems) @@ -69,36 +75,52 @@ where } #[doc(hidden)] -pub struct RemoteIterReduceHandle { - pub(crate) reqs: Vec>>>, +#[pin_project] +pub struct DistIterReduceHandle { + pub(crate) reqs: VecDeque>>, pub(crate) op: F, pub(crate) team: Pin>, + state: State, } -impl RemoteIterReduceHandle +enum State { + ReqsPending(Option), + Reducing(Pin>>>), +} + +impl DistIterReduceHandle where T: Dist + ArrayOps, F: Fn(T, T) -> T + SyncSend + Clone + 'static, { - async fn async_reduce_remote_vals(&self, local_val: Option) -> Option { - self.team.async_barrier().await; + async fn async_reduce_remote_vals( + local_val: T, + 
team: Pin>, + op: F, + ) -> Option { let local_vals = - UnsafeArray::>::new(&self.team, self.team.num_pes, Distribution::Block); + UnsafeArray::::async_new(&team, team.num_pes, Distribution::Block).await; unsafe { local_vals.local_as_mut_slice()[0] = local_val; }; local_vals.async_barrier().await; - let buffered_iter = unsafe { local_vals.buffered_onesided_iter(self.team.num_pes) }; - buffered_iter - .into_iter() - .filter_map(|&res| res) - .reduce(self.op.clone()) + let buffered_iter = unsafe { local_vals.buffered_onesided_iter(team.num_pes) }; + let mut stream = buffered_iter.into_stream(); + let first = stream.next().await?; + + Some( + stream + .fold(*first, |a, &b| { + let val = op(a, b); + async move { val } + }) + .await, + ) } - fn reduce_remote_vals(&self, local_val: Option) -> Option { + fn reduce_remote_vals(&self, local_val: T) -> Option { self.team.tasking_barrier(); - let local_vals = - UnsafeArray::>::new(&self.team, self.team.num_pes, Distribution::Block); + let local_vals = UnsafeArray::::new(&self.team, self.team.num_pes, Distribution::Block); unsafe { local_vals.local_as_mut_slice()[0] = local_val; }; @@ -106,34 +128,99 @@ where let buffered_iter = unsafe { local_vals.buffered_onesided_iter(self.team.num_pes) }; buffered_iter .into_iter() - .filter_map(|&res| res) + .map(|&x| x) .reduce(self.op.clone()) } } -#[doc(hidden)] -#[async_trait] -impl IterRequest for RemoteIterReduceHandle +impl Future for DistIterReduceHandle where T: Dist + ArrayOps, F: Fn(T, T) -> T + SyncSend + Clone + 'static, { type Output = Option; - async fn into_future(mut self: Box) -> Self::Output { - let local_val = futures::future::join_all(self.reqs.drain(..).map(|req| req.into_future())) - .await - .into_iter() - .filter_map(|res| res) - .reduce(self.op.clone()); - self.async_reduce_remote_vals(local_val).await + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let mut this = self.project(); + match &mut this.state { + State::ReqsPending(mut val) => { + while let Some(mut req) = this.reqs.pop_front() { + if !req.ready_or_set_waker(cx.waker()) { + this.reqs.push_front(req); + return Poll::Pending; + } + match val { + None => val = req.val(), + Some(val1) => { + if let Some(val2) = req.val() { + val = Some((this.op)(val1, val2)); + } + } + } + } + if let Some(val) = val { + let mut reducing = Box::pin(Self::async_reduce_remote_vals( + val.clone(), + this.team.clone(), + this.op.clone(), + )); + match Future::poll(reducing.as_mut(), cx) { + Poll::Ready(val) => Poll::Ready(val), + Poll::Pending => { + *this.state = State::Reducing(reducing); + Poll::Pending + } + } + } else { + Poll::Ready(None) + } + } + State::Reducing(reducing) => { + let val = ready!(Future::poll(reducing.as_mut(), cx)); + Poll::Ready(val) + } + } } - fn wait(mut self: Box) -> Self::Output { +} + +#[doc(hidden)] +impl LamellarRequest for DistIterReduceHandle +where + T: Dist + ArrayOps, + F: Fn(T, T) -> T + SyncSend + Clone + 'static, +{ + fn blocking_wait(mut self) -> Self::Output { let local_val = self .reqs .drain(..) 
- .filter_map(|req| req.get()) + .filter_map(|req| req.blocking_wait()) + .reduce(self.op.clone()); + if let Some(val) = local_val { + self.reduce_remote_vals(val) + } else { + None + } + } + fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { + for req in self.reqs.iter_mut() { + if !req.ready_or_set_waker(waker) { + //only need to wait on the next unready req + return false; + } + } + true + } + + fn val(&self) -> Self::Output { + let local_val = self + .reqs + .iter() + .filter_map(|req| req.val()) .reduce(self.op.clone()); - self.reduce_remote_vals(local_val) + if let Some(val) = local_val { + self.reduce_remote_vals(val) + } else { + None + } } } diff --git a/src/array/iterator/distributed_iterator/consumer/sum.rs b/src/array/iterator/distributed_iterator/consumer/sum.rs index 0d260a54..c9cd6b31 100644 --- a/src/array/iterator/distributed_iterator/consumer/sum.rs +++ b/src/array/iterator/distributed_iterator/consumer/sum.rs @@ -1,16 +1,18 @@ use crate::active_messaging::LamellarArcLocalAm; use crate::array::iterator::consumer::*; use crate::array::iterator::distributed_iterator::DistributedIterator; -use crate::array::iterator::one_sided_iterator::OneSidedIterator; -use crate::array::iterator::{private::*, IterRequest}; +use crate::array::iterator::private::*; use crate::array::{ArrayOps, Distribution, UnsafeArray}; use crate::lamellar_request::LamellarRequest; +use crate::lamellar_task_group::TaskGroupLocalAmHandle; use crate::lamellar_team::LamellarTeamRT; use crate::Dist; - -use async_trait::async_trait; +use futures_util::{ready, Future}; +use pin_project::pin_project; +use std::collections::VecDeque; use std::pin::Pin; use std::sync::Arc; +use std::task::{Context, Poll, Waker}; #[derive(Clone, Debug)] pub struct Sum { @@ -33,6 +35,7 @@ where type AmOutput = I::Item; type Output = I::Item; type Item = I::Item; + type Handle = DistIterSumHandle; fn init(&self, start: usize, cnt: usize) -> Self { Sum { iter: self.iter.init(start, cnt), @@ -50,9 +53,13 @@ where fn create_handle( self, team: Pin>, - reqs: Vec>>, - ) -> Box> { - Box::new(RemoteIterSumHandle { reqs, team }) + reqs: VecDeque>, + ) -> Self::Handle { + DistIterSumHandle { + reqs, + team, + state: State::ReqsPending(None), + } } fn max_elems(&self, in_elems: usize) -> usize { self.iter.elems(in_elems) @@ -60,22 +67,32 @@ where } #[doc(hidden)] -pub struct RemoteIterSumHandle { - pub(crate) reqs: Vec>>, +#[pin_project] +pub struct DistIterSumHandle { + pub(crate) reqs: VecDeque>, pub(crate) team: Pin>, + state: State, } -impl RemoteIterSumHandle +enum State { + ReqsPending(Option), + Summing(Pin>>), +} + +impl DistIterSumHandle where T: Dist + ArrayOps + std::iter::Sum, { - async fn async_reduce_remote_vals(&self, local_sum: T, local_sums: UnsafeArray) -> T { + async fn async_reduce_remote_vals(local_sum: T, team: Pin>) -> T { + let local_sums = + UnsafeArray::::async_new(&team, team.num_pes, Distribution::Block).await; unsafe { local_sums.local_as_mut_slice()[0] = local_sum; }; local_sums.async_barrier().await; - let buffered_iter = unsafe { local_sums.buffered_onesided_iter(self.team.num_pes) }; - buffered_iter.into_iter().map(|&e| e).sum() + // let buffered_iter = unsafe { local_sums.buffered_onesided_iter(self.team.num_pes) }; + // buffered_iter.into_iter().map(|&e| e).sum() + unsafe { local_sums.sum().await } } fn reduce_remote_vals(&self, local_sum: T, local_sums: UnsafeArray) -> T { @@ -83,30 +100,83 @@ where local_sums.local_as_mut_slice()[0] = local_sum; }; local_sums.tasking_barrier(); - let buffered_iter = 
unsafe { local_sums.buffered_onesided_iter(self.team.num_pes) }; - buffered_iter.into_iter().map(|&e| e).sum() + // let buffered_iter = unsafe { local_sums.buffered_onesided_iter(self.team.num_pes) }; + // buffered_iter.into_iter().map(|&e| e).sum() + unsafe { local_sums.sum().blocking_wait() } } } -#[doc(hidden)] -#[async_trait] -impl IterRequest for RemoteIterSumHandle +impl Future for DistIterSumHandle where T: Dist + ArrayOps + std::iter::Sum, { type Output = T; - async fn into_future(mut self: Box) -> Self::Output { - self.team.async_barrier().await; + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let mut this = self.project(); + match &mut this.state { + State::ReqsPending(local_sum) => { + while let Some(mut req) = this.reqs.pop_front() { + if !req.ready_or_set_waker(cx.waker()) { + this.reqs.push_front(req); + return Poll::Pending; + } + match local_sum { + Some(sum) => { + *sum = [*sum, req.val()].into_iter().sum(); + } + None => { + *local_sum = Some(req.val()); + } + } + } + let mut sum = Box::pin(Self::async_reduce_remote_vals( + local_sum.unwrap(), + this.team.clone(), + )); + match Future::poll(sum.as_mut(), cx) { + Poll::Ready(local_sum) => Poll::Ready(local_sum), + Poll::Pending => { + *this.state = State::Summing(sum); + Poll::Pending + } + } + } + State::Summing(sum) => { + let local_sum = ready!(Future::poll(sum.as_mut(), cx)); + Poll::Ready(local_sum) + } + } + } +} +#[doc(hidden)] +impl LamellarRequest for DistIterSumHandle +where + T: Dist + ArrayOps + std::iter::Sum, +{ + fn blocking_wait(mut self) -> Self::Output { let local_sums = UnsafeArray::::new(&self.team, self.team.num_pes, Distribution::Block); - let local_sum = futures::future::join_all(self.reqs.drain(..).map(|req| req.into_future())) - .await + let local_sum = self + .reqs + .drain(..) 
+ .map(|req| req.blocking_wait()) .into_iter() .sum(); - self.async_reduce_remote_vals(local_sum, local_sums).await + self.reduce_remote_vals(local_sum, local_sums) } - fn wait(mut self: Box) -> Self::Output { + + fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { + for req in self.reqs.iter_mut() { + if !req.ready_or_set_waker(waker) { + //only need to wait on the next unready req + return false; + } + } + true + } + + fn val(&self) -> Self::Output { let local_sums = UnsafeArray::::new(&self.team, self.team.num_pes, Distribution::Block); - let local_sum = self.reqs.drain(..).map(|req| req.get()).into_iter().sum(); + let local_sum = self.reqs.iter().map(|req| req.val()).into_iter().sum(); self.reduce_remote_vals(local_sum, local_sums) } } diff --git a/src/array/iterator/local_iterator.rs b/src/array/iterator/local_iterator.rs index df51405e..9c51120b 100644 --- a/src/array/iterator/local_iterator.rs +++ b/src/array/iterator/local_iterator.rs @@ -42,7 +42,7 @@ use crate::LamellarTeamRT; use crate::active_messaging::SyncSend; use enum_dispatch::enum_dispatch; -use futures::Future; +use futures_util::Future; use std::marker::PhantomData; use std::pin::Pin; use std::sync::Arc; @@ -50,7 +50,7 @@ use std::sync::Arc; #[doc(hidden)] #[enum_dispatch] pub trait LocalIteratorLauncher { - fn for_each(&self, iter: &I, op: F) -> Pin + Send>> + fn for_each(&self, iter: &I, op: F) -> LocalIterForEachHandle where I: LocalIterator + 'static, F: Fn(I::Item) + SyncSend + Clone + 'static; @@ -60,16 +60,12 @@ pub trait LocalIteratorLauncher { sched: Schedule, iter: &I, op: F, - ) -> Pin + Send>> + ) -> LocalIterForEachHandle where I: LocalIterator + 'static, F: Fn(I::Item) + SyncSend + Clone + 'static; - fn for_each_async( - &self, - iter: &I, - op: F, - ) -> Pin + Send>> + fn for_each_async(&self, iter: &I, op: F) -> LocalIterForEachHandle where I: LocalIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, @@ -80,17 +76,13 @@ pub trait LocalIteratorLauncher { sched: Schedule, iter: &I, op: F, - ) -> Pin + Send>> + ) -> LocalIterForEachHandle where I: LocalIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, Fut: Future + Send + 'static; - fn reduce( - &self, - iter: &I, - op: F, - ) -> Pin> + Send>> + fn reduce(&self, iter: &I, op: F) -> LocalIterReduceHandle where I: LocalIterator + 'static, I::Item: SyncSend, @@ -101,27 +93,13 @@ pub trait LocalIteratorLauncher { sched: Schedule, iter: &I, op: F, - ) -> Pin> + Send>> + ) -> LocalIterReduceHandle where I: LocalIterator + 'static, I::Item: SyncSend, F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static; - // fn reduce_async(&self, iter: &I, op: F) -> Pin> + Send>> - // where - // I: LocalIterator + 'static, - // I::Item: SyncSend, - // F: Fn(I::Item, I::Item) -> Fut + SyncSend + Clone + 'static, - // Fut: Future + SyncSend + Clone + 'static; - - // fn reduce_async_with_schedule(&self, sched: Schedule, iter: &I, op: F) -> Pin> + Send>> - // where - // I: LocalIterator + 'static, - // I::Item: SyncSend, - // F: Fn(I::Item, I::Item) -> Fut + SyncSend + Clone + 'static, - // Fut: Future + SyncSend + Clone + 'static; - - fn collect(&self, iter: &I, d: Distribution) -> Pin + Send>> + fn collect(&self, iter: &I, d: Distribution) -> LocalIterCollectHandle where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, @@ -132,57 +110,26 @@ pub trait LocalIteratorLauncher { sched: Schedule, iter: &I, d: Distribution, - ) -> Pin + Send>> + ) -> LocalIterCollectHandle where I: LocalIterator + 'static, I::Item: Dist + 
ArrayOps, A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static; - // fn collect_async( - // &self, - // iter: &I, - // d: Distribution, - // ) -> Pin + Send>> - // where - // I: LocalIterator + 'static, - // I::Item: Future + Send + 'static, - // B: Dist + ArrayOps, - // A: From> + SyncSend + Clone + 'static; - - // fn collect_async_with_schedule( - // &self, - // sched: Schedule, - // iter: &I, - // d: Distribution, - // ) -> Pin + Send>> - // where - // I: LocalIterator + 'static, - // I::Item: Future + Send + 'static, - // B: Dist + ArrayOps, - // A: From> + SyncSend + Clone + 'static; - - fn count(&self, iter: &I) -> Pin + Send>> + fn count(&self, iter: &I) -> LocalIterCountHandle where I: LocalIterator + 'static; - fn count_with_schedule( - &self, - sched: Schedule, - iter: &I, - ) -> Pin + Send>> + fn count_with_schedule(&self, sched: Schedule, iter: &I) -> LocalIterCountHandle where I: LocalIterator + 'static; - fn sum(&self, iter: &I) -> Pin + Send>> + fn sum(&self, iter: &I) -> LocalIterSumHandle where I: LocalIterator + 'static, I::Item: SyncSend + std::iter::Sum; - fn sum_with_schedule( - &self, - sched: Schedule, - iter: &I, - ) -> Pin + Send>> + fn sum_with_schedule(&self, sched: Schedule, iter: &I) -> LocalIterSumHandle where I: LocalIterator + 'static, I::Item: SyncSend + std::iter::Sum; @@ -381,7 +328,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// .for_each(move |elem| println!("{:?} {elem}",std::thread::current().id())) /// ); ///``` - fn for_each(&self, op: F) -> Pin + Send>> + fn for_each(&self, op: F) -> LocalIterForEachHandle where F: Fn(Self::Item) + SyncSend + Clone + 'static, { @@ -403,11 +350,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// array.local_iter().for_each_with_schedule(Schedule::WorkStealing, |elem| println!("{:?} {elem}",std::thread::current().id())); /// array.wait_all(); ///``` - fn for_each_with_schedule( - &self, - sched: Schedule, - op: F, - ) -> Pin + Send>> + fn for_each_with_schedule(&self, sched: Schedule, op: F) -> LocalIterForEachHandle where F: Fn(Self::Item) + SyncSend + Clone + 'static, { @@ -444,7 +387,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// fut.await; /// } ///``` - fn for_each_async(&self, op: F) -> Pin + Send>> + fn for_each_async(&self, op: F) -> LocalIterForEachHandle where F: Fn(Self::Item) -> Fut + SyncSend + Clone + 'static, Fut: Future + Send + 'static, @@ -474,11 +417,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// }); /// array.wait_all(); ///``` - fn for_each_async_with_schedule( - &self, - sched: Schedule, - op: F, - ) -> Pin + Send>> + fn for_each_async_with_schedule(&self, sched: Schedule, op: F) -> LocalIterForEachHandle where F: Fn(Self::Item) -> Fut + SyncSend + Clone + 'static, Fut: Future + Send + 'static, @@ -500,7 +439,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// let req = array.local_iter().reduce(|acc,elem| acc+elem); /// let sum = array.block_on(req); //wait on the collect request to get the new array ///``` - fn reduce(&self, op: F) -> Pin> + Send>> + fn reduce(&self, op: F) -> LocalIterReduceHandle where // &'static Self: LocalIterator + 'static, Self::Item: SyncSend, @@ -527,7 +466,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { &self, sched: Schedule, op: F, - ) -> Pin> + Send>> + ) -> LocalIterReduceHandle where // &'static Self: LocalIterator + 'static, Self::Item: SyncSend, @@ -575,7 +514,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// let req = 
array.local_iter().map(elem.load()).filter(|elem| elem % 2 == 0).collect::>(Distribution::Cyclic); /// let new_array = array.block_on(req); ///``` - fn collect(&self, d: Distribution) -> Pin + Send>> + fn collect(&self, d: Distribution) -> LocalIterCollectHandle where // &'static Self: LocalIterator + 'static, Self::Item: Dist + ArrayOps, @@ -603,7 +542,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { &self, sched: Schedule, d: Distribution, - ) -> Pin + Send>> + ) -> LocalIterCollectHandle where // &'static Self: LocalIterator + 'static, Self::Item: Dist + ArrayOps, @@ -653,7 +592,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// let req = array.local_iter().count(); /// let cnt = array.block_on(req); ///``` - fn count(&self) -> Pin + Send>> { + fn count(&self) -> LocalIterCountHandle { self.array().count(self) } @@ -671,7 +610,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// let req = array.local_iter().count_with_schedule(Scheduler::Dynamic); /// let cnt = array.block_on(req); ///``` - fn count_with_schedule(&self, sched: Schedule) -> Pin + Send>> { + fn count_with_schedule(&self, sched: Schedule) -> LocalIterCountHandle { self.array().count_with_schedule(sched, self) } @@ -693,7 +632,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// let req = array.local_iter().sum(); /// let sum = array.block_on(req); //wait on the collect request to get the new array ///``` - fn sum(&self) -> Pin + Send>> + fn sum(&self) -> LocalIterSumHandle where Self::Item: SyncSend + std::iter::Sum, { @@ -718,7 +657,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// let req = array.local_iter().sum_with_schedule(Schedule::Guided); /// let sum = array.block_on(req); ///``` - fn sum_with_schedule(&self, sched: Schedule) -> Pin + Send>> + fn sum_with_schedule(&self, sched: Schedule) -> LocalIterSumHandle where Self::Item: SyncSend + std::iter::Sum, { diff --git a/src/array/iterator/local_iterator/consumer/collect.rs b/src/array/iterator/local_iterator/consumer/collect.rs index 05897c92..ae96c6e6 100644 --- a/src/array/iterator/local_iterator/consumer/collect.rs +++ b/src/array/iterator/local_iterator/consumer/collect.rs @@ -1,17 +1,21 @@ use crate::active_messaging::{LamellarArcLocalAm, SyncSend}; use crate::array::iterator::consumer::*; use crate::array::iterator::local_iterator::{LocalIterator, Monotonic}; -use crate::array::iterator::{private::*, IterRequest}; +use crate::array::iterator::private::*; use crate::array::operations::ArrayOps; use crate::array::{AsyncTeamFrom, AsyncTeamInto, Distribution, TeamInto}; use crate::lamellar_request::LamellarRequest; +use crate::lamellar_task_group::TaskGroupLocalAmHandle; use crate::lamellar_team::LamellarTeamRT; use crate::memregion::Dist; -use async_trait::async_trait; use core::marker::PhantomData; +use futures_util::{ready, Future}; +use pin_project::pin_project; +use std::collections::VecDeque; use std::pin::Pin; use std::sync::Arc; +use std::task::{Context, Poll, Waker}; #[derive(Clone, Debug)] pub struct Collect { @@ -39,6 +43,7 @@ where type AmOutput = Vec<(usize, I::Item)>; type Output = A; type Item = (usize, I::Item); + type Handle = LocalIterCollectHandle; fn init(&self, start: usize, cnt: usize) -> Self { Collect { iter: self.iter.init(start, cnt), @@ -59,111 +64,123 @@ where fn create_handle( self, team: Pin>, - reqs: Vec>>, - ) -> Box> { - Box::new(LocalIterCollectHandle { + reqs: VecDeque>, + ) -> Self::Handle { + LocalIterCollectHandle { reqs, distribution: 
self.distribution, team, - _phantom: self._phantom, - }) + state: State::ReqsPending(Vec::new()), + } } fn max_elems(&self, in_elems: usize) -> usize { self.iter.elems(in_elems) } } -// impl MonotonicIterConsumer for Collect -// where -// I: LocalIterator, -// I::Item: Dist + ArrayOps, -// A: for<'a> TeamFrom<(&'a Vec,Distribution)> + SyncSend + Clone + 'static,{ -// fn monotonic(&self) -> J { -// Collect{ -// iter: self.iter.monotonic(), -// distribution: self.distribution.clone(), -// _phantom: self._phantom.clone(), -// } -// } -// } - -// #[derive(Clone,Debug)] -// pub struct CollectAsync{ -// pub(crate) iter: I, -// pub(crate) distribution: Distribution, -// pub(crate) _phantom: PhantomData<(A,B)> -// } - -// impl IterConsumer for CollectAsync -// where -// I: LocalIterator, -// I::Item: Future + Send + 'static, -// B: Dist + ArrayOps, -// A: From> + SyncSend + Clone + 'static,{ -// type AmOutput = Vec<(usize,B)>; -// type Output = A; -// fn into_am(self, schedule: IterSchedule) -> LamellarArcLocalAm { -// Arc::new(CollectAsyncAm{ -// iter: self.iter, -// schedule -// }) -// } -// fn create_handle(self, team: Pin>, reqs: Vec>>) -> Box> { -// Box::new(LocalIterCollectHandle { -// reqs, -// distribution: self.distribution, -// team, -// _phantom: PhantomData, -// }) -// } -// fn max_elems(&self, in_elems: usize) -> usize{ -// self.iter.elems(in_elems) -// } -// } - #[doc(hidden)] +#[pin_project] pub struct LocalIterCollectHandle< T: Dist + ArrayOps, A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend, > { - pub(crate) reqs: Vec>>>, + pub(crate) reqs: VecDeque>>, pub(crate) distribution: Distribution, pub(crate) team: Pin>, - pub(crate) _phantom: PhantomData, + state: State, } -impl, Distribution)> + SyncSend> +enum State, Distribution)> + SyncSend> { + ReqsPending(Vec<(usize, T)>), + Collecting(Pin>>), +} + +impl, Distribution)> + SyncSend + 'static> LocalIterCollectHandle { - async fn async_create_array(&self, local_vals: Vec) -> A { - let input = (local_vals, self.distribution); - AsyncTeamInto::team_into(input, &self.team).await + async fn async_create_array( + local_vals: Vec, + dist: Distribution, + team: Pin>, + ) -> A { + let input = (local_vals, dist); + AsyncTeamInto::team_into(input, &team).await } fn create_array(&self, local_vals: Vec) -> A { let input = (local_vals, self.distribution); TeamInto::team_into(input, &self.team) } } -#[async_trait] -impl, Distribution)> + SyncSend> IterRequest + +impl, Distribution)> + SyncSend + 'static> Future for LocalIterCollectHandle { type Output = A; - async fn into_future(mut self: Box) -> Self::Output { + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let mut this = self.project(); + match &mut this.state { + State::ReqsPending(vals) => { + while let Some(mut req) = this.reqs.pop_front() { + if !req.ready_or_set_waker(cx.waker()) { + this.reqs.push_front(req); + return Poll::Pending; + } else { + vals.extend(req.val()); + } + } + vals.sort_by(|a, b| a.0.cmp(&b.0)); + let local_vals = vals.into_iter().map(|v| v.1).collect(); + let mut collect = Box::pin(Self::async_create_array( + local_vals, + this.distribution.clone(), + this.team.clone(), + )); + match Future::poll(collect.as_mut(), cx) { + Poll::Ready(a) => { + return Poll::Ready(a); + } + Poll::Pending => { + *this.state = State::Collecting(collect); + return Poll::Pending; + } + } + } + State::Collecting(collect) => { + let a = ready!(Future::poll(collect.as_mut(), cx)); + return Poll::Ready(a); + } + } + } +} + +impl, Distribution)> + SyncSend + 'static> + 
LamellarRequest for LocalIterCollectHandle +{ + fn blocking_wait(mut self) -> Self::Output { + // let mut num_local_vals = 0; let mut temp_vals = vec![]; for req in self.reqs.drain(0..) { - let v = req.into_future().await; + let v = req.blocking_wait(); temp_vals.extend(v); } temp_vals.sort_by(|a, b| a.0.cmp(&b.0)); let local_vals = temp_vals.into_iter().map(|v| v.1).collect(); - self.async_create_array(local_vals).await + self.create_array(local_vals) + } + fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { + for req in self.reqs.iter_mut() { + if !req.ready_or_set_waker(waker) { + //only need to wait on the next unready req + return false; + } + } + true } - fn wait(mut self: Box) -> Self::Output { + fn val(&self) -> Self::Output { // let mut num_local_vals = 0; let mut temp_vals = vec![]; - for req in self.reqs.drain(0..) { - let v = req.get(); + for req in self.reqs.iter() { + let v = req.val(); temp_vals.extend(v); } temp_vals.sort_by(|a, b| a.0.cmp(&b.0)); diff --git a/src/array/iterator/local_iterator/consumer/count.rs b/src/array/iterator/local_iterator/consumer/count.rs index db6c25c3..9482dee2 100644 --- a/src/array/iterator/local_iterator/consumer/count.rs +++ b/src/array/iterator/local_iterator/consumer/count.rs @@ -1,13 +1,16 @@ use crate::active_messaging::LamellarArcLocalAm; use crate::array::iterator::local_iterator::LocalIterator; -use crate::array::iterator::IterRequest; use crate::array::iterator::{consumer::*, private::*}; use crate::lamellar_request::LamellarRequest; +use crate::lamellar_task_group::TaskGroupLocalAmHandle; use crate::lamellar_team::LamellarTeamRT; -use async_trait::async_trait; +use futures_util::Future; +use pin_project::pin_project; +use std::collections::VecDeque; use std::pin::Pin; use std::sync::Arc; +use std::task::{Context, Poll, Waker}; #[derive(Clone, Debug)] pub struct Count { @@ -29,6 +32,7 @@ where type AmOutput = usize; type Output = usize; type Item = I::Item; + type Handle = LocalIterCountHandle; fn init(&self, start: usize, cnt: usize) -> Self { Count { iter: self.iter.init(start, cnt), @@ -46,9 +50,12 @@ where fn create_handle( self, _team: Pin>, - reqs: Vec>>, - ) -> Box> { - Box::new(LocalIterCountHandle { reqs }) + reqs: VecDeque>, + ) -> LocalIterCountHandle { + LocalIterCountHandle { + reqs, + state: State::ReqsPending(0), + } } fn max_elems(&self, in_elems: usize) -> usize { self.iter.elems(in_elems) @@ -56,26 +63,57 @@ where } #[doc(hidden)] +#[pin_project] pub struct LocalIterCountHandle { - pub(crate) reqs: Vec>>, + pub(crate) reqs: VecDeque>, + state: State, } -#[doc(hidden)] -#[async_trait] -impl IterRequest for LocalIterCountHandle { +enum State { + ReqsPending(usize), +} + +impl Future for LocalIterCountHandle { type Output = usize; - async fn into_future(mut self: Box) -> Self::Output { - let count = futures::future::join_all(self.reqs.drain(..).map(|req| req.into_future())) - .await - .into_iter() - .sum::(); - // println!("count: {} {:?}", count, std::thread::current().id()); - count + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let mut this = self.project(); + match &mut this.state { + State::ReqsPending(cnt) => { + while let Some(mut req) = this.reqs.pop_front() { + if !req.ready_or_set_waker(cx.waker()) { + this.reqs.push_front(req); + return Poll::Pending; + } + *cnt += req.val(); + } + Poll::Ready(*cnt) + } + } } - fn wait(mut self: Box) -> Self::Output { +} + +#[doc(hidden)] +impl LamellarRequest for LocalIterCountHandle { + fn blocking_wait(mut self) -> Self::Output { self.reqs 
.drain(..) - .map(|req| req.get()) + .map(|req| req.blocking_wait()) + .into_iter() + .sum::() + } + fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { + for req in self.reqs.iter_mut() { + if !req.ready_or_set_waker(waker) { + //only need to wait on the next unready req + return false; + } + } + true + } + fn val(&self) -> Self::Output { + self.reqs + .iter() + .map(|req| req.val()) .into_iter() .sum::() } diff --git a/src/array/iterator/local_iterator/consumer/for_each.rs b/src/array/iterator/local_iterator/consumer/for_each.rs index 479639c0..7abdc16c 100644 --- a/src/array/iterator/local_iterator/consumer/for_each.rs +++ b/src/array/iterator/local_iterator/consumer/for_each.rs @@ -1,14 +1,16 @@ use crate::active_messaging::{LamellarArcLocalAm, SyncSend}; use crate::array::iterator::consumer::*; use crate::array::iterator::local_iterator::LocalIterator; -use crate::array::iterator::{private::*, IterRequest}; +use crate::array::iterator::private::*; use crate::lamellar_request::LamellarRequest; +use crate::lamellar_task_group::TaskGroupLocalAmHandle; use crate::lamellar_team::LamellarTeamRT; -use async_trait::async_trait; -use futures::Future; +use futures_util::Future; +use std::collections::VecDeque; use std::pin::Pin; use std::sync::Arc; +use std::task::{Context, Poll, Waker}; #[derive(Clone, Debug)] pub struct ForEach @@ -41,6 +43,7 @@ where type AmOutput = (); type Output = (); type Item = I::Item; + type Handle = LocalIterForEachHandle; fn init(&self, start: usize, cnt: usize) -> Self { // println!("ForEach before init start {:?} cnt {:?}", start,cnt); let iter = ForEach { @@ -63,9 +66,9 @@ where fn create_handle( self, _team: Pin>, - reqs: Vec>>, - ) -> Box> { - Box::new(LocalIterForEachHandle { reqs }) + reqs: VecDeque>, + ) -> Self::Handle { + LocalIterForEachHandle { reqs } } fn max_elems(&self, in_elems: usize) -> usize { self.iter.elems(in_elems) @@ -107,6 +110,7 @@ where type AmOutput = (); type Output = (); type Item = I::Item; + type Handle = LocalIterForEachHandle; fn init(&self, start: usize, cnt: usize) -> Self { ForEachAsync { iter: self.iter.init(start, cnt), @@ -127,9 +131,9 @@ where fn create_handle( self, _team: Pin>, - reqs: Vec>>, - ) -> Box> { - Box::new(LocalIterForEachHandle { reqs }) + reqs: VecDeque>, + ) -> Self::Handle { + LocalIterForEachHandle { reqs } } fn max_elems(&self, in_elems: usize) -> usize { self.iter.elems(in_elems) @@ -152,21 +156,41 @@ where #[doc(hidden)] pub struct LocalIterForEachHandle { - pub(crate) reqs: Vec>>, + pub(crate) reqs: VecDeque>, } -#[doc(hidden)] -#[async_trait] -impl IterRequest for LocalIterForEachHandle { +impl Future for LocalIterForEachHandle { type Output = (); - async fn into_future(mut self: Box) -> Self::Output { - for req in self.reqs.drain(..) { - req.into_future().await; + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + while let Some(mut req) = self.reqs.pop_front() { + if !req.ready_or_set_waker(cx.waker()) { + self.reqs.push_front(req); + return Poll::Pending; + } } + Poll::Ready(()) } - fn wait(mut self: Box) -> Self::Output { +} + +#[doc(hidden)] +impl LamellarRequest for LocalIterForEachHandle { + fn blocking_wait(mut self) -> Self::Output { for req in self.reqs.drain(..) 
{ - req.get(); + req.blocking_wait(); + } + } + fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { + for req in self.reqs.iter_mut() { + if !req.ready_or_set_waker(waker) { + //only need to wait on the next unready req + return false; + } + } + true + } + fn val(&self) -> Self::Output { + for req in self.reqs.iter() { + req.val(); } } } diff --git a/src/array/iterator/local_iterator/consumer/reduce.rs b/src/array/iterator/local_iterator/consumer/reduce.rs index 51d4d71b..a10a5113 100644 --- a/src/array/iterator/local_iterator/consumer/reduce.rs +++ b/src/array/iterator/local_iterator/consumer/reduce.rs @@ -1,13 +1,17 @@ use crate::active_messaging::{LamellarArcLocalAm, SyncSend}; use crate::array::iterator::consumer::*; use crate::array::iterator::local_iterator::LocalIterator; -use crate::array::iterator::{private::*, IterRequest}; +use crate::array::iterator::private::*; use crate::lamellar_request::LamellarRequest; +use crate::lamellar_task_group::TaskGroupLocalAmHandle; use crate::lamellar_team::LamellarTeamRT; -use async_trait::async_trait; +use futures_util::Future; +use pin_project::pin_project; +use std::collections::VecDeque; use std::pin::Pin; use std::sync::Arc; +use std::task::{Context, Poll, Waker}; #[derive(Clone, Debug)] pub struct Reduce { @@ -33,6 +37,7 @@ where type AmOutput = Option; type Output = Option; type Item = I::Item; + type Handle = LocalIterReduceHandle; fn init(&self, start: usize, cnt: usize) -> Self { Reduce { iter: self.iter.init(start, cnt), @@ -52,78 +57,89 @@ where fn create_handle( self, _team: Pin>, - reqs: Vec>>, - ) -> Box> { - Box::new(LocalIterReduceHandle { op: self.op, reqs }) + reqs: VecDeque>, + ) -> Self::Handle { + LocalIterReduceHandle { + op: self.op, + reqs, + state: State::ReqsPending(None), + } } fn max_elems(&self, in_elems: usize) -> usize { self.iter.elems(in_elems) } } -// #[derive(Clone, Debug)] -// pub struct ReduceAsync { -// pub(crate) iter: I, -// pub(crate) op: F, -// pub(crate) _phantom: PhantomData, -// } - -// impl IterConsumer for ReduceAsync -// where -// I: LocalIterator + 'static, -// I::Item: SyncSend, -// F: Fn(I::Item, I::Item) -> Fut + SyncSend + Clone + 'static, -// Fut: Future + SyncSend + Clone + 'static, -// { -// type AmOutput = Option; -// type Output = Option; -// fn into_am(self, schedule: IterSchedule) -> LamellarArcLocalAm { -// Arc::new(ReduceAsyncAm { -// iter: self.iter, -// op: self.op, -// schedule, -// _phantom: self._phantom, -// }) -// } -// fn create_handle( -// self, -// team: Pin>, -// reqs: Vec>>, -// ) -> Box> { -// Box::new(LocalIterReduceHandle { op: self.op, reqs }) -// } -// fn max_elems(&self, in_elems: usize) -> usize { -// self.iter.elems(in_elems) -// } -// } - #[doc(hidden)] +#[pin_project] pub struct LocalIterReduceHandle { - pub(crate) reqs: Vec>>>, + pub(crate) reqs: VecDeque>>, pub(crate) op: F, + state: State, } -#[doc(hidden)] -#[async_trait] -impl IterRequest for LocalIterReduceHandle +enum State { + ReqsPending(Option), +} + +impl Future for LocalIterReduceHandle where - T: SyncSend, + T: SyncSend + Copy + 'static, F: Fn(T, T) -> T + SyncSend + 'static, { type Output = Option; - async fn into_future(mut self: Box) -> Self::Output { - futures::future::join_all(self.reqs.drain(..).map(|req| req.into_future())) - .await - .into_iter() - .filter_map(|res| res) - .reduce(self.op) + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let mut this = self.project(); + match &mut this.state { + State::ReqsPending(val) => { + while let Some(mut req) = 
this.reqs.pop_front() { + if !req.ready_or_set_waker(cx.waker()) { + this.reqs.push_front(req); + return Poll::Pending; + } + match val { + None => *val = req.val(), + Some(val1) => { + if let Some(val2) = req.val() { + *val = Some((this.op)(*val1, val2)); + } + } + } + } + Poll::Ready(*val) + } + } } - fn wait(mut self: Box) -> Self::Output { +} + +#[doc(hidden)] +impl LamellarRequest for LocalIterReduceHandle +where + T: SyncSend + Copy + 'static, + F: Fn(T, T) -> T + SyncSend + Clone + 'static, +{ + fn blocking_wait(mut self) -> Self::Output { self.reqs .drain(..) - .filter_map(|req| req.get()) + .filter_map(|req| req.blocking_wait()) .reduce(self.op) } + fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { + for req in self.reqs.iter_mut() { + if !req.ready_or_set_waker(waker) { + //only need to wait on the next unready req + return false; + } + } + true + } + + fn val(&self) -> Self::Output { + self.reqs + .iter() + .filter_map(|req| req.val()) + .reduce(self.op.clone()) + } } #[lamellar_impl::AmLocalDataRT(Clone)] diff --git a/src/array/iterator/local_iterator/consumer/sum.rs b/src/array/iterator/local_iterator/consumer/sum.rs index 076d99bf..4908e2db 100644 --- a/src/array/iterator/local_iterator/consumer/sum.rs +++ b/src/array/iterator/local_iterator/consumer/sum.rs @@ -1,13 +1,17 @@ use crate::active_messaging::{LamellarArcLocalAm, SyncSend}; use crate::array::iterator::consumer::*; use crate::array::iterator::local_iterator::LocalIterator; -use crate::array::iterator::{private::*, IterRequest}; +use crate::array::iterator::private::*; use crate::lamellar_request::LamellarRequest; +use crate::lamellar_task_group::TaskGroupLocalAmHandle; use crate::lamellar_team::LamellarTeamRT; -use async_trait::async_trait; +use futures_util::Future; +use pin_project::pin_project; +use std::collections::VecDeque; use std::pin::Pin; use std::sync::Arc; +use std::task::{Context, Poll, Waker}; #[derive(Clone, Debug)] pub(crate) struct Sum { @@ -30,6 +34,7 @@ where type AmOutput = I::Item; type Output = I::Item; type Item = I::Item; + type Handle = LocalIterSumHandle; fn init(&self, start: usize, cnt: usize) -> Self { Sum { iter: self.iter.init(start, cnt), @@ -47,9 +52,12 @@ where fn create_handle( self, _team: Pin>, - reqs: Vec>>, - ) -> Box> { - Box::new(LocalIterSumHandle { reqs }) + reqs: VecDeque>, + ) -> Self::Handle { + LocalIterSumHandle { + reqs, + state: State::ReqsPending(None), + } } fn max_elems(&self, in_elems: usize) -> usize { self.iter.elems(in_elems) @@ -57,29 +65,71 @@ where } #[doc(hidden)] +#[pin_project] pub struct LocalIterSumHandle { - pub(crate) reqs: Vec>>, + pub(crate) reqs: VecDeque>, + state: State, } -#[doc(hidden)] -#[async_trait] -impl IterRequest for LocalIterSumHandle +enum State { + ReqsPending(Option), +} + +impl Future for LocalIterSumHandle where - T: SyncSend + std::iter::Sum, + T: SyncSend + std::iter::Sum + for<'a> std::iter::Sum<&'a T> + 'static, { type Output = T; - async fn into_future(mut self: Box) -> Self::Output { - futures::future::join_all(self.reqs.drain(..).map(|req| req.into_future())) - .await - .into_iter() - .sum::() + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let mut this = self.project(); + match &mut this.state { + State::ReqsPending(local_sum) => { + while let Some(mut req) = this.reqs.pop_front() { + if !req.ready_or_set_waker(cx.waker()) { + this.reqs.push_front(req); + return Poll::Pending; + } + match local_sum { + Some(sum) => { + *sum = [sum, &req.val()].into_iter().sum::(); + } + None => { + *local_sum = 
Some(req.val()); + } + } + } + + Poll::Ready(local_sum.take().expect("Value should be Present")) + } + } } - fn wait(mut self: Box) -> Self::Output { +} + +#[doc(hidden)] +impl LamellarRequest for LocalIterSumHandle +where + T: SyncSend + std::iter::Sum + for<'a> std::iter::Sum<&'a T> + 'static, +{ + fn blocking_wait(mut self) -> Self::Output { self.reqs .drain(..) - .map(|req| req.get()) + .map(|req| req.blocking_wait()) .sum::() } + + fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { + for req in self.reqs.iter_mut() { + if !req.ready_or_set_waker(waker) { + //only need to wait on the next unready req + return false; + } + } + true + } + + fn val(&self) -> Self::Output { + self.reqs.iter().map(|req| req.val()).sum::() + } } #[lamellar_impl::AmLocalDataRT(Clone)] diff --git a/src/array/iterator/mod.rs b/src/array/iterator/mod.rs index a172db5c..9ee9a4b2 100644 --- a/src/array/iterator/mod.rs +++ b/src/array/iterator/mod.rs @@ -9,15 +9,13 @@ pub mod consumer; use crate::memregion::Dist; -use async_trait::async_trait; - -#[doc(hidden)] -#[async_trait] -pub trait IterRequest { - type Output; - async fn into_future(mut self: Box) -> Self::Output; - fn wait(self: Box) -> Self::Output; -} +// #[doc(hidden)] +// #[async_trait] +// pub trait IterRequest { +// type Output; +// async fn into_future(mut self: Box) -> Self::Output; +// fn wait(self: Box) -> Self::Output; +// } pub(crate) mod private { pub struct Sealed; diff --git a/src/array/iterator/one_sided_iterator.rs b/src/array/iterator/one_sided_iterator.rs index 7265bcd7..bb43c1cf 100644 --- a/src/array/iterator/one_sided_iterator.rs +++ b/src/array/iterator/one_sided_iterator.rs @@ -25,14 +25,15 @@ use zip::*; // mod buffered; // use buffered::*; -use crate::array::{LamellarArray, LamellarArrayInternalGet, LamellarArrayRequest}; +use crate::array::{ArrayRdmaHandle, LamellarArray, LamellarArrayInternalGet}; +use crate::lamellar_request::LamellarRequest; use crate::memregion::{Dist, OneSidedMemoryRegion, RegisteredMemoryRegion, SubRegion}; use crate::LamellarTeamRT; // use async_trait::async_trait; -// use futures::{ready, Stream}; -use futures::Stream; +// use futures_util::{ready, Stream}; +use futures_util::Stream; use pin_project::pin_project; use std::marker::PhantomData; use std::pin::Pin; @@ -273,6 +274,7 @@ pub trait OneSidedIterator: private::OneSidedIteratorInner { std::backtrace::Backtrace::capture() ) } + // println!("Into Iter"); self.init(); OneSidedIteratorIter { iter: self } } @@ -281,6 +283,7 @@ pub trait OneSidedIterator: private::OneSidedIteratorInner { where Self: Sized + Send, { + // println!("Into Stream"); self.init(); OneSidedStream { iter: self } } @@ -383,7 +386,7 @@ pub struct OneSidedIter<'a, T: Dist + 'static, A: LamellarArrayInternalGet> { pub(crate) enum State { // Ready, - Pending(Box>), + Pending(ArrayRdmaHandle), Buffered, Finished, } @@ -432,60 +435,29 @@ impl<'a, T: Dist + 'static, A: LamellarArrayInternalGet + Clone + Send> type Array = A; fn init(&mut self) { + // println!( + // "Iter init: index: {:?} buf_len {:?} array_len {:?}", + // self.index, + // self.buf_0.len(), + // self.array.len() + // ); let req = unsafe { self.array.internal_get(self.index, &self.buf_0) }; self.state = State::Pending(req); } - // fn next(&mut self) -> Option { - // let mut cur_state = State::Finished; - // std::mem::swap(&mut self.state, &mut cur_state); - // match cur_state { - // State::Pending(req) => { - // req.wait(); //need to wait here because we use the same underlying buffer - // if self.index + 1 < 
self.array.len() { - // // still have remaining elements - // self.index += 1; - // let buf_index = self.buf_index as isize; - // self.buf_index += 1; - // if self.buf_index == self.buf_0.len() { - // //prefetch the next data - // self.buf_index = 0; - // if self.index + self.buf_0.len() < self.array.len() { - // // potentially unsafe depending on the array type (i.e. UnsafeArray - which requries unsafe to construct an iterator), - // // but safe with respect to the buf_0 as we have consumed all its content and self is the only reference - // let req = unsafe { self.array.internal_get(self.index, &self.buf_0) }; - // self.state = State::Pending(req); - // } else if self.index < self.array.len() { - // let sub_region = - // self.buf_0.sub_region(0..(self.array.len() - self.index)); - // // potentially unsafe depending on the array type (i.e. UnsafeArray - which requries unsafe to construct an iterator), - // // but safe with respect to the buf_0 as we have consumed all its content and self is the only reference - // // sub_region is set to the remaining size of the array so we will not have an out of bounds issue - // let req = unsafe { self.array.internal_get(self.index, sub_region) }; - // self.state = State::Pending(req); - // } else { - // self.state = State::Finished; - // } - // } - // } else { - // self.state = State::Finished; - // }; - // unsafe { self.ptr.0.as_ptr().offset(buf_index).as_ref() } //this is an option - // } - // State::Buffered => { - // self.state = State::Finished; - // unsafe { self.ptr.0.as_ptr().offset(self.buf_index as isize).as_ref() } - // } - // State::Finished => None, - // } - // } - fn next(&mut self) -> Option { let mut cur_state = State::Finished; std::mem::swap(&mut self.state, &mut cur_state); match cur_state { State::Pending(req) => { - req.wait(); + req.blocking_wait(); + // println!( + // "req ready pending->buffered: index {} buf_index {} array_len {} buf_0_len {}", + // self.index, + // self.buf_index, + // self.array.len(), + // self.buf_0.len() + // ); self.state = State::Buffered; self.index += 1; self.buf_index += 1; @@ -498,28 +470,51 @@ impl<'a, T: Dist + 'static, A: LamellarArrayInternalGet + Clone + Send> } } State::Buffered => { + // println!( + // "req ready buffered: index {} buf_index {} array_len {} buf_0_len {}", + // self.index, + // self.buf_index, + // self.array.len(), + // self.buf_0.len() + // ); //once here the we never go back to pending if self.index < self.array.len() { + self.state = State::Buffered; if self.buf_index == self.buf_0.len() { //need to get new data self.buf_index = 0; if self.index + self.buf_0.len() < self.array.len() { + // println!( + // "full buffering more elements from array: index {} len {}", + // self.index, + // self.buf_0.len() + // ); // potentially unsafe depending on the array type (i.e. UnsafeArray - which requries unsafe to construct an iterator), // but safe with respect to the buf_0 as we have consumed all its content and this is the only reference unsafe { - self.array.internal_get(self.index, &self.buf_0).wait(); + self.array + .internal_get(self.index, &self.buf_0) + .blocking_wait(); } } else { let sub_region = self.buf_0.sub_region(0..(self.array.len() - self.index)); + // println!( + // "partial buffering more elements from array: index {} len {}", + // self.index, + // sub_region.len() + // ); // potentially unsafe depending on the array type (i.e. 
UnsafeArray - which requries unsafe to construct an iterator), // but safe with respect to the buf_0 as we have consumed all its content and this is the only reference // sub_region is set to the remaining size of the array so we will not have an out of bounds issue unsafe { - self.array.internal_get(self.index, sub_region).wait(); + self.array + .internal_get(self.index, sub_region) + .blocking_wait(); } } } + self.index += 1; self.buf_index += 1; unsafe { @@ -530,11 +525,25 @@ impl<'a, T: Dist + 'static, A: LamellarArrayInternalGet + Clone + Send> .as_ref() } } else { + // println!( + // "finished1: index {} buf_index {} array_len {} ", + // self.index, + // self.buf_index, + // self.array.len() + // ); self.state = State::Finished; None } } - State::Finished => None, + State::Finished => { + // println!( + // "finished2: index {} buf_index {} array_len {} ", + // self.index, + // self.buf_index, + // self.array.len() + // ); + None + } } } @@ -543,11 +552,14 @@ impl<'a, T: Dist + 'static, A: LamellarArrayInternalGet + Clone + Send> std::mem::swap(&mut self.state, &mut cur_state); let res = match cur_state { State::Pending(mut req) => { - if !req.ready() { - req.set_waker(cx.waker().clone()); + if !req.ready_or_set_waker(cx.waker()) { self.state = State::Pending(req); return Poll::Pending; } else { + // println!( + // "req ready pending->buffered: index {} buf_index {} array_len {} buf_0_len {}", + // self.index, self.buf_index, self.array.len(), self.buf_0.len() + // ); self.state = State::Buffered; self.index += 1; self.buf_index += 1; @@ -561,27 +573,46 @@ impl<'a, T: Dist + 'static, A: LamellarArrayInternalGet + Clone + Send> } } State::Buffered => { + // println!( + // "req ready buffered: index {} buf_index {} array_len {} buf_0_len {}", + // self.index, + // self.buf_index, + // self.array.len(), + // self.buf_0.len() + // ); if self.index < self.array.len() { + self.state = State::Buffered; if self.buf_index == self.buf_0.len() { //need to get new data self.buf_index = 0; let mut req = if self.index + self.buf_0.len() < self.array.len() { + // println!( + // "full buffering more elements from array: index {} len {}", + // self.index, + // self.buf_0.len() + // ); // potentially unsafe depending on the array type (i.e. UnsafeArray - which requries unsafe to construct an iterator), // but safe with respect to the buf_0 as we have consumed all its content and this is the only reference unsafe { self.array.internal_get(self.index, &self.buf_0) } } else { let sub_region = self.buf_0.sub_region(0..(self.array.len() - self.index)); + // println!( + // "partial buffering more elements from array: index {} len {}", + // self.index, + // sub_region.len() + // ); // potentially unsafe depending on the array type (i.e. 
UnsafeArray - which requries unsafe to construct an iterator), // but safe with respect to the buf_0 as we have consumed all its content and this is the only reference // sub_region is set to the remaining size of the array so we will not have an out of bounds issue unsafe { self.array.internal_get(self.index, sub_region) } }; - req.set_waker(cx.waker().clone()); + req.ready_or_set_waker(cx.waker()); self.state = State::Pending(req); return Poll::Pending; } + self.index += 1; self.buf_index += 1; unsafe { @@ -592,11 +623,21 @@ impl<'a, T: Dist + 'static, A: LamellarArrayInternalGet + Clone + Send> .as_ref() } } else { + // println!( + // "finished: index {} buf_index {}", + // self.index, self.buf_index + // ); self.state = State::Finished; None } } - State::Finished => None, + State::Finished => { + // println!( + // "finished: index {} buf_index {}", + // self.index, self.buf_index + // ); + None + } }; Poll::Ready(res) } @@ -607,7 +648,7 @@ impl<'a, T: Dist + 'static, A: LamellarArrayInternalGet + Clone + Send> // let res = match cur_state { // State::Pending(mut req) => { // if !req.ready() { - // req.set_waker(cx.waker().clone()); + // req.set_waker(cx.waker()); // self.state = State::Pending(req); // return Poll::Pending; // } @@ -709,7 +750,7 @@ impl<'a, T: Dist + 'static, A: LamellarArrayInternalGet + Clone + Send> // fn buffered_next( // &mut self, // mem_region: OneSidedMemoryRegion, - // ) -> Option>> { + // ) -> Option { // if self.index < self.array.len() { // let mem_reg_t = unsafe { mem_region.to_base::() }; // let req = self.array.internal_get(self.index, &mem_reg_t); diff --git a/src/array/iterator/one_sided_iterator/buffered.rs b/src/array/iterator/one_sided_iterator/buffered.rs index 0a702176..8aa0e8c4 100644 --- a/src/array/iterator/one_sided_iterator/buffered.rs +++ b/src/array/iterator/one_sided_iterator/buffered.rs @@ -5,7 +5,7 @@ use std::collections::VecDeque; use std::ops::Deref; use async_trait::async_trait; -// use futures::Future; +// use futures_util::Future; use pin_project::pin_project; #[pin_project] pub struct Buffered @@ -17,13 +17,7 @@ where index: usize, buf_index: usize, buf_size: usize, - reqs: VecDeque< - Option<( - usize, - Box>, - OneSidedMemoryRegion, - )>, - >, + reqs: VecDeque)>>, state: BufferedState, } @@ -139,10 +133,7 @@ where self.iter.item_size() } //im not actually sure what to do if another buffered iter is called after this one - fn buffered_next( - &mut self, - mem_region: OneSidedMemoryRegion, - ) -> Option>> { + fn buffered_next(&mut self, mem_region: OneSidedMemoryRegion) -> Option { self.iter.buffered_next(mem_region) } @@ -165,8 +156,8 @@ where // } // } -// use futures::task::{Context, Poll}; -// use futures::Stream; +// use futures_util::task::{Context, Poll}; +// use futures_util::Stream; // use std::pin::Pin; // impl Stream for Buffered diff --git a/src/array/iterator/one_sided_iterator/chunks.rs b/src/array/iterator/one_sided_iterator/chunks.rs index 5ae011e8..0dac6bb2 100644 --- a/src/array/iterator/one_sided_iterator/chunks.rs +++ b/src/array/iterator/one_sided_iterator/chunks.rs @@ -1,12 +1,14 @@ use crate::array::iterator::one_sided_iterator::{private::*, *}; +use crate::array::ArrayRdmaHandle; +use crate::lamellar_request::LamellarRequest; // use crate::array::LamellarArrayRequest; // use crate::LamellarArray; use crate::memregion::OneSidedMemoryRegion; use pin_project::pin_project; // use async_trait::async_trait; -// use futures::Future; +// use futures_util::Future; #[pin_project] pub struct Chunks where @@ -20,10 
+22,7 @@ where } enum ChunkState { - Pending( - OneSidedMemoryRegion, - Box>, - ), + Pending(OneSidedMemoryRegion, ArrayRdmaHandle), Finished, } @@ -48,10 +47,7 @@ where array: ::Array, index: usize, size: usize, - ) -> ( - OneSidedMemoryRegion, - Box>, - ) { + ) -> (OneSidedMemoryRegion, ArrayRdmaHandle) { // println!(" get chunk of len: {:?}", size); let mem_region: OneSidedMemoryRegion = array.team_rt().alloc_one_sided_mem_region(size); @@ -76,6 +72,7 @@ where let array = self.array(); let size = std::cmp::min(self.chunk_size, array.len() - self.index); let (new_mem_region, new_req) = Self::get_buffer(array, self.index, size); + self.index += size; self.state = ChunkState::Pending(new_mem_region, new_req); } fn next(&mut self) -> Option { @@ -84,16 +81,18 @@ where std::mem::swap(&mut self.state, &mut cur_state); match cur_state { ChunkState::Pending(mem_region, req) => { - if self.index + 1 < array.len() { + // println!("next: index: {:?}", self.index); + if self.index < array.len() { //prefetch let size = std::cmp::min(self.chunk_size, array.len() - self.index); - self.index += size; + // println!("prefectching: index: {:?} {:?}", self.index, size); let (new_mem_region, new_req) = Self::get_buffer(array, self.index, size); + self.index += size; self.state = ChunkState::Pending(new_mem_region, new_req); } else { self.state = ChunkState::Finished; } - req.wait(); + req.blocking_wait(); Some(mem_region) } ChunkState::Finished => None, @@ -109,19 +108,18 @@ where match cur_state { ChunkState::Pending(mem_region, mut req) => { - if !req.ready() { - req.set_waker(cx.waker().clone()); + if !req.ready_or_set_waker(cx.waker()) { *this.state = ChunkState::Pending(mem_region, req); - - // println!("not ready"); return Poll::Pending; } - if *this.index + 1 < array.len() { + // println!("next: index: {:?}", this.index); + if *this.index < array.len() { // println!("got chunk! 
{:?}", *this.index); //prefetch let size = std::cmp::min(*this.chunk_size, array.len() - *this.index); - *this.index += size; + // println!("prefectching: index: {:?} {:?}", this.index, size); let (new_mem_region, new_req) = Self::get_buffer(array, *this.index, size); + *this.index += size; *this.state = ChunkState::Pending(new_mem_region, new_req); } else { // println!("finished chunks!"); @@ -164,7 +162,7 @@ where // fn buffered_next( // &mut self, // mem_region: OneSidedMemoryRegion, - // ) -> Option>> { + // ) -> Option { // let array = self.array(); // if self.index < array.len() { // let mem_reg_t = unsafe { mem_region.to_base::() }; @@ -191,8 +189,8 @@ where // } // } -// use futures::task::{Context, Poll}; -// use futures::Stream; +// use futures_util::task::{Context, Poll}; +// use futures_util::Stream; // use std::pin::Pin; // impl Stream for Chunks diff --git a/src/array/iterator/one_sided_iterator/skip.rs b/src/array/iterator/one_sided_iterator/skip.rs index 735dd903..8d480067 100644 --- a/src/array/iterator/one_sided_iterator/skip.rs +++ b/src/array/iterator/one_sided_iterator/skip.rs @@ -62,7 +62,7 @@ where // fn buffered_next( // &mut self, // mem_region: OneSidedMemoryRegion, - // ) -> Option>> { + // ) -> Option { // self.iter.buffered_next(mem_region) // } // fn from_mem_region(&self, mem_region: OneSidedMemoryRegion) -> Option { diff --git a/src/array/iterator/one_sided_iterator/step_by.rs b/src/array/iterator/one_sided_iterator/step_by.rs index fc616fb3..14947053 100644 --- a/src/array/iterator/one_sided_iterator/step_by.rs +++ b/src/array/iterator/one_sided_iterator/step_by.rs @@ -72,7 +72,7 @@ where // fn buffered_next( // &mut self, // mem_region: OneSidedMemoryRegion, - // ) -> Option>> { + // ) -> Option { // let res = self.iter.buffered_next(mem_region)?; // self.iter.advance_index(self.step_size - 1); // Some(res) diff --git a/src/array/iterator/one_sided_iterator/zip.rs b/src/array/iterator/one_sided_iterator/zip.rs index 014f2513..45022cb6 100644 --- a/src/array/iterator/one_sided_iterator/zip.rs +++ b/src/array/iterator/one_sided_iterator/zip.rs @@ -6,14 +6,14 @@ use crate::array::iterator::one_sided_iterator::{private::*, *}; use pin_project::pin_project; // struct ZipBufferedReq { -// reqs: Vec>>, +// reqs: Vec, // } // impl LamellarArrayRequest for ZipBufferedReq { // type Output = (); // async fn into_future(mut self: Box) -> Self::Output { // for req in self.reqs.drain(0..) 
{ -// req.into_future().await; +// req.await; // } // () // } @@ -176,7 +176,7 @@ where // fn buffered_next( // &mut self, // mem_region: OneSidedMemoryRegion, - // ) -> Option>> { + // ) -> Option { // let a_sub_region = mem_region.sub_region(0..self.a.item_size()); // let mut reqs = vec![]; // reqs.push(self.a.buffered_next(a_sub_region)?); diff --git a/src/array/local_lock_atomic.rs b/src/array/local_lock_atomic.rs index d7ba403d..a7e20d33 100644 --- a/src/array/local_lock_atomic.rs +++ b/src/array/local_lock_atomic.rs @@ -7,6 +7,7 @@ use crate::array::r#unsafe::{UnsafeByteArray, UnsafeByteArrayWeak}; use crate::array::*; use crate::darc::local_rw_darc::LocalRwDarc; use crate::darc::DarcMode; +use crate::lamellar_request::LamellarRequest; use crate::lamellar_team::{IntoLamellarTeam, LamellarTeamRT}; use crate::memregion::Dist; // use parking_lot::{ @@ -14,7 +15,10 @@ use crate::memregion::Dist; // RawRwLock, // }; use async_lock::{RwLockReadGuardArc, RwLockWriteGuardArc}; +use pin_project::pin_project; + use std::ops::{Deref, DerefMut}; +use std::task::{Context, Poll, Waker}; /// A safe abstraction of a distributed array, providing read/write access protected by locks. /// @@ -76,7 +80,9 @@ impl LocalLockByteArrayWeak { #[derive(Debug)] pub struct LocalLockMutLocalData { array: LocalLockArray, - _lock_guard: RwLockWriteGuardArc<()>, + start_index: usize, + end_index: usize, + lock_guard: RwLockWriteGuardArc<()>, } // impl Drop for LocalLockMutLocalData { @@ -88,12 +94,12 @@ pub struct LocalLockMutLocalData { impl Deref for LocalLockMutLocalData { type Target = [T]; fn deref(&self) -> &Self::Target { - unsafe { self.array.array.local_as_mut_slice() } + unsafe { &self.array.array.local_as_mut_slice()[self.start_index..self.end_index] } } } impl DerefMut for LocalLockMutLocalData { fn deref_mut(&mut self) -> &mut Self::Target { - unsafe { self.array.array.local_as_mut_slice() } + unsafe { &mut self.array.array.local_as_mut_slice()[self.start_index..self.end_index] } } } @@ -108,7 +114,9 @@ impl DerefMut for LocalLockMutLocalData { #[derive(Debug)] pub struct LocalLockLocalData { pub(crate) array: LocalLockArray, - lock: LocalRwDarc<()>, + // lock: LocalRwDarc<()>, + start_index: usize, + end_index: usize, lock_guard: Arc>, } @@ -117,7 +125,9 @@ impl<'a, T: Dist> Clone for LocalLockLocalData { // println!("getting read lock in LocalLockLocalData clone"); LocalLockLocalData { array: self.array.clone(), - lock: self.lock.clone(), + start_index: self.start_index, + end_index: self.end_index, + // lock: self.lock.clone(), lock_guard: self.lock_guard.clone(), } } @@ -155,9 +165,12 @@ impl<'a, T: Dist> LocalLockLocalData { /// assert_eq!(local_data[10],sub_data[0]); ///``` pub fn into_sub_data(self, start: usize, end: usize) -> LocalLockLocalData { + // println!("into sub data {:?} {:?}", start, end); LocalLockLocalData { - array: self.array.sub_array(start..end), - lock: self.lock.clone(), + array: self.array.clone(), + start_index: start, + end_index: end, + // lock: self.lock.clone(), lock_guard: self.lock_guard.clone(), } } @@ -168,7 +181,8 @@ impl<'a, T: Dist + serde::Serialize> serde::Serialize for LocalLockLocalData where S: serde::Serializer, { - unsafe { self.array.array.local_as_mut_slice() }.serialize(serializer) + unsafe { &self.array.array.local_as_mut_slice()[self.start_index..self.end_index] } + .serialize(serializer) } } @@ -194,7 +208,9 @@ impl<'a, T: Dist> IntoIterator for &'a LocalLockLocalData { type IntoIter = LocalLockLocalDataIter<'a, T>; fn into_iter(self) -> 
Self::IntoIter { LocalLockLocalDataIter { - data: unsafe { self.array.array.local_as_mut_slice() }, + data: unsafe { + &self.array.array.local_as_mut_slice()[self.start_index..self.end_index] + }, index: 0, } } @@ -204,7 +220,50 @@ impl Deref for LocalLockLocalData { type Target = [T]; fn deref(&self) -> &Self::Target { - unsafe { self.array.array.local_as_mut_slice() } + unsafe { &self.array.array.local_as_mut_slice()[self.start_index..self.end_index] } + } +} + +#[derive(Clone)] +pub struct LocalLockReadGuard { + pub(crate) array: LocalLockArray, + lock_guard: Arc>, +} + +impl LocalLockReadGuard { + pub fn local_data(&self) -> LocalLockLocalData { + LocalLockLocalData { + array: self.array.clone(), + start_index: 0, + end_index: self.array.num_elems_local(), + // lock: self.lock.clone(), + lock_guard: self.lock_guard.clone(), + } + } +} + +pub struct LocalLockWriteGuard { + pub(crate) array: LocalLockArray, + lock_guard: RwLockWriteGuardArc<()>, +} + +impl From> for LocalLockWriteGuard { + fn from(data: LocalLockMutLocalData) -> Self { + LocalLockWriteGuard { + array: data.array, + lock_guard: data.lock_guard, + } + } +} + +impl LocalLockWriteGuard { + pub fn local_data(self) -> LocalLockMutLocalData { + LocalLockMutLocalData { + array: self.array.clone(), + start_index: 0, + end_index: self.array.num_elems_local(), + lock_guard: self.lock_guard, + } } } @@ -260,6 +319,40 @@ impl LocalLockArray { } } + pub fn blocking_read_lock(&self) -> LocalLockReadGuard { + let self_clone: LocalLockArray = self.clone(); + self.block_on(async move { + LocalLockReadGuard { + array: self_clone.clone(), + lock_guard: Arc::new(self_clone.lock.read().await), + } + }) + } + + pub async fn read_lock(&self) -> LocalLockReadGuard { + LocalLockReadGuard { + array: self.clone(), + lock_guard: Arc::new(self.lock.read().await), + } + } + + pub fn blocking_write_lock(&self) -> LocalLockWriteGuard { + let self_clone: LocalLockArray = self.clone(); + self.block_on(async move { + LocalLockWriteGuard { + array: self_clone.clone(), + lock_guard: self_clone.lock.write().await, + } + }) + } + + pub async fn write_lock(&self) -> LocalLockWriteGuard { + LocalLockWriteGuard { + array: self.clone(), + lock_guard: self.lock.write().await, + } + } + #[doc(alias("One-sided", "onesided"))] /// Return the calling PE's local data as a [LocalLockLocalData], which allows safe immutable access to local elements. /// @@ -284,7 +377,9 @@ impl LocalLockArray { self.block_on(async move { LocalLockLocalData { array: self_clone.clone(), - lock: self_clone.lock.clone(), + // lock: self_clone.lock.clone(), + start_index: 0, + end_index: self_clone.num_elems_local(), lock_guard: Arc::new(self_clone.lock.read().await), } }) @@ -314,7 +409,9 @@ impl LocalLockArray { // println!("getting read lock in read_local_local"); LocalLockLocalData { array: self.clone(), - lock: self.lock.clone(), + // lock: self.lock.clone(), + start_index: 0, + end_index: self.num_elems_local(), lock_guard: Arc::new(self.lock.read().await), } } @@ -343,8 +440,10 @@ impl LocalLockArray { self.block_on(async move { let lock = self_clone.lock.write().await; let data = LocalLockMutLocalData { - array: self_clone, - _lock_guard: lock, + array: self_clone.clone(), + start_index: 0, + end_index: self_clone.num_elems_local(), + lock_guard: lock, }; // println!("got lock! 
{:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); data @@ -377,7 +476,9 @@ impl LocalLockArray { let lock = self.lock.write().await; let data = LocalLockMutLocalData { array: self.clone(), - _lock_guard: lock, + start_index: 0, + end_index: self.num_elems_local(), + lock_guard: lock, }; // println!("got lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); data @@ -588,7 +689,7 @@ impl TeamFrom<(Vec, Distribution)> for LocalLockArray } } -#[async_trait] +// #[async_trait] impl AsyncTeamFrom<(Vec, Distribution)> for LocalLockArray { async fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { let array: UnsafeArray = AsyncTeamInto::team_into(input, team).await; @@ -824,56 +925,56 @@ impl ArrayPrint for LocalLockArray { } #[doc(hidden)] +#[pin_project] pub struct LocalLockArrayReduceHandle { - req: Box>, - _lock_guard: RwLockReadGuardArc<()>, + req: AmHandle, + lock_guard: Arc>, } -#[async_trait] impl LamellarRequest for LocalLockArrayReduceHandle { - type Output = T; - async fn into_future(mut self: Box) -> Self::Output { - self.req.into_future().await + fn blocking_wait(self) -> Self::Output { + self.req.blocking_wait() } - fn get(&self) -> Self::Output { - self.req.get() + fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { + self.req.ready_or_set_waker(waker) } - fn ready(&self) -> bool { - self.req.ready() + fn val(&self) -> Self::Output { + self.req.val() } - fn set_waker(&mut self, waker: futures::task::Waker) { - self.req.set_waker(waker) +} + +impl Future for LocalLockArrayReduceHandle { + type Output = T; + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let this = self.project(); + match this.req.ready_or_set_waker(cx.waker()) { + true => Poll::Ready(this.req.val()), + false => Poll::Pending, + } } } -impl LamellarArrayReduce for LocalLockArray { - fn reduce(&self, op: &str) -> Pin + Send>> { - let lock: LocalRwDarc<()> = self.lock.clone(); - let lock = self.array.block_on(async move { lock.read().await }); - Box::new(LocalLockArrayReduceHandle { - req: self.array.reduce_data(op, self.clone().into()), - _lock_guard: lock, - }) - .into_future() +impl LocalLockReadGuard { + pub fn reduce(self, op: &str) -> LocalLockArrayReduceHandle { + LocalLockArrayReduceHandle { + req: self.array.array.reduce_data(op, self.array.clone().into()), + lock_guard: self.lock_guard.clone(), + } } } -impl LamellarArrayArithmeticReduce - for LocalLockArray -{ - fn sum(&self) -> Pin + Send>> { +impl LocalLockReadGuard { + pub fn sum(self) -> LocalLockArrayReduceHandle { self.reduce("sum") } - fn prod(&self) -> Pin + Send>> { + pub fn prod(self) -> LocalLockArrayReduceHandle { self.reduce("prod") } } -impl LamellarArrayCompareReduce - for LocalLockArray -{ - fn max(&self) -> Pin + Send>> { +impl LocalLockReadGuard { + pub fn max(self) -> LocalLockArrayReduceHandle { self.reduce("max") } - fn min(&self) -> Pin + Send>> { + pub fn min(self) -> LocalLockArrayReduceHandle { self.reduce("min") } } diff --git a/src/array/local_lock_atomic/iteration.rs b/src/array/local_lock_atomic/iteration.rs index ae792da6..8cf460e9 100644 --- a/src/array/local_lock_atomic/iteration.rs +++ b/src/array/local_lock_atomic/iteration.rs @@ -1,9 +1,5 @@ -use crate::array::iterator::distributed_iterator::{ - DistIteratorLauncher, DistributedIterator, IndexedDistributedIterator, -}; -use crate::array::iterator::local_iterator::{ - IndexedLocalIterator, LocalIterator, LocalIteratorLauncher, 
-}; +use crate::array::iterator::distributed_iterator::*; +use crate::array::iterator::local_iterator::*; use crate::array::iterator::one_sided_iterator::OneSidedIter; use crate::array::iterator::{ private::*, LamellarArrayIterators, LamellarArrayMutIterators, Schedule, @@ -437,7 +433,7 @@ impl DistIteratorLauncher for LocalLockArray { // self.array.subarray_pe_and_offset_for_global_index(index, chunk_size) // } - fn for_each(&self, iter: &I, op: F) -> Pin + Send>> + fn for_each(&self, iter: &I, op: F) -> DistIterForEachHandle where I: DistributedIterator + 'static, F: Fn(I::Item) + SyncSend + Clone + 'static, @@ -449,14 +445,14 @@ impl DistIteratorLauncher for LocalLockArray { sched: Schedule, iter: &I, op: F, - ) -> Pin + Send>> + ) -> DistIterForEachHandle where I: DistributedIterator + 'static, F: Fn(I::Item) + SyncSend + Clone + 'static, { DistIteratorLauncher::for_each_with_schedule(&self.array, sched, iter, op) } - fn for_each_async(&self, iter: &I, op: F) -> Pin + Send>> + fn for_each_async(&self, iter: &I, op: F) -> DistIterForEachHandle where I: DistributedIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, @@ -469,7 +465,7 @@ impl DistIteratorLauncher for LocalLockArray { sched: Schedule, iter: &I, op: F, - ) -> Pin + Send>> + ) -> DistIterForEachHandle where I: DistributedIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, @@ -478,7 +474,7 @@ impl DistIteratorLauncher for LocalLockArray { DistIteratorLauncher::for_each_async_with_schedule(&self.array, sched, iter, op) } - fn reduce(&self, iter: &I, op: F) -> Pin> + Send>> + fn reduce(&self, iter: &I, op: F) -> DistIterReduceHandle where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, @@ -492,7 +488,7 @@ impl DistIteratorLauncher for LocalLockArray { sched: Schedule, iter: &I, op: F, - ) -> Pin> + Send>> + ) -> DistIterReduceHandle where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, @@ -501,7 +497,7 @@ impl DistIteratorLauncher for LocalLockArray { DistIteratorLauncher::reduce_with_schedule(&self.array, sched, iter, op) } - fn collect(&self, iter: &I, d: Distribution) -> Pin + Send>> + fn collect(&self, iter: &I, d: Distribution) -> DistIterCollectHandle where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, @@ -515,7 +511,7 @@ impl DistIteratorLauncher for LocalLockArray { sched: Schedule, iter: &I, d: Distribution, - ) -> Pin + Send>> + ) -> DistIterCollectHandle where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, @@ -523,11 +519,7 @@ impl DistIteratorLauncher for LocalLockArray { { DistIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) } - fn collect_async( - &self, - iter: &I, - d: Distribution, - ) -> Pin + Send>> + fn collect_async(&self, iter: &I, d: Distribution) -> DistIterCollectHandle where I: DistributedIterator, I::Item: Future + Send + 'static, @@ -542,7 +534,7 @@ impl DistIteratorLauncher for LocalLockArray { sched: Schedule, iter: &I, d: Distribution, - ) -> Pin + Send>> + ) -> DistIterCollectHandle where I: DistributedIterator, I::Item: Future + Send + 'static, @@ -552,25 +544,21 @@ impl DistIteratorLauncher for LocalLockArray { DistIteratorLauncher::collect_async_with_schedule(&self.array, sched, iter, d) } - fn count(&self, iter: &I) -> Pin + Send>> + fn count(&self, iter: &I) -> DistIterCountHandle where I: DistributedIterator + 'static, { DistIteratorLauncher::count(&self.array, iter) } - fn count_with_schedule( - &self, - sched: Schedule, - iter: &I, - ) -> Pin + Send>> + fn 
count_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterCountHandle where I: DistributedIterator + 'static, { DistIteratorLauncher::count_with_schedule(&self.array, sched, iter) } - fn sum(&self, iter: &I) -> Pin + Send>> + fn sum(&self, iter: &I) -> DistIterSumHandle where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps + std::iter::Sum, @@ -578,11 +566,7 @@ impl DistIteratorLauncher for LocalLockArray { DistIteratorLauncher::sum(&self.array, iter) } - fn sum_with_schedule( - &self, - sched: Schedule, - iter: &I, - ) -> Pin + Send>> + fn sum_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterSumHandle where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps + std::iter::Sum, @@ -605,7 +589,7 @@ impl LocalIteratorLauncher for LocalLockArray { .local_subarray_index_from_local(index, chunk_size) } - fn for_each(&self, iter: &I, op: F) -> Pin + Send>> + fn for_each(&self, iter: &I, op: F) -> LocalIterForEachHandle where I: LocalIterator + 'static, F: Fn(I::Item) + SyncSend + Clone + 'static, @@ -617,14 +601,14 @@ impl LocalIteratorLauncher for LocalLockArray { sched: Schedule, iter: &I, op: F, - ) -> Pin + Send>> + ) -> LocalIterForEachHandle where I: LocalIterator + 'static, F: Fn(I::Item) + SyncSend + Clone + 'static, { LocalIteratorLauncher::for_each_with_schedule(&self.array, sched, iter, op) } - fn for_each_async(&self, iter: &I, op: F) -> Pin + Send>> + fn for_each_async(&self, iter: &I, op: F) -> LocalIterForEachHandle where I: LocalIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, @@ -637,7 +621,7 @@ impl LocalIteratorLauncher for LocalLockArray { sched: Schedule, iter: &I, op: F, - ) -> Pin + Send>> + ) -> LocalIterForEachHandle where I: LocalIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, @@ -646,7 +630,7 @@ impl LocalIteratorLauncher for LocalLockArray { LocalIteratorLauncher::for_each_async_with_schedule(&self.array, sched, iter, op) } - fn reduce(&self, iter: &I, op: F) -> Pin> + Send>> + fn reduce(&self, iter: &I, op: F) -> LocalIterReduceHandle where I: LocalIterator + 'static, I::Item: SyncSend, @@ -660,7 +644,7 @@ impl LocalIteratorLauncher for LocalLockArray { sched: Schedule, iter: &I, op: F, - ) -> Pin> + Send>> + ) -> LocalIterReduceHandle where I: LocalIterator + 'static, I::Item: SyncSend, @@ -689,7 +673,7 @@ impl LocalIteratorLauncher for LocalLockArray { // self.array.reduce_async_with_schedule(sched, iter, op) // } - fn collect(&self, iter: &I, d: Distribution) -> Pin + Send>> + fn collect(&self, iter: &I, d: Distribution) -> LocalIterCollectHandle where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, @@ -703,7 +687,7 @@ impl LocalIteratorLauncher for LocalLockArray { sched: Schedule, iter: &I, d: Distribution, - ) -> Pin + Send>> + ) -> LocalIterCollectHandle where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, @@ -741,25 +725,21 @@ impl LocalIteratorLauncher for LocalLockArray { // self.array.collect_async_with_schedule(sched, iter, d) // } - fn count(&self, iter: &I) -> Pin + Send>> + fn count(&self, iter: &I) -> LocalIterCountHandle where I: LocalIterator + 'static, { LocalIteratorLauncher::count(&self.array, iter) } - fn count_with_schedule( - &self, - sched: Schedule, - iter: &I, - ) -> Pin + Send>> + fn count_with_schedule(&self, sched: Schedule, iter: &I) -> LocalIterCountHandle where I: LocalIterator + 'static, { LocalIteratorLauncher::count_with_schedule(&self.array, sched, iter) } - fn sum(&self, iter: &I) -> Pin + Send>> + fn sum(&self, iter: &I) -> 
LocalIterSumHandle where I: LocalIterator + 'static, I::Item: SyncSend + std::iter::Sum, @@ -767,11 +747,7 @@ impl LocalIteratorLauncher for LocalLockArray { LocalIteratorLauncher::sum(&self.array, iter) } - fn sum_with_schedule( - &self, - sched: Schedule, - iter: &I, - ) -> Pin + Send>> + fn sum_with_schedule(&self, sched: Schedule, iter: &I) -> LocalIterSumHandle where I: LocalIterator + 'static, I::Item: SyncSend + std::iter::Sum, diff --git a/src/array/local_lock_atomic/local_chunks.rs b/src/array/local_lock_atomic/local_chunks.rs index 81dd9202..3a521028 100644 --- a/src/array/local_lock_atomic/local_chunks.rs +++ b/src/array/local_lock_atomic/local_chunks.rs @@ -108,8 +108,10 @@ impl LocalIterator for LocalLockLocalChunks { // start_i, end_i, self.index, self.end_index // ); Some(LocalLockLocalData { - array: self.array.sub_array(start_i..end_i), - lock: self.lock.clone(), + array: self.array.clone(), + start_index: start_i, + end_index: end_i, + // lock: self.lock.clone(), lock_guard: self.lock_guard.clone(), }) } else { diff --git a/src/array/local_lock_atomic/rdma.rs b/src/array/local_lock_atomic/rdma.rs index c274578e..9643ddab 100644 --- a/src/array/local_lock_atomic/rdma.rs +++ b/src/array/local_lock_atomic/rdma.rs @@ -1,3 +1,5 @@ +use std::collections::VecDeque; + use crate::array::local_lock_atomic::*; use crate::array::private::ArrayExecAm; use crate::array::LamellarWrite; @@ -12,25 +14,27 @@ impl LamellarArrayInternalGet for LocalLockArray { &self, index: usize, buf: U, - ) -> Box> { + ) -> ArrayRdmaHandle { let req = self.exec_am_local(InitGetAm { array: self.clone(), index: index, buf: buf.into(), }); - Box::new(ArrayRdmaHandle { reqs: vec![req] }) + ArrayRdmaHandle { + reqs: VecDeque::from([req.into()]), + } } - unsafe fn internal_at(&self, index: usize) -> Box> { + unsafe fn internal_at(&self, index: usize) -> ArrayRdmaAtHandle { let buf: OneSidedMemoryRegion = self.array.team_rt().alloc_one_sided_mem_region(1); let req = self.exec_am_local(InitGetAm { array: self.clone(), index: index, buf: buf.clone().into(), }); - Box::new(ArrayRdmaAtHandle { - reqs: vec![req], + ArrayRdmaAtHandle { + req: Some(req), buf: buf, - }) + } } } @@ -39,14 +43,16 @@ impl LamellarArrayGet for LocalLockArray { &self, index: usize, buf: U, - ) -> Pin + Send>> { + ) -> ArrayRdmaHandle { match buf.team_try_into(&self.array.team_rt()) { - Ok(buf) => self.internal_get(index, buf).into_future(), - Err(_) => Box::pin(async move { () }), + Ok(buf) => self.internal_get(index, buf), + Err(_) => ArrayRdmaHandle { + reqs: VecDeque::new(), + }, } } - fn at(&self, index: usize) -> Pin + Send>> { - unsafe { self.internal_at(index).into_future() } + fn at(&self, index: usize) -> ArrayRdmaAtHandle { + unsafe { self.internal_at(index) } } } @@ -55,13 +61,15 @@ impl LamellarArrayInternalPut for LocalLockArray { &self, index: usize, buf: U, - ) -> Box> { + ) -> ArrayRdmaHandle { let req = self.exec_am_local(InitPutAm { array: self.clone(), index: index, buf: buf.into(), }); - Box::new(ArrayRdmaHandle { reqs: vec![req] }) + ArrayRdmaHandle { + reqs: VecDeque::from([req.into()]), + } } } @@ -70,10 +78,12 @@ impl LamellarArrayPut for LocalLockArray { &self, index: usize, buf: U, - ) -> Pin + Send>> { + ) -> ArrayRdmaHandle { match buf.team_try_into(&self.array.team_rt()) { - Ok(buf) => self.internal_put(index, buf).into_future(), - Err(_) => Box::pin(async move { () }), + Ok(buf) => self.internal_put(index, buf), + Err(_) => ArrayRdmaHandle { + reqs: VecDeque::new(), + }, } } } @@ -105,7 +115,7 @@ impl LamellarAm 
for InitGetAm { start_index: self.index, len: self.buf.len(), }; - reqs.push(self.array.exec_am_pe(pe, remote_am).into_future()); + reqs.push(self.array.exec_am_pe(pe, remote_am)); } unsafe { match self.array.array.inner.distribution { @@ -215,7 +225,7 @@ impl LamellarAm for InitPutAm { [cur_index..(cur_index + u8_buf_len)] .to_vec(), }; - reqs.push(self.array.exec_am_pe(pe, remote_am).into_future()); + reqs.push(self.array.exec_am_pe(pe, remote_am)); cur_index += u8_buf_len; } else { panic!("this should not be possible"); @@ -268,7 +278,7 @@ impl LamellarAm for InitPutAm { len: self.buf.len(), data: vec, }; - reqs.push(self.array.exec_am_pe(pe, remote_am).into_future()); + reqs.push(self.array.exec_am_pe(pe, remote_am)); } } } diff --git a/src/array/native_atomic.rs b/src/array/native_atomic.rs index ba268192..5dde394a 100644 --- a/src/array/native_atomic.rs +++ b/src/array/native_atomic.rs @@ -990,7 +990,7 @@ impl NativeAtomicArray { self.array.into() } - pub fn async_barrier(&self) -> impl std::future::Future + Send + '_ { + pub fn async_barrier(&self) -> impl std::future::Future + Send + '_ { self.array.async_barrier() } } @@ -1004,7 +1004,7 @@ impl TeamFrom<(Vec, Distribution)> for NativeAtomicArray< } } -#[async_trait] +// #[async_trait] impl AsyncTeamFrom<(Vec, Distribution)> for NativeAtomicArray { async fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { let array: UnsafeArray = AsyncTeamInto::team_into(input, team).await; @@ -1231,30 +1231,24 @@ impl ArrayPrint for NativeAtomicArray { } } -impl LamellarArrayReduce for NativeAtomicArray { - fn reduce(&self, op: &str) -> Pin + Send>> { - self.array - .reduce_data(op, self.clone().into()) - .into_future() +impl NativeAtomicArray { + pub fn reduce(&self, op: &str) -> AmHandle { + self.array.reduce_data(op, self.clone().into()) } } -impl LamellarArrayArithmeticReduce - for NativeAtomicArray -{ - fn sum(&self) -> Pin + Send>> { +impl NativeAtomicArray { + pub fn sum(&self) -> AmHandle { self.reduce("sum") } - fn prod(&self) -> Pin + Send>> { + pub fn prod(&self) -> AmHandle { self.reduce("prod") } } -impl LamellarArrayCompareReduce - for NativeAtomicArray -{ - fn max(&self) -> Pin + Send>> { +impl NativeAtomicArray { + pub fn max(&self) -> AmHandle { self.reduce("max") } - fn min(&self) -> Pin + Send>> { + pub fn min(&self) -> AmHandle { self.reduce("min") } } diff --git a/src/array/native_atomic/iteration.rs b/src/array/native_atomic/iteration.rs index e1231bde..7bb9d182 100644 --- a/src/array/native_atomic/iteration.rs +++ b/src/array/native_atomic/iteration.rs @@ -1,13 +1,10 @@ -use crate::array::iterator::distributed_iterator::{ - DistIteratorLauncher, DistributedIterator, IndexedDistributedIterator, -}; -use crate::array::iterator::local_iterator::{LocalIterator, LocalIteratorLauncher}; +use crate::array::iterator::distributed_iterator::*; +use crate::array::iterator::local_iterator::*; use crate::array::iterator::one_sided_iterator::OneSidedIter; use crate::array::iterator::{ private::*, LamellarArrayIterators, LamellarArrayMutIterators, Schedule, }; use crate::array::native_atomic::*; -// use crate::array::private::LamellarArrayPrivate; use crate::array::*; use crate::memregion::Dist; // use parking_lot::{ @@ -227,7 +224,7 @@ impl DistIteratorLauncher for NativeAtomicArray { // self.array.subarray_pe_and_offset_for_global_index(index, chunk_size) // } - fn for_each(&self, iter: &I, op: F) -> Pin + Send>> + fn for_each(&self, iter: &I, op: F) -> DistIterForEachHandle where I: DistributedIterator + 'static, F: 
Fn(I::Item) + SyncSend + Clone + 'static, @@ -239,14 +236,14 @@ impl DistIteratorLauncher for NativeAtomicArray { sched: Schedule, iter: &I, op: F, - ) -> Pin + Send>> + ) -> DistIterForEachHandle where I: DistributedIterator + 'static, F: Fn(I::Item) + SyncSend + Clone + 'static, { DistIteratorLauncher::for_each_with_schedule(&self.array, sched, iter, op) } - fn for_each_async(&self, iter: &I, op: F) -> Pin + Send>> + fn for_each_async(&self, iter: &I, op: F) -> DistIterForEachHandle where I: DistributedIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, @@ -259,7 +256,7 @@ impl DistIteratorLauncher for NativeAtomicArray { sched: Schedule, iter: &I, op: F, - ) -> Pin + Send>> + ) -> DistIterForEachHandle where I: DistributedIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, @@ -268,7 +265,7 @@ impl DistIteratorLauncher for NativeAtomicArray { DistIteratorLauncher::for_each_async_with_schedule(&self.array, sched, iter, op) } - fn reduce(&self, iter: &I, op: F) -> Pin> + Send>> + fn reduce(&self, iter: &I, op: F) -> DistIterReduceHandle where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, @@ -282,7 +279,7 @@ impl DistIteratorLauncher for NativeAtomicArray { sched: Schedule, iter: &I, op: F, - ) -> Pin> + Send>> + ) -> DistIterReduceHandle where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, @@ -291,7 +288,7 @@ impl DistIteratorLauncher for NativeAtomicArray { DistIteratorLauncher::reduce_with_schedule(&self.array, sched, iter, op) } - fn collect(&self, iter: &I, d: Distribution) -> Pin + Send>> + fn collect(&self, iter: &I, d: Distribution) -> DistIterCollectHandle where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, @@ -305,7 +302,7 @@ impl DistIteratorLauncher for NativeAtomicArray { sched: Schedule, iter: &I, d: Distribution, - ) -> Pin + Send>> + ) -> DistIterCollectHandle where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, @@ -313,11 +310,7 @@ impl DistIteratorLauncher for NativeAtomicArray { { DistIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) } - fn collect_async( - &self, - iter: &I, - d: Distribution, - ) -> Pin + Send>> + fn collect_async(&self, iter: &I, d: Distribution) -> DistIterCollectHandle where I: DistributedIterator, I::Item: Future + Send + 'static, @@ -332,7 +325,7 @@ impl DistIteratorLauncher for NativeAtomicArray { sched: Schedule, iter: &I, d: Distribution, - ) -> Pin + Send>> + ) -> DistIterCollectHandle where I: DistributedIterator, I::Item: Future + Send + 'static, @@ -342,25 +335,21 @@ impl DistIteratorLauncher for NativeAtomicArray { DistIteratorLauncher::collect_async_with_schedule(&self.array, sched, iter, d) } - fn count(&self, iter: &I) -> Pin + Send>> + fn count(&self, iter: &I) -> DistIterCountHandle where I: DistributedIterator + 'static, { DistIteratorLauncher::count(&self.array, iter) } - fn count_with_schedule( - &self, - sched: Schedule, - iter: &I, - ) -> Pin + Send>> + fn count_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterCountHandle where I: DistributedIterator + 'static, { DistIteratorLauncher::count_with_schedule(&self.array, sched, iter) } - fn sum(&self, iter: &I) -> Pin + Send>> + fn sum(&self, iter: &I) -> DistIterSumHandle where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps + std::iter::Sum, @@ -368,11 +357,7 @@ impl DistIteratorLauncher for NativeAtomicArray { DistIteratorLauncher::sum(&self.array, iter) } - fn sum_with_schedule( - &self, - sched: Schedule, - iter: &I, - ) -> Pin + Send>> + fn 
sum_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterSumHandle where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps + std::iter::Sum, @@ -395,7 +380,7 @@ impl LocalIteratorLauncher for NativeAtomicArray { .local_subarray_index_from_local(index, chunk_size) } - fn for_each(&self, iter: &I, op: F) -> Pin + Send>> + fn for_each(&self, iter: &I, op: F) -> LocalIterForEachHandle where I: LocalIterator + 'static, F: Fn(I::Item) + SyncSend + Clone + 'static, @@ -407,14 +392,14 @@ impl LocalIteratorLauncher for NativeAtomicArray { sched: Schedule, iter: &I, op: F, - ) -> Pin + Send>> + ) -> LocalIterForEachHandle where I: LocalIterator + 'static, F: Fn(I::Item) + SyncSend + Clone + 'static, { LocalIteratorLauncher::for_each_with_schedule(&self.array, sched, iter, op) } - fn for_each_async(&self, iter: &I, op: F) -> Pin + Send>> + fn for_each_async(&self, iter: &I, op: F) -> LocalIterForEachHandle where I: LocalIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, @@ -427,7 +412,7 @@ impl LocalIteratorLauncher for NativeAtomicArray { sched: Schedule, iter: &I, op: F, - ) -> Pin + Send>> + ) -> LocalIterForEachHandle where I: LocalIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, @@ -436,7 +421,7 @@ impl LocalIteratorLauncher for NativeAtomicArray { LocalIteratorLauncher::for_each_async_with_schedule(&self.array, sched, iter, op) } - fn reduce(&self, iter: &I, op: F) -> Pin> + Send>> + fn reduce(&self, iter: &I, op: F) -> LocalIterReduceHandle where I: LocalIterator + 'static, I::Item: SyncSend, @@ -450,7 +435,7 @@ impl LocalIteratorLauncher for NativeAtomicArray { sched: Schedule, iter: &I, op: F, - ) -> Pin> + Send>> + ) -> LocalIterReduceHandle where I: LocalIterator + 'static, I::Item: SyncSend, @@ -479,7 +464,7 @@ impl LocalIteratorLauncher for NativeAtomicArray { // self.array.reduce_async_with_schedule(sched, iter, op) // } - fn collect(&self, iter: &I, d: Distribution) -> Pin + Send>> + fn collect(&self, iter: &I, d: Distribution) -> LocalIterCollectHandle where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, @@ -493,7 +478,7 @@ impl LocalIteratorLauncher for NativeAtomicArray { sched: Schedule, iter: &I, d: Distribution, - ) -> Pin + Send>> + ) -> LocalIterCollectHandle where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, @@ -531,25 +516,21 @@ impl LocalIteratorLauncher for NativeAtomicArray { // self.array.collect_async_with_schedule(sched, iter, d) // } - fn count(&self, iter: &I) -> Pin + Send>> + fn count(&self, iter: &I) -> LocalIterCountHandle where I: LocalIterator + 'static, { LocalIteratorLauncher::count(&self.array, iter) } - fn count_with_schedule( - &self, - sched: Schedule, - iter: &I, - ) -> Pin + Send>> + fn count_with_schedule(&self, sched: Schedule, iter: &I) -> LocalIterCountHandle where I: LocalIterator + 'static, { LocalIteratorLauncher::count_with_schedule(&self.array, sched, iter) } - fn sum(&self, iter: &I) -> Pin + Send>> + fn sum(&self, iter: &I) -> LocalIterSumHandle where I: LocalIterator + 'static, I::Item: SyncSend + std::iter::Sum, @@ -557,11 +538,7 @@ impl LocalIteratorLauncher for NativeAtomicArray { LocalIteratorLauncher::sum(&self.array, iter) } - fn sum_with_schedule( - &self, - sched: Schedule, - iter: &I, - ) -> Pin + Send>> + fn sum_with_schedule(&self, sched: Schedule, iter: &I) -> LocalIterSumHandle where I: LocalIterator + 'static, I::Item: SyncSend + std::iter::Sum, diff --git a/src/array/native_atomic/rdma.rs b/src/array/native_atomic/rdma.rs index 1e40a136..3af2f45d 100644 
--- a/src/array/native_atomic/rdma.rs +++ b/src/array/native_atomic/rdma.rs @@ -1,3 +1,5 @@ +use std::collections::VecDeque; + use crate::array::native_atomic::*; use crate::array::private::ArrayExecAm; use crate::array::LamellarWrite; @@ -9,25 +11,27 @@ impl LamellarArrayInternalGet for NativeAtomicArray { &self, index: usize, buf: U, - ) -> Box> { + ) -> ArrayRdmaHandle { let req = self.exec_am_local(InitGetAm { array: self.clone(), index: index, buf: buf.into(), }); - Box::new(ArrayRdmaHandle { reqs: vec![req] }) + ArrayRdmaHandle { + reqs: VecDeque::from([req.into()]), + } } - unsafe fn internal_at(&self, index: usize) -> Box> { + unsafe fn internal_at(&self, index: usize) -> ArrayRdmaAtHandle { let buf: OneSidedMemoryRegion = self.array.team_rt().alloc_one_sided_mem_region(1); let req = self.exec_am_local(InitGetAm { array: self.clone(), index: index, buf: buf.clone().into(), }); - Box::new(ArrayRdmaAtHandle { - reqs: vec![req], + ArrayRdmaAtHandle { + req: Some(req), buf: buf, - }) + } } } impl LamellarArrayGet for NativeAtomicArray { @@ -35,14 +39,16 @@ impl LamellarArrayGet for NativeAtomicArray { &self, index: usize, buf: U, - ) -> Pin + Send>> { + ) -> ArrayRdmaHandle { match buf.team_try_into(&self.array.team_rt()) { - Ok(buf) => self.internal_get(index, buf).into_future(), - Err(_) => Box::pin(async move { () }), + Ok(buf) => self.internal_get(index, buf), + Err(_) => ArrayRdmaHandle { + reqs: VecDeque::new(), + }, } } - fn at(&self, index: usize) -> Pin + Send>> { - unsafe { self.internal_at(index).into_future() } + fn at(&self, index: usize) -> ArrayRdmaAtHandle { + unsafe { self.internal_at(index) } } } @@ -51,13 +57,15 @@ impl LamellarArrayInternalPut for NativeAtomicArray { &self, index: usize, buf: U, - ) -> Box> { + ) -> ArrayRdmaHandle { let req = self.exec_am_local(InitPutAm { array: self.clone(), index: index, buf: buf.into(), }); - Box::new(ArrayRdmaHandle { reqs: vec![req] }) + ArrayRdmaHandle { + reqs: VecDeque::from([req.into()]), + } } } @@ -66,10 +74,12 @@ impl LamellarArrayPut for NativeAtomicArray { &self, index: usize, buf: U, - ) -> Pin + Send>> { + ) -> ArrayRdmaHandle { match buf.team_try_into(&self.array.team_rt()) { - Ok(buf) => self.internal_put(index, buf).into_future(), - Err(_) => Box::pin(async move { () }), + Ok(buf) => self.internal_put(index, buf), + Err(_) => ArrayRdmaHandle { + reqs: VecDeque::new(), + }, } } } @@ -101,7 +111,7 @@ impl LamellarAm for InitGetAm { start_index: self.index, len: self.buf.len(), }; - reqs.push(self.array.exec_am_pe(pe, remote_am).into_future()); + reqs.push(self.array.exec_am_pe(pe, remote_am)); } unsafe { match self.array.array.inner.distribution { @@ -214,7 +224,7 @@ impl LamellarAm for InitPutAm { [cur_index..(cur_index + u8_buf_len)] .to_vec(), }; - reqs.push(self.array.exec_am_pe(pe, remote_am).into_future()); + reqs.push(self.array.exec_am_pe(pe, remote_am)); cur_index += u8_buf_len; } else { panic!("this should not be possible"); @@ -267,7 +277,7 @@ impl LamellarAm for InitPutAm { len: self.buf.len(), data: vec, }; - reqs.push(self.array.exec_am_pe(pe, remote_am).into_future()); + reqs.push(self.array.exec_am_pe(pe, remote_am)); } } } diff --git a/src/array/operations.rs b/src/array/operations.rs index ebf33677..3577198f 100644 --- a/src/array/operations.rs +++ b/src/array/operations.rs @@ -432,11 +432,13 @@ pub trait OpInput<'a, T: Dist> { impl<'a, T: Dist> OpInput<'a, T> for T { fn as_op_input(self) -> (Vec>, usize) { + // println!("val as op input"); (vec![OpInputEnum::Val(self)], 1) } } impl<'a, T: Dist> 
OpInput<'a, T> for &T { fn as_op_input(self) -> (Vec>, usize) { + // println!("ref as op input"); (vec![OpInputEnum::Val(*self)], 1) } } @@ -444,6 +446,7 @@ impl<'a, T: Dist> OpInput<'a, T> for &T { impl<'a, T: Dist> OpInput<'a, T> for &'a [T] { //#[tracing::instrument(skip_all)] fn as_op_input(self) -> (Vec>, usize) { + // println!("slice as op input"); let len = self.len(); let mut iters = vec![]; let num = if len < 1000 { @@ -473,6 +476,7 @@ impl<'a, T: Dist> OpInput<'a, T> for &'a [T] { impl<'a, T: Dist> OpInput<'a, T> for &'a mut (dyn Iterator + 'a) { fn as_op_input(self) -> (Vec>, usize) { + // println!("iter as op input"); self.collect::>().as_op_input() } } @@ -498,6 +502,7 @@ impl<'a, T: Dist> OpInput<'a, T> for &'a mut (dyn Iterator + 'a) { impl<'a, T: Dist> OpInput<'a, T> for &'a mut [T] { //#[tracing::instrument(skip_all)] fn as_op_input(self) -> (Vec>, usize) { + // println!("slice as mut op input"); let len = self.len(); let mut iters = vec![]; @@ -531,6 +536,7 @@ impl<'a, T: Dist> OpInput<'a, T> for &'a mut [T] { impl<'a, T: Dist> OpInput<'a, T> for &'a Vec { //#[tracing::instrument(skip_all)] fn as_op_input(self) -> (Vec>, usize) { + // println!("vec ref as op input"); (&self[..]).as_op_input() } } @@ -538,6 +544,7 @@ impl<'a, T: Dist> OpInput<'a, T> for &'a Vec { impl<'a, T: Dist> OpInput<'a, T> for &'a mut Vec { //#[tracing::instrument(skip_all)] fn as_op_input(self) -> (Vec>, usize) { + // println!("vec ref mut as op input"); (&self[..]).as_op_input() } } @@ -572,6 +579,7 @@ impl<'a, T: Dist> OpInput<'a, T> for &'a mut Vec { impl<'a, T: Dist> OpInput<'a, T> for Vec { //#[tracing::instrument(skip_all)] fn as_op_input(self) -> (Vec>, usize) { + // println!("vec as op input"); let len = self.len(); let num = if len < 1000 { 1 @@ -690,6 +698,7 @@ impl<'a, T: Dist> OpInput<'a, T> for Vec { impl<'a, T: Dist> OpInput<'a, T> for &'a LocalLockLocalData { // #[tracing::instrument(skip_all)] fn as_op_input(self) -> (Vec>, usize) { + // println!("LocalLockLocalData as_op_input {:?}", self.deref()); let len = self.len(); let mut iters = vec![]; let my_pe = self.array.my_pe(); @@ -714,6 +723,7 @@ impl<'a, T: Dist> OpInput<'a, T> for &'a LocalLockLocalData { let sub_data = self .clone() .into_sub_data(i * num_per_batch, (i + 1) * num_per_batch); + // println!("sub_data: {:?}", sub_data.deref()); iters.push(OpInputEnum::LocalLockLocalData(sub_data)); } let rem = len % num_per_batch; @@ -730,6 +740,7 @@ impl<'a, T: Dist> OpInput<'a, T> for &'a LocalLockLocalData { impl<'a, T: Dist> OpInput<'a, T> for &'a GlobalLockLocalData { // #[tracing::instrument(skip_all)] fn as_op_input(self) -> (Vec>, usize) { + // println!("GlobalLockLocalData as_op_input"); let len = self.len(); let mut iters = vec![]; let my_pe = self.array.my_pe(); @@ -763,6 +774,7 @@ impl<'a, T: Dist> OpInput<'a, T> for &'a GlobalLockLocalData { iters.push(OpInputEnum::GlobalLockLocalData(sub_data)); } } + (iters, len) } } @@ -1045,23 +1057,23 @@ impl<'a, T: Dist + ElementOps> OpInput<'a, T> for NativeAtomicLocalData { // //#[tracing::instrument(skip_all)] // async fn into_future(mut self: Box) -> Self::Output { // for req in self.reqs.drain(..) 
{ -// req.into_future().await; +// req.await; // } // () // } // //#[tracing::instrument(skip_all)] -// fn get(&self) -> Self::Output { +// fn blocking_wait(&self) -> Self::Output { // for req in &self.reqs { -// req.get(); +// req.blocking_wait(); // } // () // } // fn ready(&self) -> bool { // self.reqs.iter().all(|req| req.ready()) // } -// fn set_waker(&mut self, waker: futures::task::Waker) { +// fn set_waker(&mut self, waker: futures_util::task::Waker) { // for req in &mut self.reqs { -// req.set_waker(waker.clone()); +// req.set_waker(waker); // } // } // } @@ -1079,7 +1091,7 @@ impl<'a, T: Dist + ElementOps> OpInput<'a, T> for NativeAtomicLocalData { // () // } // //#[tracing::instrument(skip_all)] -// fn get(&self) -> Self::Output { +// fn blocking_wait(&self) -> Self::Output { // for comp in &self.complete { // while comp.load(Ordering::Relaxed) == false { // // std::thread::yield_now(); @@ -1095,7 +1107,7 @@ impl<'a, T: Dist + ElementOps> OpInput<'a, T> for NativeAtomicLocalData { // .all(|comp| comp.load(Ordering::Relaxed)) // } -// fn set_waker(&mut self, waker: futures::task::Waker) { +// fn set_waker(&mut self, waker: futures_util::task::Waker) { // self.complete.iter() // } @@ -1105,14 +1117,14 @@ impl<'a, T: Dist + ElementOps> OpInput<'a, T> for NativeAtomicLocalData { // //#[tracing::instrument(skip_all)] // async fn into_future(mut self: Box) -> Self::Output { // self.req -// .into_future() +// // .await // .pop() // .expect("should have a single request") // } // //#[tracing::instrument(skip_all)] -// fn get(&self) -> Self::Output { -// self.req.get().pop().expect("should have a single request") +// fn blocking_wait(&self) -> Self::Output { +// self.req.blocking_wait().pop().expect("should have a single request") // } // fn ready(&self) -> bool { @@ -1127,15 +1139,15 @@ impl<'a, T: Dist + ElementOps> OpInput<'a, T> for NativeAtomicLocalData { // async fn into_future(mut self: Box) -> Self::Output { // let mut res = vec![]; // for req in self.reqs.drain(..) { -// res.extend(req.into_future().await); +// res.extend(req.await); // } // res // } // //#[tracing::instrument(skip_all)] -// fn get(&self) -> Self::Output { +// fn blocking_wait(&self) -> Self::Output { // let mut res = vec![]; // for req in &self.reqs { -// res.extend(req.get()); +// res.extend(req.blocking_wait()); // } // // println!("res: {:?}",res); // res @@ -1199,7 +1211,7 @@ impl<'a, T: Dist + ElementOps> OpInput<'a, T> for NativeAtomicLocalData { // self.get_result() // } // //#[tracing::instrument(skip_all)] -// fn get(&self) -> Self::Output { +// fn blocking_wait(&self) -> Self::Output { // for comp in &self.complete { // while comp.load(Ordering::Relaxed) == false { // // std::thread::yield_now(); @@ -1221,14 +1233,14 @@ impl<'a, T: Dist + ElementOps> OpInput<'a, T> for NativeAtomicLocalData { // //#[tracing::instrument(skip_all)] // async fn into_future(mut self: Box) -> Self::Output { // self.req -// .into_future() +// // .await // .pop() // .expect("should have a single request") // } // //#[tracing::instrument(skip_all)] -// fn get(&self) -> Self::Output { -// self.req.get().pop().expect("should have a single request") +// fn blocking_wait(&self) -> Self::Output { +// self.req.blocking_wait().pop().expect("should have a single request") // } // fn ready(&self) -> bool { @@ -1244,15 +1256,15 @@ impl<'a, T: Dist + ElementOps> OpInput<'a, T> for NativeAtomicLocalData { // // println!("num_reqs: {}",self.reqs.len()); // let mut res = vec![]; // for req in self.reqs.drain(..) 
{ -// res.extend(req.into_future().await); +// res.extend(req.await); // } // res // } // //#[tracing::instrument(skip_all)] -// fn get(&self) -> Self::Output { +// fn blocking_wait(&self) -> Self::Output { // let mut res = vec![]; // for req in &self.reqs { -// res.extend(req.get()); +// res.extend(req.blocking_wait()); // } // res // } @@ -1335,7 +1347,7 @@ impl<'a, T: Dist + ElementOps> OpInput<'a, T> for NativeAtomicLocalData { // self.get_result() // } // //#[tracing::instrument(skip_all)] -// fn get(&self) -> Self::Output { +// fn blocking_wait(&self) -> Self::Output { // for comp in &self.complete { // while comp.load(Ordering::Relaxed) == false { // // std::thread::yield_now(); diff --git a/src/array/operations/arithmetic.rs b/src/array/operations/arithmetic.rs index 22a0dfea..2f0e8c39 100644 --- a/src/array/operations/arithmetic.rs +++ b/src/array/operations/arithmetic.rs @@ -234,6 +234,7 @@ pub trait ArithmeticOps: private::LamellarArrayP index: impl OpInput<'a, usize>, val: impl OpInput<'a, T>, ) -> Pin> + Send>> { + // println!("here in batch_fetch_add"); self.inner_array().initiate_batch_fetch_op_2( val, index, diff --git a/src/array/operations/read_only.rs b/src/array/operations/read_only.rs index 4c0a2601..bb4b3e68 100644 --- a/src/array/operations/read_only.rs +++ b/src/array/operations/read_only.rs @@ -27,7 +27,7 @@ use crate::array::*; /// # Examples ///``` /// use lamellar::array::prelude::*; -/// use futures::future::join_all; +/// use futures_util::future::join_all; /// /// let world = LamellarWorldBuilder::new().build(); /// let array = AtomicArray::::new(&world,100,Distribution::Block); diff --git a/src/array/prelude.rs b/src/array/prelude.rs index 4622c0d7..53e281a5 100644 --- a/src/array/prelude.rs +++ b/src/array/prelude.rs @@ -7,9 +7,18 @@ pub use crate::array::r#unsafe::UnsafeArray; pub use crate::array::read_only::ReadOnlyArray; #[doc(hidden)] pub use crate::array::{ - register_reduction, ArrayOps, Distribution, LamellarArray, LamellarArrayArithmeticReduce, - LamellarArrayCompareReduce, LamellarArrayGet, LamellarArrayPut, LamellarArrayReduce, - LamellarReadArray, LamellarWriteArray, SubArray, + register_reduction, + ArrayOps, + Distribution, + LamellarArray, + //LamellarArrayArithmeticReduce, + //LamellarArrayCompareReduce, + LamellarArrayGet, + LamellarArrayPut, + //LamellarArrayReduce, + LamellarReadArray, + LamellarWriteArray, + SubArray, }; pub use crate::array::iterator::distributed_iterator::{ diff --git a/src/array/read_only.rs b/src/array/read_only.rs index 3d1963fa..7cae7245 100644 --- a/src/array/read_only.rs +++ b/src/array/read_only.rs @@ -394,7 +394,7 @@ impl TeamFrom<(Vec, Distribution)> for ReadOnlyArray { } } -#[async_trait] +// #[async_trait] impl AsyncTeamFrom<(Vec, Distribution)> for ReadOnlyArray { async fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { let array: UnsafeArray = AsyncTeamInto::team_into(input, team).await; @@ -488,30 +488,24 @@ impl From for ReadOnlyArray { } } -impl LamellarArrayReduce for ReadOnlyArray { - fn reduce(&self, op: &str) -> Pin + Send>> { - self.array - .reduce_data(op, self.clone().into()) - .into_future() +impl ReadOnlyArray { + pub fn reduce(&self, op: &str) -> AmHandle { + self.array.reduce_data(op, self.clone().into()) } } -impl LamellarArrayArithmeticReduce - for ReadOnlyArray -{ - fn sum(&self) -> Pin + Send>> { +impl ReadOnlyArray { + pub fn sum(&self) -> AmHandle { self.reduce("sum") } - fn prod(&self) -> Pin + Send>> { + pub fn prod(&self) -> AmHandle { self.reduce("prod") } } -impl 
LamellarArrayCompareReduce - for ReadOnlyArray -{ - fn max(&self) -> Pin + Send>> { +impl ReadOnlyArray { + pub fn max(&self) -> AmHandle { self.reduce("max") } - fn min(&self) -> Pin + Send>> { + pub fn min(&self) -> AmHandle { self.reduce("min") } } diff --git a/src/array/read_only/iteration.rs b/src/array/read_only/iteration.rs index af59f35a..8d015740 100644 --- a/src/array/read_only/iteration.rs +++ b/src/array/read_only/iteration.rs @@ -1,11 +1,8 @@ -use crate::array::read_only::*; - -use crate::array::iterator::distributed_iterator::{ - DistIter, DistIteratorLauncher, DistributedIterator, -}; -use crate::array::iterator::local_iterator::{LocalIter, LocalIterator, LocalIteratorLauncher}; +use crate::array::iterator::distributed_iterator::*; +use crate::array::iterator::local_iterator::*; use crate::array::iterator::one_sided_iterator::OneSidedIter; use crate::array::iterator::{LamellarArrayIterators, Schedule}; +use crate::array::read_only::*; use crate::array::*; use crate::memregion::Dist; @@ -48,7 +45,7 @@ impl DistIteratorLauncher for ReadOnlyArray { // self.array.subarray_pe_and_offset_for_global_index(index, chunk_size) // } - fn for_each(&self, iter: &I, op: F) -> Pin + Send>> + fn for_each(&self, iter: &I, op: F) -> DistIterForEachHandle where I: DistributedIterator + 'static, F: Fn(I::Item) + SyncSend + Clone + 'static, @@ -60,14 +57,14 @@ impl DistIteratorLauncher for ReadOnlyArray { sched: Schedule, iter: &I, op: F, - ) -> Pin + Send>> + ) -> DistIterForEachHandle where I: DistributedIterator + 'static, F: Fn(I::Item) + SyncSend + Clone + 'static, { DistIteratorLauncher::for_each_with_schedule(&self.array, sched, iter, op) } - fn for_each_async(&self, iter: &I, op: F) -> Pin + Send>> + fn for_each_async(&self, iter: &I, op: F) -> DistIterForEachHandle where I: DistributedIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, @@ -80,7 +77,7 @@ impl DistIteratorLauncher for ReadOnlyArray { sched: Schedule, iter: &I, op: F, - ) -> Pin + Send>> + ) -> DistIterForEachHandle where I: DistributedIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, @@ -89,7 +86,7 @@ impl DistIteratorLauncher for ReadOnlyArray { DistIteratorLauncher::for_each_async_with_schedule(&self.array, sched, iter, op) } - fn reduce(&self, iter: &I, op: F) -> Pin> + Send>> + fn reduce(&self, iter: &I, op: F) -> DistIterReduceHandle where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, @@ -103,7 +100,7 @@ impl DistIteratorLauncher for ReadOnlyArray { sched: Schedule, iter: &I, op: F, - ) -> Pin> + Send>> + ) -> DistIterReduceHandle where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, @@ -112,7 +109,7 @@ impl DistIteratorLauncher for ReadOnlyArray { DistIteratorLauncher::reduce_with_schedule(&self.array, sched, iter, op) } - fn collect(&self, iter: &I, d: Distribution) -> Pin + Send>> + fn collect(&self, iter: &I, d: Distribution) -> DistIterCollectHandle where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, @@ -126,7 +123,7 @@ impl DistIteratorLauncher for ReadOnlyArray { sched: Schedule, iter: &I, d: Distribution, - ) -> Pin + Send>> + ) -> DistIterCollectHandle where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, @@ -134,11 +131,7 @@ impl DistIteratorLauncher for ReadOnlyArray { { DistIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) } - fn collect_async( - &self, - iter: &I, - d: Distribution, - ) -> Pin + Send>> + fn collect_async(&self, iter: &I, d: Distribution) -> DistIterCollectHandle where I: 
DistributedIterator, I::Item: Future + Send + 'static, @@ -153,7 +146,7 @@ impl DistIteratorLauncher for ReadOnlyArray { sched: Schedule, iter: &I, d: Distribution, - ) -> Pin + Send>> + ) -> DistIterCollectHandle where I: DistributedIterator, I::Item: Future + Send + 'static, @@ -163,25 +156,21 @@ impl DistIteratorLauncher for ReadOnlyArray { DistIteratorLauncher::collect_async_with_schedule(&self.array, sched, iter, d) } - fn count(&self, iter: &I) -> Pin + Send>> + fn count(&self, iter: &I) -> DistIterCountHandle where I: DistributedIterator + 'static, { DistIteratorLauncher::count(&self.array, iter) } - fn count_with_schedule( - &self, - sched: Schedule, - iter: &I, - ) -> Pin + Send>> + fn count_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterCountHandle where I: DistributedIterator + 'static, { DistIteratorLauncher::count_with_schedule(&self.array, sched, iter) } - fn sum(&self, iter: &I) -> Pin + Send>> + fn sum(&self, iter: &I) -> DistIterSumHandle where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps + std::iter::Sum, @@ -189,11 +178,7 @@ impl DistIteratorLauncher for ReadOnlyArray { DistIteratorLauncher::sum(&self.array, iter) } - fn sum_with_schedule( - &self, - sched: Schedule, - iter: &I, - ) -> Pin + Send>> + fn sum_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterSumHandle where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps + std::iter::Sum, @@ -216,7 +201,7 @@ impl LocalIteratorLauncher for ReadOnlyArray { .local_subarray_index_from_local(index, chunk_size) } - fn for_each(&self, iter: &I, op: F) -> Pin + Send>> + fn for_each(&self, iter: &I, op: F) -> LocalIterForEachHandle where I: LocalIterator + 'static, F: Fn(I::Item) + SyncSend + Clone + 'static, @@ -228,14 +213,14 @@ impl LocalIteratorLauncher for ReadOnlyArray { sched: Schedule, iter: &I, op: F, - ) -> Pin + Send>> + ) -> LocalIterForEachHandle where I: LocalIterator + 'static, F: Fn(I::Item) + SyncSend + Clone + 'static, { LocalIteratorLauncher::for_each_with_schedule(&self.array, sched, iter, op) } - fn for_each_async(&self, iter: &I, op: F) -> Pin + Send>> + fn for_each_async(&self, iter: &I, op: F) -> LocalIterForEachHandle where I: LocalIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, @@ -248,7 +233,7 @@ impl LocalIteratorLauncher for ReadOnlyArray { sched: Schedule, iter: &I, op: F, - ) -> Pin + Send>> + ) -> LocalIterForEachHandle where I: LocalIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, @@ -257,7 +242,7 @@ impl LocalIteratorLauncher for ReadOnlyArray { LocalIteratorLauncher::for_each_async_with_schedule(&self.array, sched, iter, op) } - fn reduce(&self, iter: &I, op: F) -> Pin> + Send>> + fn reduce(&self, iter: &I, op: F) -> LocalIterReduceHandle where I: LocalIterator + 'static, I::Item: SyncSend, @@ -271,7 +256,7 @@ impl LocalIteratorLauncher for ReadOnlyArray { sched: Schedule, iter: &I, op: F, - ) -> Pin> + Send>> + ) -> LocalIterReduceHandle where I: LocalIterator + 'static, I::Item: SyncSend, @@ -300,7 +285,7 @@ impl LocalIteratorLauncher for ReadOnlyArray { // self.array.reduce_async_with_schedule(sched, iter, op) // } - fn collect(&self, iter: &I, d: Distribution) -> Pin + Send>> + fn collect(&self, iter: &I, d: Distribution) -> LocalIterCollectHandle where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, @@ -314,7 +299,7 @@ impl LocalIteratorLauncher for ReadOnlyArray { sched: Schedule, iter: &I, d: Distribution, - ) -> Pin + Send>> + ) -> LocalIterCollectHandle where I: LocalIterator + 'static, 
I::Item: Dist + ArrayOps, @@ -352,25 +337,21 @@ impl LocalIteratorLauncher for ReadOnlyArray { // self.array.collect_async_with_schedule(sched, iter, d) // } - fn count(&self, iter: &I) -> Pin + Send>> + fn count(&self, iter: &I) -> LocalIterCountHandle where I: LocalIterator + 'static, { LocalIteratorLauncher::count(&self.array, iter) } - fn count_with_schedule( - &self, - sched: Schedule, - iter: &I, - ) -> Pin + Send>> + fn count_with_schedule(&self, sched: Schedule, iter: &I) -> LocalIterCountHandle where I: LocalIterator + 'static, { LocalIteratorLauncher::count_with_schedule(&self.array, sched, iter) } - fn sum(&self, iter: &I) -> Pin + Send>> + fn sum(&self, iter: &I) -> LocalIterSumHandle where I: LocalIterator + 'static, I::Item: SyncSend + std::iter::Sum, @@ -378,11 +359,7 @@ impl LocalIteratorLauncher for ReadOnlyArray { LocalIteratorLauncher::sum(&self.array, iter) } - fn sum_with_schedule( - &self, - sched: Schedule, - iter: &I, - ) -> Pin + Send>> + fn sum_with_schedule(&self, sched: Schedule, iter: &I) -> LocalIterSumHandle where I: LocalIterator + 'static, I::Item: SyncSend + std::iter::Sum, diff --git a/src/array/read_only/rdma.rs b/src/array/read_only/rdma.rs index 5d5646d9..f9cdbad1 100644 --- a/src/array/read_only/rdma.rs +++ b/src/array/read_only/rdma.rs @@ -131,10 +131,10 @@ impl LamellarArrayInternalGet for ReadOnlyArray { &self, index: usize, buf: U, - ) -> Box> { + ) -> ArrayRdmaHandle { self.array.internal_get(index, buf) } - unsafe fn internal_at(&self, index: usize) -> Box> { + unsafe fn internal_at(&self, index: usize) -> ArrayRdmaAtHandle { self.array.internal_at(index) } } @@ -144,10 +144,10 @@ impl LamellarArrayGet for ReadOnlyArray { &self, index: usize, buf: U, - ) -> Pin + Send>> { + ) -> ArrayRdmaHandle { self.array.get(index, buf) } - fn at(&self, index: usize) -> Pin + Send>> { + fn at(&self, index: usize) -> ArrayRdmaAtHandle { unsafe { self.array.at(index) } } } diff --git a/src/array/unsafe.rs b/src/array/unsafe.rs index 012fc87d..7c86d038 100644 --- a/src/array/unsafe.rs +++ b/src/array/unsafe.rs @@ -13,7 +13,9 @@ use crate::lamellae::AllocationType; use crate::lamellar_team::{IntoLamellarTeam, LamellarTeamRT}; use crate::memregion::{Dist, MemoryRegion}; use crate::LamellarTaskGroup; + use core::marker::PhantomData; +use futures_util::{future, StreamExt}; use std::ops::Bound; use std::pin::Pin; use std::sync::atomic::{AtomicUsize, Ordering}; @@ -204,7 +206,7 @@ impl UnsafeArray { // array.inner.data.print(); } - async fn async_new>( + pub(crate) async fn async_new>( team: U, array_size: usize, distribution: Distribution, @@ -705,7 +707,7 @@ impl TeamFrom<(Vec, Distribution)> for UnsafeArray { } } -#[async_trait] +// #[async_trait] impl AsyncTeamFrom<(Vec, Distribution)> for UnsafeArray { async fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { let (local_vals, distribution) = input; @@ -724,14 +726,16 @@ impl AsyncTeamFrom<(Vec, Distribution)> for UnsafeArray::async_new(team.clone(), size, distribution).await; if local_vals.len() > 0 { @@ -1095,11 +1099,7 @@ impl UnsafeArray { .get(&(std::any::TypeId::of::(), op)) .expect("unexpected reduction type")(byte_array, self.inner.data.team.num_pes()) } - pub(crate) fn reduce_data( - &self, - op: &str, - byte_array: LamellarByteArray, - ) -> Box> { + pub(crate) fn reduce_data(&self, op: &str, byte_array: LamellarByteArray) -> AmHandle { let func = self.get_reduction_op(op, byte_array); if let Ok(my_pe) = self.inner.data.team.team_pe_id() { self.inner.data.team.exec_arc_am_pe::( @@ -1152,8 
+1152,8 @@ impl UnsafeArray { /// let sum = array.block_on(array.reduce("sum")); // equivalent to calling array.sum() /// //assert_eq!(array.len()*num_pes,sum); // may or may not fail ///``` - pub unsafe fn reduce(&self, op: &str) -> Pin + Send>> { - self.reduce_data(op, self.clone().into()).into_future() + pub unsafe fn reduce(&self, op: &str) -> AmHandle { + self.reduce_data(op, self.clone().into()) } #[doc(alias("One-sided", "onesided"))] @@ -1188,7 +1188,7 @@ impl UnsafeArray { /// let sum = array.block_on(unsafe{array.sum()}); //Safe in this instance as we have ensured no updates are currently happening /// // assert_eq!(array.len()*num_pes,sum);//this may or may not fail ///``` - pub unsafe fn sum(&self) -> Pin + Send>> { + pub unsafe fn sum(&self) -> AmHandle { self.reduce("sum") } @@ -1225,7 +1225,7 @@ impl UnsafeArray { /// let prod = array.block_on(array.prod()); /// assert_eq!((1..=array.len()).product::(),prod); ///``` - pub unsafe fn prod(&self) -> Pin + Send>> { + pub unsafe fn prod(&self) -> AmHandle { self.reduce("prod") } @@ -1256,7 +1256,7 @@ impl UnsafeArray { /// let max = array.block_on(max_req); /// assert_eq!((array.len()-1)*2,max); ///``` - pub unsafe fn max(&self) -> Pin + Send>> { + pub unsafe fn max(&self) -> AmHandle { self.reduce("max") } @@ -1287,7 +1287,7 @@ impl UnsafeArray { /// let min = array.block_on(min_req); /// assert_eq!(0,min); ///``` - pub unsafe fn min(&self) -> Pin + Send>> { + pub unsafe fn min(&self) -> AmHandle { self.reduce("min") } } diff --git a/src/array/unsafe/iteration/consumer.rs b/src/array/unsafe/iteration/consumer.rs index df1c080e..e9f1f04d 100644 --- a/src/array/unsafe/iteration/consumer.rs +++ b/src/array/unsafe/iteration/consumer.rs @@ -2,27 +2,22 @@ use crate::active_messaging::SyncSend; use crate::array::iterator::consumer::*; use crate::array::r#unsafe::UnsafeArray; use crate::array::LamellarArray; - use crate::memregion::Dist; -use futures::Future; use parking_lot::Mutex; -use std::pin::Pin; +use std::collections::VecDeque; use std::sync::atomic::AtomicUsize; use std::sync::Arc; impl UnsafeArray { - pub(crate) fn sched_static( - &self, - cons: C, - ) -> Pin + Send>> + pub(crate) fn sched_static(&self, cons: C) -> C::Handle where C: IterConsumer, AmO: SyncSend + 'static, O: SyncSend + 'static, // I: SyncSend + 'static, { - let mut reqs = Vec::new(); + let mut reqs = VecDeque::new(); if let Ok(_my_pe) = self.inner.data.team.team_pe_id() { let num_workers = self.inner.data.team.num_threads(); let num_elems_local = cons.max_elems(self.num_elems_local()); @@ -32,7 +27,7 @@ impl UnsafeArray { while ((worker as f64 * elems_per_thread).round() as usize) < num_elems_local { let start_i = (worker as f64 * elems_per_thread).round() as usize; let end_i = ((worker + 1) as f64 * elems_per_thread).round() as usize; - reqs.push( + reqs.push_back( self.inner.data.task_group.exec_arc_am_local_inner( cons.into_am(IterSchedule::Static(start_i, end_i)), ), @@ -42,20 +37,16 @@ impl UnsafeArray { } } cons.create_handle(self.inner.data.team.clone(), reqs) - .into_future() } - pub(crate) fn sched_dynamic( - &self, - cons: C, - ) -> Pin + Send>> + pub(crate) fn sched_dynamic(&self, cons: C) -> C::Handle where C: IterConsumer, AmO: SyncSend + 'static, O: SyncSend + 'static, // I: SyncSend + 'static, { - let mut reqs = Vec::new(); + let mut reqs = VecDeque::new(); if let Ok(_my_pe) = self.inner.data.team.team_pe_id() { let num_workers = self.inner.data.team.num_threads(); let num_elems_local = cons.max_elems(self.num_elems_local()); @@ -64,26 +55,22 @@ 
impl UnsafeArray { let cur_i = Arc::new(AtomicUsize::new(0)); // println!("ranges {:?}", ranges); for _ in 0..std::cmp::min(num_workers, num_elems_local) { - reqs.push(self.inner.data.task_group.exec_arc_am_local_inner( + reqs.push_back(self.inner.data.task_group.exec_arc_am_local_inner( cons.into_am(IterSchedule::Dynamic(cur_i.clone(), num_elems_local)), )); } } cons.create_handle(self.inner.data.team.clone(), reqs) - .into_future() } - pub(crate) fn sched_work_stealing( - &self, - cons: C, - ) -> Pin + Send>> + pub(crate) fn sched_work_stealing(&self, cons: C) -> C::Handle where C: IterConsumer, AmO: SyncSend + 'static, O: SyncSend + 'static, // I: SyncSend + 'static, { - let mut reqs = Vec::new(); + let mut reqs = VecDeque::new(); if let Ok(_my_pe) = self.inner.data.team.team_pe_id() { let num_workers = self.inner.data.team.num_threads(); let num_elems_local = cons.max_elems(self.num_elems_local()); @@ -100,7 +87,7 @@ impl UnsafeArray { worker += 1; } for sibling in &siblings { - reqs.push( + reqs.push_back( self.inner .data .task_group @@ -112,20 +99,16 @@ impl UnsafeArray { } } cons.create_handle(self.inner.data.team.clone(), reqs) - .into_future() } - pub(crate) fn sched_guided( - &self, - cons: C, - ) -> Pin + Send>> + pub(crate) fn sched_guided(&self, cons: C) -> C::Handle where C: IterConsumer, AmO: SyncSend + 'static, O: SyncSend + 'static, // I: SyncSend + 'static, { - let mut reqs = Vec::new(); + let mut reqs = VecDeque::new(); if let Ok(_my_pe) = self.inner.data.team.team_pe_id() { let num_workers = self.inner.data.team.num_threads(); let num_elems_local_orig = cons.max_elems(self.num_elems_local()); @@ -169,27 +152,22 @@ impl UnsafeArray { let range_i = Arc::new(AtomicUsize::new(0)); // println!("ranges {:?}", ranges); for _ in 0..std::cmp::min(num_workers, num_elems_local_orig) { - reqs.push(self.inner.data.task_group.exec_arc_am_local_inner( + reqs.push_back(self.inner.data.task_group.exec_arc_am_local_inner( cons.into_am(IterSchedule::Chunk(ranges.clone(), range_i.clone())), )); } } cons.create_handle(self.inner.data.team.clone(), reqs) - .into_future() } - pub(crate) fn sched_chunk( - &self, - cons: C, - chunk_size: usize, - ) -> Pin + Send>> + pub(crate) fn sched_chunk(&self, cons: C, chunk_size: usize) -> C::Handle where C: IterConsumer, AmO: SyncSend + 'static, O: SyncSend + 'static, // I: SyncSend + 'static, { - let mut reqs = Vec::new(); + let mut reqs = VecDeque::new(); if let Ok(_my_pe) = self.inner.data.team.team_pe_id() { let num_workers = self.inner.data.team.num_threads(); let num_elems_local = cons.max_elems(self.num_elems_local()); @@ -207,12 +185,11 @@ impl UnsafeArray { let range_i = Arc::new(AtomicUsize::new(0)); // println!("ranges {:?}", ranges); for _ in 0..std::cmp::min(num_workers, num_chunks) { - reqs.push(self.inner.data.task_group.exec_arc_am_local_inner( + reqs.push_back(self.inner.data.task_group.exec_arc_am_local_inner( cons.into_am(IterSchedule::Chunk(ranges.clone(), range_i.clone())), )); } } cons.create_handle(self.inner.data.team.clone(), reqs) - .into_future() } } diff --git a/src/array/unsafe/iteration/distributed.rs b/src/array/unsafe/iteration/distributed.rs index ae845049..1eba52ae 100644 --- a/src/array/unsafe/iteration/distributed.rs +++ b/src/array/unsafe/iteration/distributed.rs @@ -9,7 +9,7 @@ use crate::lamellar_team::LamellarTeamRT; use crate::memregion::Dist; use core::marker::PhantomData; -use futures::Future; +use futures_util::Future; use std::pin::Pin; use std::sync::Arc; @@ -39,7 +39,7 @@ impl DistIteratorLauncher for 
UnsafeArray { // } // } - fn for_each(&self, iter: &I, op: F) -> Pin + Send>> + fn for_each(&self, iter: &I, op: F) -> DistIterForEachHandle where I: DistributedIterator + 'static, F: Fn(I::Item) + SyncSend + Clone + 'static, @@ -52,7 +52,7 @@ impl DistIteratorLauncher for UnsafeArray { sched: Schedule, iter: &I, op: F, - ) -> Pin + Send>> + ) -> DistIterForEachHandle where I: DistributedIterator + 'static, F: Fn(I::Item) + SyncSend + Clone + 'static, @@ -71,7 +71,7 @@ impl DistIteratorLauncher for UnsafeArray { } } - fn for_each_async(&self, iter: &I, op: F) -> Pin + Send>> + fn for_each_async(&self, iter: &I, op: F) -> DistIterForEachHandle where I: DistributedIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, @@ -85,7 +85,7 @@ impl DistIteratorLauncher for UnsafeArray { sched: Schedule, iter: &I, op: F, - ) -> Pin + Send>> + ) -> DistIterForEachHandle where I: DistributedIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, @@ -105,7 +105,7 @@ impl DistIteratorLauncher for UnsafeArray { } } - fn reduce(&self, iter: &I, op: F) -> Pin> + Send>> + fn reduce(&self, iter: &I, op: F) -> DistIterReduceHandle where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, @@ -119,7 +119,7 @@ impl DistIteratorLauncher for UnsafeArray { sched: Schedule, iter: &I, op: F, - ) -> Pin> + Send>> + ) -> DistIterReduceHandle where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, @@ -138,7 +138,7 @@ impl DistIteratorLauncher for UnsafeArray { } } - fn collect(&self, iter: &I, d: Distribution) -> Pin + Send>> + fn collect(&self, iter: &I, d: Distribution) -> DistIterCollectHandle where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, @@ -152,7 +152,7 @@ impl DistIteratorLauncher for UnsafeArray { sched: Schedule, iter: &I, d: Distribution, - ) -> Pin + Send>> + ) -> DistIterCollectHandle where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, @@ -172,11 +172,7 @@ impl DistIteratorLauncher for UnsafeArray { } } - fn collect_async( - &self, - iter: &I, - d: Distribution, - ) -> Pin + Send>> + fn collect_async(&self, iter: &I, d: Distribution) -> DistIterCollectHandle where I: DistributedIterator, I::Item: Future + Send + 'static, @@ -191,7 +187,7 @@ impl DistIteratorLauncher for UnsafeArray { sched: Schedule, iter: &I, d: Distribution, - ) -> Pin + Send>> + ) -> DistIterCollectHandle where I: DistributedIterator, I::Item: Future + Send + 'static, @@ -212,18 +208,14 @@ impl DistIteratorLauncher for UnsafeArray { } } - fn count(&self, iter: &I) -> Pin + Send>> + fn count(&self, iter: &I) -> DistIterCountHandle where I: DistributedIterator + 'static, { self.count_with_schedule(Schedule::Static, iter) } - fn count_with_schedule( - &self, - sched: Schedule, - iter: &I, - ) -> Pin + Send>> + fn count_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterCountHandle where I: DistributedIterator + 'static, { @@ -239,7 +231,7 @@ impl DistIteratorLauncher for UnsafeArray { } } - fn sum(&self, iter: &I) -> Pin + Send>> + fn sum(&self, iter: &I) -> DistIterSumHandle where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps + std::iter::Sum, @@ -247,11 +239,7 @@ impl DistIteratorLauncher for UnsafeArray { self.sum_with_schedule(Schedule::Static, iter) } - fn sum_with_schedule( - &self, - sched: Schedule, - iter: &I, - ) -> Pin + Send>> + fn sum_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterSumHandle where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps + std::iter::Sum, diff --git 
a/src/array/unsafe/iteration/local.rs b/src/array/unsafe/iteration/local.rs index bc56dd3d..9da52276 100644 --- a/src/array/unsafe/iteration/local.rs +++ b/src/array/unsafe/iteration/local.rs @@ -9,7 +9,7 @@ use crate::lamellar_team::LamellarTeamRT; use crate::memregion::Dist; use core::marker::PhantomData; -use futures::Future; +use futures_util::Future; use std::pin::Pin; use std::sync::Arc; @@ -31,7 +31,7 @@ impl LocalIteratorLauncher for UnsafeArray { } } - fn for_each(&self, iter: &I, op: F) -> Pin + Send>> + fn for_each(&self, iter: &I, op: F) -> LocalIterForEachHandle where I: LocalIterator + 'static, F: Fn(I::Item) + SyncSend + Clone + 'static, @@ -44,7 +44,7 @@ impl LocalIteratorLauncher for UnsafeArray { sched: Schedule, iter: &I, op: F, - ) -> Pin + Send>> + ) -> LocalIterForEachHandle where I: LocalIterator + 'static, F: Fn(I::Item) + SyncSend + Clone + 'static, @@ -62,7 +62,7 @@ impl LocalIteratorLauncher for UnsafeArray { } } - fn for_each_async(&self, iter: &I, op: F) -> Pin + Send>> + fn for_each_async(&self, iter: &I, op: F) -> LocalIterForEachHandle where I: LocalIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, @@ -76,7 +76,7 @@ impl LocalIteratorLauncher for UnsafeArray { sched: Schedule, iter: &I, op: F, - ) -> Pin + Send>> + ) -> LocalIterForEachHandle where I: LocalIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, @@ -95,7 +95,7 @@ impl LocalIteratorLauncher for UnsafeArray { } } - fn reduce(&self, iter: &I, op: F) -> Pin> + Send>> + fn reduce(&self, iter: &I, op: F) -> LocalIterReduceHandle where I: LocalIterator + 'static, I::Item: SyncSend, @@ -109,7 +109,7 @@ impl LocalIteratorLauncher for UnsafeArray { sched: Schedule, iter: &I, op: F, - ) -> Pin> + Send>> + ) -> LocalIterReduceHandle where I: LocalIterator + 'static, I::Item: SyncSend, @@ -128,38 +128,7 @@ impl LocalIteratorLauncher for UnsafeArray { } } - // fn reduce_async(&self, iter: &I, op: F) -> Pin> + Send>> - // where - // I: LocalIterator + 'static, - // I::Item: SyncSend, - // F: Fn(I::Item, I::Item) -> Fut + SyncSend + Clone + 'static, - // Fut: Future + SyncSend + Clone + 'static - // { - // self.reduce_async_with_schedule(Schedule::Static, iter, op) - // } - - // fn reduce_async_with_schedule(&self, sched: Schedule, iter: &I, op: F) -> Pin> + Send>> - // where - // I: LocalIterator + 'static, - // I::Item: SyncSend, - // F: Fn(I::Item, I::Item) -> Fut + SyncSend + Clone + 'static, - // Fut: Future + SyncSend + Clone + 'static, - // { - // let reduce = ReduceAsync{ - // iter: iter.clone(), - // op, - // _phantom: PhantomData, - // }; - // match sched { - // Schedule::Static => self.sched_static(reduce ), - // Schedule::Dynamic => self.sched_dynamic(reduce), - // Schedule::Chunk(size) => self.sched_chunk(reduce, size), - // Schedule::Guided => self.sched_guided(reduce), - // Schedule::WorkStealing => self.sched_work_stealing(reduce), - // } - // } - - fn collect(&self, iter: &I, d: Distribution) -> Pin + Send>> + fn collect(&self, iter: &I, d: Distribution) -> LocalIterCollectHandle where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, @@ -173,7 +142,7 @@ impl LocalIteratorLauncher for UnsafeArray { sched: Schedule, iter: &I, d: Distribution, - ) -> Pin + Send>> + ) -> LocalIterCollectHandle where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, @@ -193,49 +162,14 @@ impl LocalIteratorLauncher for UnsafeArray { } } - // fn collect_async(&self, iter: &I, d: Distribution) -> Pin + Send>> - // where - // I: LocalIterator + 'static, - // I::Item: 
Future + Send + 'static, - // B: Dist + ArrayOps, - // A: From> + SyncSend + Clone + 'static, - // { - // self.collect_async_with_schedule(Schedule::Static,iter,d) - // } - - // fn collect_async_with_schedule(&self, sched: Schedule, iter: &I, d: Distribution) -> Pin + Send>> - // where - // I: LocalIterator + 'static, - // I::Item: Future + Send + 'static, - // B: Dist + ArrayOps, - // A: From> + SyncSend + Clone + 'static, - // { - // let collect = CollectAsync{ - // iter: iter.clone(), - // distribution: d, - // _phantom: PhantomData, - // }; - // match sched { - // Schedule::Static => self.sched_static(collect ), - // Schedule::Dynamic => self.sched_dynamic(collect), - // Schedule::Chunk(size) => self.sched_chunk(collect, size), - // Schedule::Guided => self.sched_guided(collect), - // Schedule::WorkStealing => self.sched_work_stealing(collect), - // } - // } - - fn count(&self, iter: &I) -> Pin + Send>> + fn count(&self, iter: &I) -> LocalIterCountHandle where I: LocalIterator + 'static, { self.count_with_schedule(Schedule::Static, iter) } - fn count_with_schedule( - &self, - sched: Schedule, - iter: &I, - ) -> Pin + Send>> + fn count_with_schedule(&self, sched: Schedule, iter: &I) -> LocalIterCountHandle where I: LocalIterator + 'static, { @@ -251,7 +185,7 @@ impl LocalIteratorLauncher for UnsafeArray { } } - fn sum(&self, iter: &I) -> Pin + Send>> + fn sum(&self, iter: &I) -> LocalIterSumHandle where I: LocalIterator + 'static, I::Item: SyncSend + std::iter::Sum, @@ -259,11 +193,7 @@ impl LocalIteratorLauncher for UnsafeArray { self.sum_with_schedule(Schedule::Static, iter) } - fn sum_with_schedule( - &self, - sched: Schedule, - iter: &I, - ) -> Pin + Send>> + fn sum_with_schedule(&self, sched: Schedule, iter: &I) -> LocalIterSumHandle where I: LocalIterator + 'static, I::Item: SyncSend + std::iter::Sum, diff --git a/src/array/unsafe/operations.rs b/src/array/unsafe/operations.rs index 23b066bd..842165dd 100644 --- a/src/array/unsafe/operations.rs +++ b/src/array/unsafe/operations.rs @@ -2,12 +2,12 @@ use crate::active_messaging::LamellarArcAm; use crate::array::operations::*; use crate::array::r#unsafe::UnsafeArray; use crate::array::{AmDist, Dist, LamellarArray, LamellarByteArray, LamellarEnv}; -use futures::Future; +use futures_util::Future; use parking_lot::Mutex; use std::any::TypeId; use std::collections::HashMap; use std::pin::Pin; -use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use std::sync::Arc; type MultiValMultiIdxFn = fn(LamellarByteArray, ArrayOpCmd>, Vec, u8) -> LamellarArcAm; @@ -160,7 +160,7 @@ impl UnsafeArray { .max() .unwrap(); let index_size = IndexSize::from(max_local_size); - + let data_copied = Arc::new(AtomicBool::new(false)); let res: Pin)>> + Send>> = if v_len == 1 && i_len == 1 { //one to one @@ -170,6 +170,7 @@ impl UnsafeArray { indices[0].first(), op, BatchReturnType::None, + data_copied.clone(), ) } else if v_len > 1 && i_len == 1 { //many vals one index @@ -180,6 +181,7 @@ impl UnsafeArray { op, BatchReturnType::None, index_size, + data_copied.clone(), ) } else if v_len == 1 && i_len > 1 { //one val many indices @@ -190,6 +192,7 @@ impl UnsafeArray { op, BatchReturnType::None, index_size, + data_copied.clone(), ) } else if v_len > 1 && i_len > 1 { //many vals many indices @@ -200,6 +203,7 @@ impl UnsafeArray { op, BatchReturnType::None, index_size, + data_copied.clone(), ) } else { //no vals no indices @@ -227,7 +231,8 @@ impl UnsafeArray { .max() .unwrap(); let index_size = 
IndexSize::from(max_local_size); - // println!("i_len {:?} v_len {:?}",i_len,v_len ); + let data_copied = Arc::new(AtomicBool::new(false)); + // println!("i_len {:?} v_len {:?}", i_len, v_len); let res: Pin, Vec)>> + Send>> = if v_len == 1 && i_len == 1 { //one to one @@ -237,6 +242,7 @@ impl UnsafeArray { indices[0].first(), op, BatchReturnType::Vals, + data_copied.clone(), ) } else if v_len > 1 && i_len == 1 { //many vals one index @@ -247,6 +253,7 @@ impl UnsafeArray { op, BatchReturnType::Vals, index_size, + data_copied.clone(), ) } else if v_len == 1 && i_len > 1 { //one val many indices @@ -257,6 +264,7 @@ impl UnsafeArray { op, BatchReturnType::Vals, index_size, + data_copied.clone(), ) } else if v_len > 1 && i_len > 1 { //many vals many indices @@ -267,6 +275,7 @@ impl UnsafeArray { op, BatchReturnType::Vals, index_size, + data_copied.clone(), ) } else { //no vals no indices @@ -279,7 +288,7 @@ impl UnsafeArray { results.set_len(std::cmp::max(i_len, v_len)); } for (mut vals, mut idxs) in res.await.into_iter() { - // println!("vals {:?} idx {:?}",vals.len(),idxs); + // println!("vals {:?} idx {:?}", vals.len(), idxs); for (v, i) in vals.drain(..).zip(idxs.drain(..)) { results[i] = v; } @@ -304,7 +313,7 @@ impl UnsafeArray { .max() .unwrap(); let index_size = IndexSize::from(max_local_size); - + let data_copied = Arc::new(AtomicBool::new(false)); let res: Pin>, Vec)>> + Send>> = if v_len == 1 && i_len == 1 { //one to one @@ -314,6 +323,7 @@ impl UnsafeArray { indices[0].first(), op, BatchReturnType::Result, + data_copied.clone(), ) } else if v_len > 1 && i_len == 1 { //many vals one index @@ -324,6 +334,7 @@ impl UnsafeArray { op, BatchReturnType::Result, index_size, + data_copied.clone(), ) } else if v_len == 1 && i_len > 1 { //one val many indices @@ -334,6 +345,7 @@ impl UnsafeArray { op, BatchReturnType::Result, index_size, + data_copied.clone(), ) } else if v_len > 1 && i_len > 1 { //many vals many indices @@ -344,6 +356,7 @@ impl UnsafeArray { op, BatchReturnType::Result, index_size, + data_copied.clone(), ) } else { //no vals no indices @@ -372,6 +385,7 @@ impl UnsafeArray { op: ArrayOpCmd, ret: BatchReturnType, index_size: IndexSize, + data_copied: Arc, ) -> Pin)>> + Send>> { let num_per_batch = match std::env::var("LAMELLAR_OP_BATCH") { Ok(n) => n.parse::().unwrap(), @@ -393,8 +407,9 @@ impl UnsafeArray { self.inner.data.array_counters.add_send_req(1); self.inner.data.team.inc_counters(1); let index_vec = index.to_vec(); + // println!("index vec: {:?}", index_vec); let the_array: UnsafeArray = self.clone(); - // println!("num_reqs {:?}",num_reqs); + // println!("num_reqs {:?}", num_reqs); self.inner .data .team @@ -416,6 +431,10 @@ impl UnsafeArray { the_array.inner.size ), }; + // println!( + // "pe: {:?} index: {:?} local_index: {:?}", + // pe, *idx, local_index + // ); buffs[pe].extend_from_slice(index_size.as_bytes(&local_index)); res_buffs[pe].push(j); if buffs[pe].len() >= num_per_batch { @@ -433,16 +452,11 @@ impl UnsafeArray { index_size, ) .into_am::(ret); - let req = the_array - .inner - .data - .team - .exec_arc_am_pe::( - pe, - am, - Some(the_array.inner.data.array_counters.clone()), - ) - .into_future(); + let req = the_array.inner.data.team.exec_arc_am_pe::( + pe, + am, + Some(the_array.inner.data.array_counters.clone()), + ); reqs.push(Box::pin(async move { (req.await, new_res_buffer) })); } } @@ -458,20 +472,15 @@ impl UnsafeArray { index_size, ) .into_am::(ret); - let req = the_array - .inner - .data - .team - .exec_arc_am_pe::( - pe, - am, - 
Some(the_array.inner.data.array_counters.clone()), - ) - .into_future(); + let req = the_array.inner.data.team.exec_arc_am_pe::( + pe, + am, + Some(the_array.inner.data.array_counters.clone()), + ); reqs.push(Box::pin(async move { (req.await, res_buff) })); } } - // println!("reqs len {:?}",reqs.len()); + // println!("reqs len {:?}", reqs.len()); futures2.lock().extend(reqs); cnt2.fetch_add(1, Ordering::SeqCst); the_array @@ -489,10 +498,10 @@ impl UnsafeArray { while cnt.load(Ordering::SeqCst) < num_reqs { self.inner.data.team.scheduler.exec_task(); } - // println!("futures len {:?}",futures.lock().len()); + // println!("futures len {:?}", futures.lock().len()); Box::pin(async move { - // println!("futures len {:?}",futures.lock().len()); - futures::future::join_all(futures.lock().drain(..)).await + // println!("futures len {:?}", futures.lock().len()); + futures_util::future::join_all(futures.lock().drain(..)).await }) } @@ -506,6 +515,7 @@ impl UnsafeArray { op: ArrayOpCmd, ret: BatchReturnType, _index_size: IndexSize, + data_copied: Arc, ) -> Pin)>> + Send>> { let num_per_batch = match std::env::var("LAMELLAR_OP_BATCH") { Ok(n) => n.parse::().unwrap(), //+ 1 to account for main thread @@ -548,16 +558,11 @@ impl UnsafeArray { val, ) .into_am::(ret); - let req = the_array - .inner - .data - .team - .exec_arc_am_pe::( - pe, - am, - Some(the_array.inner.data.array_counters.clone()), - ) - .into_future(); + let req = the_array.inner.data.team.exec_arc_am_pe::( + pe, + am, + Some(the_array.inner.data.array_counters.clone()), + ); // println!("start_i: {:?} inner_start_i {:?} val_len: {:?}",start_i,inner_start_i,val_len); let res_buffer = (inner_start_i..inner_start_i + val_len).collect::>(); @@ -585,7 +590,7 @@ impl UnsafeArray { // println!("futures len {:?}",futures.lock().len()); Box::pin(async move { // println!("futures len {:?}",futures.lock().len()); - futures::future::join_all(futures.lock().drain(..)).await + futures_util::future::join_all(futures.lock().drain(..)).await }) } @@ -597,6 +602,7 @@ impl UnsafeArray { op: ArrayOpCmd, ret: BatchReturnType, index_size: IndexSize, + data_copied: Arc, ) -> Pin)>> + Send>> { let num_per_batch = match std::env::var("LAMELLAR_OP_BATCH") { Ok(n) => n.parse::().unwrap(), //+ 1 to account for main thread @@ -704,16 +710,11 @@ impl UnsafeArray { index_size, ) .into_am::(ret); - let req = the_array - .inner - .data - .team - .exec_arc_am_pe::( - pe, - am, - Some(the_array.inner.data.array_counters.clone()), - ) - .into_future(); + let req = the_array.inner.data.team.exec_arc_am_pe::( + pe, + am, + Some(the_array.inner.data.array_counters.clone()), + ); reqs.push(Box::pin(async move { (req.await, new_res_buffer) })); } } @@ -729,16 +730,11 @@ impl UnsafeArray { index_size, ) .into_am::(ret); - let req = the_array - .inner - .data - .team - .exec_arc_am_pe::( - pe, - am, - Some(the_array.inner.data.array_counters.clone()), - ) - .into_future(); + let req = the_array.inner.data.team.exec_arc_am_pe::( + pe, + am, + Some(the_array.inner.data.array_counters.clone()), + ); reqs.push(Box::pin(async move { (req.await, res_buff) })); } } @@ -761,7 +757,7 @@ impl UnsafeArray { // println!("futures len {:?}", futures.lock().len()); Box::pin(async move { // println!("futures len: {:?}", futures.lock().len()); - futures::future::join_all(futures.lock().drain(..)).await + futures_util::future::join_all(futures.lock().drain(..)).await }) } @@ -772,6 +768,7 @@ impl UnsafeArray { index: usize, op: ArrayOpCmd, ret: BatchReturnType, + data_copied: Arc, ) -> Pin)>> + 
Send>> { let (pe, local_index) = match self.pe_and_offset_for_global_index(index) { Some((pe, local_index)) => (pe, local_index), @@ -791,15 +788,14 @@ impl UnsafeArray { let res_buff = vec![0]; let am = MultiValMultiIndex::new_with_vec(byte_array.clone(), op, buff, IndexSize::Usize) .into_am::(ret); - let req = self - .inner - .data - .team - .exec_arc_am_pe::(pe, am, Some(self.inner.data.array_counters.clone())) - .into_future(); + let req = self.inner.data.team.exec_arc_am_pe::( + pe, + am, + Some(self.inner.data.array_counters.clone()), + ); let mut reqs = vec![Box::pin(async move { (req.await, res_buff) })]; - Box::pin(async move { futures::future::join_all(reqs.drain(..)).await }) + Box::pin(async move { futures_util::future::join_all(reqs.drain(..)).await }) } } diff --git a/src/array/unsafe/rdma.rs b/src/array/unsafe/rdma.rs index d5b9833a..e43d9956 100644 --- a/src/array/unsafe/rdma.rs +++ b/src/array/unsafe/rdma.rs @@ -1,7 +1,8 @@ +use std::collections::VecDeque; + use crate::array::private::ArrayExecAm; use crate::array::r#unsafe::*; use crate::array::*; -use crate::lamellar_request::LamellarRequest; use crate::memregion::{ AsBase, Dist, MemoryRegionRDMA, RTMemoryRegionRDMA, RegisteredMemoryRegion, SubRegion, }; @@ -14,7 +15,7 @@ impl UnsafeArray { op: ArrayRdmaCmd, index: usize, //relative to inner buf: U, - ) -> Vec>> { + ) -> VecDeque> { let global_index = index + self.inner.offset; // let buf = buf.team_into(&self.inner.data.team); let buf = buf.into(); @@ -36,7 +37,7 @@ impl UnsafeArray { let mut dist_index = global_index; // let mut subarray_index = index; let mut buf_index = 0; - let mut reqs = vec![]; + let mut reqs = VecDeque::new(); for pe in start_pe..=end_pe { let num_elems_on_pe = (self.inner.orig_elem_per_pe * (pe + 1) as f64).round() as usize - (self.inner.orig_elem_per_pe * pe as f64).round() as usize; @@ -87,7 +88,7 @@ impl UnsafeArray { }, pe: self.inner.data.my_pe, }; - reqs.push(self.exec_am_pe(pe, am)); + reqs.push_back(self.exec_am_pe(pe, am)); } else { let am = UnsafeSmallPutAm { array: self.clone().into(), @@ -101,7 +102,7 @@ impl UnsafeArray { .to_vec() }, }; - reqs.push(self.exec_am_pe(pe, am)); + reqs.push_back(self.exec_am_pe(pe, am)); } } ArrayRdmaCmd::GetAm => { @@ -116,7 +117,7 @@ impl UnsafeArray { }, pe: pe, }; - reqs.push(self.exec_am_local(am)); + reqs.push_back(self.exec_am_local(am).into()); // } // else { // let am = UnsafeSmallBlockGetAm { @@ -144,7 +145,7 @@ impl UnsafeArray { op: ArrayRdmaCmd, index: usize, //global_index buf: U, - ) -> Vec>> { + ) -> VecDeque> { let global_index = index + self.inner.offset; // let buf = buf.team_into(&self.inner.data.team); let buf = buf.into(); @@ -153,7 +154,7 @@ impl UnsafeArray { let num_elems_pe = buf.len() / num_pes + 1; //we add plus one to ensure we allocate enough space let mut overflow = 0; let start_pe = global_index % num_pes; - let mut reqs = vec![]; + let mut reqs = VecDeque::new(); // println!("start_pe {:?} num_elems_pe {:?} buf len {:?}",start_pe,num_elems_pe,buf.len()); match op { ArrayRdmaCmd::Put => { @@ -208,7 +209,7 @@ impl UnsafeArray { data: unsafe { temp_memreg.to_base::().into() }, pe: self.inner.data.my_pe, }; - reqs.push(self.exec_am_pe(pe, am)); + reqs.push_back(self.exec_am_pe(pe, am)); } else { let am = UnsafeSmallPutAm { array: self.clone().into(), @@ -223,7 +224,7 @@ impl UnsafeArray { .to_vec() }, }; - reqs.push(self.exec_am_pe(pe, am)); + reqs.push_back(self.exec_am_pe(pe, am)); } if pe + 1 == num_pes { overflow += 1; @@ -296,7 +297,7 @@ impl UnsafeArray { num_pes: 
num_pes, offset: offset, }; - reqs.push(self.exec_am_local(am)); + reqs.push_back(self.exec_am_local(am).into()); if pe + 1 == num_pes { overflow += 1; } @@ -613,26 +614,25 @@ impl UnsafeArray { /// PE3: buf data [12,12,12,12,12,12,12,12,12,12,12,12] /// PE0: buf data [0,1,2,3,4,5,6,7,8,9,10,11] //we only did the "get" on PE0, also likely to be printed last since the other PEs do not wait for PE0 in this example ///``` - pub unsafe fn get(&self, index: usize, buf: U) -> Pin + Send>> + pub unsafe fn get(&self, index: usize, buf: U) -> ArrayRdmaHandle where U: TeamTryInto>, { match buf.team_try_into(&self.team_rt()) { - Ok(buf) => self.internal_get(index, buf).into_future(), - Err(_) => Box::pin(async move { () }), + Ok(buf) => self.internal_get(index, buf), + Err(_) => ArrayRdmaHandle { + reqs: VecDeque::new(), + }, } } - pub(crate) unsafe fn internal_at( - &self, - index: usize, - ) -> Box> { + pub(crate) unsafe fn internal_at(&self, index: usize) -> ArrayRdmaAtHandle { let buf: OneSidedMemoryRegion = self.team_rt().alloc_one_sided_mem_region(1); self.blocking_get(index, &buf); - Box::new(ArrayRdmaAtHandle { - reqs: vec![], + ArrayRdmaAtHandle { + req: None, buf: buf, - }) + } } #[doc(alias("One-sided", "onesided"))] @@ -679,8 +679,8 @@ impl UnsafeArray { /// PE2: array[9] = 3 /// PE3: array[0] = 0 ///``` - pub unsafe fn at(&self, index: usize) -> Pin + Send>> { - self.internal_at(index).into_future() + pub unsafe fn at(&self, index: usize) -> ArrayRdmaAtHandle { + self.internal_at(index) } } @@ -691,11 +691,11 @@ impl UnsafeArray { // index: usize, // dst: U, // ) -> Pin + Send>> { -// self.internal_get(index, dst).into_future() +// self.internal_get(index, dst) // } // fn at(&self, index: usize) -> Pin + Send>> { -// self.internal_at(index).into_future() +// self.internal_at(index) // } // } @@ -704,7 +704,7 @@ impl LamellarArrayInternalGet for UnsafeArray { &self, index: usize, buf: U, - ) -> Box> { + ) -> ArrayRdmaHandle { let buf = buf.into(); let reqs = if buf.len() * std::mem::size_of::() > crate::active_messaging::BATCH_AM_SIZE { @@ -718,12 +718,14 @@ impl LamellarArrayInternalGet for UnsafeArray { index: index, buf: buf, }); - vec![req] + let mut reqs = VecDeque::new(); + reqs.push_back(req.into()); + reqs }; - Box::new(ArrayRdmaHandle { reqs: reqs }) + ArrayRdmaHandle { reqs: reqs } } - unsafe fn internal_at(&self, index: usize) -> Box> { + unsafe fn internal_at(&self, index: usize) -> ArrayRdmaAtHandle { self.internal_at(index) } } @@ -733,12 +735,12 @@ impl LamellarArrayInternalPut for UnsafeArray { &self, index: usize, buf: U, - ) -> Box> { + ) -> ArrayRdmaHandle { let reqs = match self.inner.distribution { Distribution::Block => self.block_op(ArrayRdmaCmd::PutAm, index, buf.into()), Distribution::Cyclic => self.cyclic_op(ArrayRdmaCmd::PutAm, index, buf.into()), }; - Box::new(ArrayRdmaHandle { reqs: reqs }) + ArrayRdmaHandle { reqs: reqs } } } @@ -747,10 +749,12 @@ impl LamellarArrayPut for UnsafeArray { &self, index: usize, buf: U, - ) -> Pin + Send>> { + ) -> ArrayRdmaHandle { match buf.team_try_into(&self.team_rt()) { - Ok(buf) => self.internal_put(index, buf).into_future(), - Err(_) => Box::pin(async move { () }), + Ok(buf) => self.internal_put(index, buf), + Err(_) => ArrayRdmaHandle { + reqs: VecDeque::new(), + }, } } } @@ -1055,7 +1059,7 @@ impl LamellarAm for InitSmallGetAm { start_index: self.index, len: self.buf.len(), }; - reqs.push(self.array.exec_am_pe(pe, remote_am).into_future()); + reqs.push(self.array.exec_am_pe(pe, remote_am)); } unsafe { match 
self.array.inner.distribution { diff --git a/src/darc.rs b/src/darc.rs index 454c91fb..f476cdf2 100644 --- a/src/darc.rs +++ b/src/darc.rs @@ -46,7 +46,6 @@ ///``` use async_lock::RwLock; use core::marker::PhantomData; -use futures::Future; use serde::{Deserialize, Deserializer}; use std::cmp::PartialEq; use std::fmt; @@ -59,7 +58,7 @@ use std::sync::Arc; // //use tracing::*; -use crate::active_messaging::{AMCounters, RemotePtr}; +use crate::active_messaging::{AMCounters, AmHandle, RemotePtr}; use crate::barrier::Barrier; use crate::lamellae::{AllocationType, Backend, LamellaeComm, LamellaeRDMA}; use crate::lamellar_team::{IntoLamellarTeam, LamellarTeamRT}; @@ -350,7 +349,7 @@ impl DarcInner { unsafe { &(*self.item) } } - fn send_finished(&self) -> Vec + Send>>> { + fn send_finished(&self) -> Vec> { let ref_cnts = unsafe { std::slice::from_raw_parts_mut(self.ref_cnt_addr as *mut AtomicUsize, self.num_pes) }; @@ -374,18 +373,15 @@ impl DarcInner { // my_addr // ); // println!("[{:?}] {:?}", std::thread::current().id(), self); - reqs.push( - team.exec_am_pe_tg( - pe, - FinishedAm { - cnt: cnt, - src_pe: pe, - inner_addr: pe_addr, - }, - Some(self.am_counters()), - ) - .into_future(), - ); + reqs.push(team.exec_am_pe_tg( + pe, + FinishedAm { + cnt: cnt, + src_pe: pe, + inner_addr: pe_addr, + }, + Some(self.am_counters()), + )); } } reqs @@ -832,6 +828,14 @@ impl Darc { Darc::try_new_with_drop(team, item, DarcMode::Darc, None) } + pub(crate) async fn async_try_new>( + team: U, + item: T, + state: DarcMode, + ) -> Result, IdError> { + Darc::async_try_new_with_drop(team, item, state, None).await + } + pub(crate) fn try_new>( team: U, item: T, @@ -840,6 +844,121 @@ impl Darc { Darc::try_new_with_drop(team, item, state, None) } + pub(crate) async fn async_try_new_with_drop>( + team: U, + item: T, + state: DarcMode, + drop: Option, + ) -> Result, IdError> { + let team_rt = team.into().team.clone(); + let my_pe = team_rt.team_pe?; + + let alloc = if team_rt.num_pes == team_rt.num_world_pes { + AllocationType::Global + } else { + AllocationType::Sub(team_rt.get_pes()) + }; + + let size = std::mem::size_of::>() + + team_rt.num_pes * std::mem::size_of::() + + team_rt.num_pes * std::mem::size_of::() + + team_rt.num_pes * std::mem::size_of::() + + team_rt.num_pes * std::mem::size_of::() + + team_rt.num_pes * std::mem::size_of::(); + // println!("creating new darc"); + + team_rt.async_barrier().await; + // println!("creating new darc after barrier"); + let addr = team_rt + .lamellae + .alloc(size, alloc, std::mem::align_of::>()) + .expect("out of memory"); + // let temp_team = team_rt.clone(); + // team_rt.print_cnt(); + let team_ptr = unsafe { + let pinned_team = Pin::into_inner_unchecked(team_rt.clone()); + Arc::into_raw(pinned_team) + }; + // team_rt.print_cnt(); + let am_counters = Arc::new(AMCounters::new()); + let am_counters_ptr = Arc::into_raw(am_counters); + let barrier = Box::new(Barrier::new( + team_rt.world_pe, + team_rt.num_world_pes, + team_rt.lamellae.clone(), + team_rt.arch.clone(), + team_rt.scheduler.clone(), + team_rt.panic.clone(), + )); + let barrier_ptr = Box::into_raw(barrier); + let darc_temp = DarcInner { + id: DARC_ID.fetch_add(1, Ordering::Relaxed), + my_pe: my_pe, + num_pes: team_rt.num_pes, + local_cnt: AtomicUsize::new(1), + total_local_cnt: AtomicUsize::new(1), + weak_local_cnt: AtomicUsize::new(0), + dist_cnt: AtomicUsize::new(0), + total_dist_cnt: AtomicUsize::new(0), + ref_cnt_addr: addr + std::mem::size_of::>(), + total_ref_cnt_addr: addr + + std::mem::size_of::>() + + 
team_rt.num_pes * std::mem::size_of::(), + mode_addr: addr + + std::mem::size_of::>() + + team_rt.num_pes * std::mem::size_of::() + + team_rt.num_pes * std::mem::size_of::(), + mode_ref_cnt_addr: addr + + std::mem::size_of::>() + + team_rt.num_pes * std::mem::size_of::() + + team_rt.num_pes * std::mem::size_of::() + + team_rt.num_pes * std::mem::size_of::(), + mode_barrier_addr: addr + + std::mem::size_of::>() + + team_rt.num_pes * std::mem::size_of::() + + team_rt.num_pes * std::mem::size_of::() + + team_rt.num_pes * std::mem::size_of::() + + team_rt.num_pes * std::mem::size_of::(), + barrier: barrier_ptr, + // mode_barrier_rounds: num_rounds, + am_counters: am_counters_ptr, + team: team_ptr, //&team_rt, //Arc::into_raw(temp_team), + item: Box::into_raw(Box::new(item)), + drop: drop, + valid: AtomicBool::new(true), + }; + unsafe { + std::ptr::copy_nonoverlapping(&darc_temp, addr as *mut DarcInner, 1); + } + // println!("Darc Inner Item Addr: {:?}", darc_temp.item); + + let d = Darc { + inner: addr as *mut DarcInner, + src_pe: my_pe, + }; + for elem in d.ref_cnts_as_mut_slice() { + *elem = 0; + } + for elem in d.mode_as_mut_slice() { + *elem = state; + } + for elem in d.mode_barrier_as_mut_slice() { + *elem = 0; + } + for elem in d.mode_ref_cnt_as_mut_slice() { + *elem = 0; + } + // println!( + // " [{:?}] created new darc , next_id: {:?}", + // std::thread::current().id(), + // DARC_ID.load(Ordering::Relaxed) + // ); + // d.print(); + team_rt.async_barrier().await; + // team_rt.print_cnt(); + Ok(d) + } + pub(crate) fn try_new_with_drop>( team: U, item: T, diff --git a/src/darc/global_rw_darc.rs b/src/darc/global_rw_darc.rs index cbb5cbaa..ba3ec400 100644 --- a/src/darc/global_rw_darc.rs +++ b/src/darc/global_rw_darc.rs @@ -11,6 +11,7 @@ use crate::active_messaging::RemotePtr; use crate::darc::local_rw_darc::LocalRwDarc; use crate::darc::{Darc, DarcInner, DarcMode, WrappedInner, __NetworkDarc}; use crate::lamellae::LamellaeRDMA; +use crate::lamellar_request::LamellarRequest; use crate::lamellar_team::{IntoLamellarTeam, LamellarTeamRT}; use crate::{IdError, LamellarEnv, LamellarTeam}; @@ -278,7 +279,7 @@ impl Clone for GlobalRwDarcReadGuard { impl Drop for GlobalRwDarcReadGuard { fn drop(&mut self) { - // println!("dropping read guard"); + // println!("dropping global rwdarc read guard"); if self.local_cnt.fetch_sub(1, Ordering::SeqCst) == 1 { let inner = self.rwlock.inner(); let team = inner.team(); @@ -574,7 +575,6 @@ impl GlobalRwDarc { }, Some(inner.am_counters()), ) - .into_future() .await; // println!("TID: {:?} async got read lock", std::thread::current().id()); GlobalRwDarcReadGuard { @@ -649,7 +649,6 @@ impl GlobalRwDarc { }, Some(inner.am_counters()), ) - .into_future() .await; GlobalRwDarcWriteGuard { rwlock: self.darc.clone(), @@ -720,7 +719,6 @@ impl GlobalRwDarc { }, Some(inner.am_counters()), ) - .into_future() .await; GlobalRwDarcCollectiveWriteGuard { rwlock: self.darc.clone(), @@ -777,7 +775,7 @@ impl GlobalRwDarc { }, Some(inner.am_counters()), ) - .get(); + .blocking_wait(); GlobalRwDarcReadGuard { rwlock: self.darc.clone(), marker: PhantomData, @@ -830,7 +828,7 @@ impl GlobalRwDarc { }, Some(inner.am_counters()), ) - .get(); + .blocking_wait(); GlobalRwDarcWriteGuard { rwlock: self.darc.clone(), marker: PhantomData, @@ -899,7 +897,7 @@ impl GlobalRwDarc { }, Some(inner.am_counters()), ) - .get(); + .blocking_wait(); GlobalRwDarcCollectiveWriteGuard { rwlock: self.darc.clone(), collective_cnt: collective_cnt, diff --git a/src/lamellae/rofi_lamellae.rs 
b/src/lamellae/rofi_lamellae.rs index 37bbcb2f..de09a278 100644 --- a/src/lamellae/rofi_lamellae.rs +++ b/src/lamellae/rofi_lamellae.rs @@ -11,8 +11,8 @@ use std::sync::atomic::{AtomicU8, Ordering}; use std::sync::Arc; use async_trait::async_trait; -use futures::stream::FuturesUnordered; -use futures::StreamExt; +use futures_util::stream::FuturesUnordered; +use futures_util::StreamExt; pub(crate) struct RofiBuilder { my_pe: usize, diff --git a/src/lamellae/shmem_lamellae.rs b/src/lamellae/shmem_lamellae.rs index b4008bcf..fdfb62ff 100644 --- a/src/lamellae/shmem_lamellae.rs +++ b/src/lamellae/shmem_lamellae.rs @@ -12,8 +12,8 @@ use std::sync::atomic::{AtomicU8, Ordering}; use std::sync::Arc; use async_trait::async_trait; -use futures::stream::FuturesUnordered; -use futures::StreamExt; +use futures_util::stream::FuturesUnordered; +use futures_util::StreamExt; pub(crate) struct ShmemBuilder { my_pe: usize, diff --git a/src/lamellar_request.rs b/src/lamellar_request.rs index 2711c8a1..6f9e1bac 100755 --- a/src/lamellar_request.rs +++ b/src/lamellar_request.rs @@ -1,17 +1,15 @@ -use crate::active_messaging::{AmDist, DarcSerde, LamellarAny, RemotePtr, SyncSend}; +use crate::active_messaging::{ + AmHandleInner, DarcSerde, LamellarAny, MultiAmHandleInner, RemotePtr, +}; use crate::darc::Darc; -use crate::lamellae::{Des, SerializedData}; -use crate::lamellar_arch::LamellarArchRT; +use crate::lamellae::SerializedData; +use crate::lamellar_task_group::{TaskGroupAmHandleInner, TaskGroupMultiAmHandleInner}; use crate::memregion::one_sided::MemRegionHandleInner; -use crate::scheduler::Scheduler; -use async_trait::async_trait; -use futures::task::Waker; -use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; -use std::sync::Arc; -use parking_lot::{Condvar, Mutex}; -use std::cell::Cell; -use std::collections::HashMap; +use std::future::Future; +use std::sync::atomic::Ordering; +use std::sync::Arc; +use std::task::Waker; #[derive(Debug)] pub(crate) enum InternalResult { @@ -20,48 +18,104 @@ pub(crate) enum InternalResult { Unit, } -#[doc(hidden)] -#[async_trait] -pub trait LamellarRequest: Sync + Send { - type Output; - async fn into_future(mut self: Box) -> Self::Output; - fn get(&self) -> Self::Output; - fn ready(&self) -> bool; - fn set_waker(&mut self, waker: Waker); -} +// #[enum_dispatch(Future, LamellarRequest)] +// pub(crate) enum LamellarHandle { +// SinglePeAm(AmHandle), +// } +// impl LamellarHandle { +// pub fn blocking_wait(&self) -> T { +// match self { +// LamellarHandle::SinglePeAm(h) => h.blocking_wait(), +// } +// } + +// pub fn ready(&self) -> bool { +// match self { +// LamellarHandle::SinglePeAm(h) => h.ready(), +// } +// } +// } + +// impl Future for LamellarHandle { +// type Output = T; +// fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { +// match self { +// LamellarHandle::SinglePeAm(h) => h.poll(cx), +// } +// } +// } + +// pub(crate) mod private { +// use crate::active_messaging::handle::AmHandle; +// use crate::lamellar_request::LamellarHandle; +// use enum_dispatch::enum_dispatch; +// use futures_util::Future; +// use std::task::Waker; + +// #[enum_dispatch(LamellarHandle)] +// pub trait LamellarRequestSealed: { + +// } +// } #[doc(hidden)] -#[async_trait] -pub trait LamellarMultiRequest: Sync + Send { - type Output; - async fn into_future(mut self: Box) -> Vec; - fn get(&self) -> Vec; +// #[enum_dispatch] +pub(crate) trait LamellarRequest: Future { + fn blocking_wait(self) -> Self::Output; + // fn ready(&self) -> bool; + // fn set_waker(&mut self, waker: 
&Waker); + fn ready_or_set_waker(&mut self, waker: &Waker) -> bool; + fn val(&self) -> Self::Output; } +// #[doc(hidden)] +// #[async_trait] +// pub trait LamellarMultiRequest: Sync + Send { +// type Output; +// async fn into_future(mut self: Box) -> Vec; +// fn blocking_wait(&self) -> Vec; +// } + pub(crate) trait LamellarRequestAddResult: Sync + Send { fn user_held(&self) -> bool; fn add_result(&self, pe: usize, sub_id: usize, data: InternalResult); fn update_counters(&self); } +pub(crate) enum LamellarRequestResult { + Am(Arc), + MultiAm(Arc), + TgAm(Arc), + TgMultiAm(Arc), +} //todo make this an enum instead... // will need to include the task group requests as well... -pub(crate) struct LamellarRequestResult { - pub(crate) req: Arc, -} +// pub(crate) struct LamellarRequestResult { +// pub(crate) req: Arc, +// } impl std::fmt::Debug for LamellarRequestResult { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "LamellarRequestResult") + match self { + Self::Am(_) => write!(f, "Am"), + Self::MultiAm(_) => write!(f, "MultiAm"), + Self::TgAm(_) => write!(f, "TgAm"), + Self::TgMultiAm(_) => write!(f, "TgMultiAm"), + } } } impl LamellarRequestResult { //#[tracing::instrument(skip_all)] - pub(crate) fn add_result(&self, pe: usize, sub_id: usize, data: InternalResult) -> bool { + pub(crate) fn add_result_inner( + req: &Arc, + pe: usize, + sub_id: usize, + data: InternalResult, + ) -> bool { let mut added = false; - if self.req.user_held() { - self.req.add_result(pe as usize, sub_id, data); + if req.user_held() { + req.add_result(pe as usize, sub_id, data); added = true; } else { // if the user dopped the handle we still need to handle if Darcs are returned @@ -83,401 +137,491 @@ impl LamellarRequestResult { } } } - - self.req.update_counters(); - + req.update_counters(); added } -} - -pub(crate) struct LamellarRequestHandleInner { - pub(crate) ready: AtomicBool, - pub(crate) waker: Mutex>, - pub(crate) data: Cell>, //we only issue a single request, which the runtime will update, but the user also has a handle so we need a way to mutate - pub(crate) team_outstanding_reqs: Arc, - pub(crate) world_outstanding_reqs: Arc, - pub(crate) tg_outstanding_reqs: Option>, - pub(crate) scheduler: Arc, - pub(crate) user_handle: AtomicBool, //we can use this flag to optimize what happens when the request returns -} -impl std::fmt::Debug for LamellarRequestHandleInner { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "LamellarRequestHandleInner {{ ready: {:?}, team_outstanding_reqs: {:?} world_outstanding_reqs {:?} tg_outstanding_reqs {:?} user_handle{:?} }}", self.ready.load(Ordering::Relaxed), self.team_outstanding_reqs.load(Ordering::Relaxed), self.world_outstanding_reqs.load(Ordering::Relaxed), self.tg_outstanding_reqs.as_ref().map(|x| x.load(Ordering::Relaxed)), self.user_handle.load(Ordering::Relaxed)) - } -} -// we use the ready bool to protect access to the data field -unsafe impl Sync for LamellarRequestHandleInner {} - -#[doc(hidden)] -#[derive(Debug)] -pub struct LamellarRequestHandle { - pub(crate) inner: Arc, - pub(crate) _phantom: std::marker::PhantomData, -} - -impl Drop for LamellarRequestHandle { - //#[tracing::instrument(skip_all)] - fn drop(&mut self) { - self.inner.user_handle.store(false, Ordering::SeqCst); - } -} - -impl LamellarRequestAddResult for LamellarRequestHandleInner { - //#[tracing::instrument(skip_all)] - fn user_held(&self) -> bool { - self.user_handle.load(Ordering::SeqCst) - } - 
//#[tracing::instrument(skip_all)] - fn add_result(&self, _pe: usize, _sub_id: usize, data: InternalResult) { - // for a single request this is only called one time by a single runtime thread so use of the cell is safe - self.data.set(Some(data)); - self.ready.store(true, Ordering::SeqCst); - if let Some(waker) = self.waker.lock().take() { - waker.wake(); - } - } - //#[tracing::instrument(skip_all)] - fn update_counters(&self) { - let _team_reqs = self.team_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); - let _world_req = self.world_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); - // println!( - // "[{:?}] update counter team {} world {}", - // std::thread::current().id(), - // _team_reqs - 1, - // _world_req - 1 - // ); - if let Some(tg_outstanding_reqs) = self.tg_outstanding_reqs.clone() { - tg_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); - } - } -} - -impl LamellarRequestHandle { - //#[tracing::instrument(skip_all)] - fn process_result(&self, data: InternalResult) -> T { - match data { - InternalResult::Local(x) => { - if let Ok(result) = x.downcast::() { - *result - } else { - panic!("unexpected local result of type "); - } - } - InternalResult::Remote(x, darcs) => { - if let Ok(result) = x.deserialize_data::() { - // we need to appropraiately set the reference counts if the returned data contains any Darcs - // we "cheat" in that we dont actually care what the Darc wraps (hence the cast to ()) we just care - // that the reference count is updated. - for darc in darcs { - match darc { - RemotePtr::NetworkDarc(darc) => { - let temp: Darc<()> = darc.into(); - temp.des(Ok(0)); - temp.inc_local_cnt(1); //we drop temp decreasing local count, but need to account for the actual real darc (and we unfourtunately cannot enforce the T: DarcSerde bound, or at least I havent figured out how to yet) - } - RemotePtr::NetMemRegionHandle(mr) => { - let temp: Arc = mr.into(); - temp.local_ref.fetch_add(2, Ordering::SeqCst); // Need to increase by two, 1 for temp, 1 for result - } - } - } - - result - } else { - panic!("unexpected remote result of type "); - } - } - InternalResult::Unit => { - if let Ok(result) = (Box::new(()) as Box).downcast::() { - *result - } else { - panic!("unexpected unit result of type "); - } - } - } - } -} - -#[async_trait] -impl LamellarRequest for LamellarRequestHandle { - type Output = T; - //#[tracing::instrument(skip_all)] - async fn into_future(mut self: Box) -> Self::Output { - while !self.inner.ready.load(Ordering::SeqCst) { - async_std::task::yield_now().await; - } - self.process_result(self.inner.data.replace(None).expect("result should exist")) - } - //#[tracing::instrument(skip_all)] - fn get(&self) -> T { - while !self.inner.ready.load(Ordering::SeqCst) { - // std::thread::yield_now(); - self.inner.scheduler.exec_task(); - } - self.process_result(self.inner.data.replace(None).expect("result should exist")) - } - fn ready(&self) -> bool { - self.inner.ready.load(Ordering::SeqCst) - } - - fn set_waker(&mut self, waker: Waker) { - *self.inner.waker.lock() = Some(waker); - } -} - -#[derive(Debug)] -pub(crate) struct LamellarMultiRequestHandleInner { - pub(crate) cnt: AtomicUsize, - pub(crate) arch: Arc, - pub(crate) data: Mutex>, - pub(crate) waker: Mutex>, - pub(crate) team_outstanding_reqs: Arc, - pub(crate) world_outstanding_reqs: Arc, - pub(crate) tg_outstanding_reqs: Option>, - pub(crate) scheduler: Arc, - pub(crate) user_handle: AtomicBool, //we can use this flag to optimize what happens when the request returns -} - -#[doc(hidden)] -#[derive(Debug)] -pub 
struct LamellarMultiRequestHandle { - pub(crate) inner: Arc, - pub(crate) _phantom: std::marker::PhantomData, -} - -impl Drop for LamellarMultiRequestHandle { - //#[tracing::instrument(skip_all)] - fn drop(&mut self) { - self.inner.user_handle.store(false, Ordering::SeqCst); - } -} - -impl LamellarRequestAddResult for LamellarMultiRequestHandleInner { - //#[tracing::instrument(skip_all)] - fn user_held(&self) -> bool { - self.user_handle.load(Ordering::SeqCst) - } - //#[tracing::instrument(skip_all)] - fn add_result(&self, pe: usize, _sub_id: usize, data: InternalResult) { - let pe = self.arch.team_pe(pe).expect("pe does not exist on team"); - self.data.lock().insert(pe, data); - self.cnt.fetch_sub(1, Ordering::SeqCst); - if self.cnt.load(Ordering::SeqCst) == 0 { - if let Some(waker) = self.waker.lock().take() { - waker.wake(); - } - } - } - //#[tracing::instrument(skip_all)] - fn update_counters(&self) { - // println!( - // "update counter {:?} {:?}", - // self.team_outstanding_reqs.load(Ordering::SeqCst), - // self.world_outstanding_reqs.load(Ordering::SeqCst) - // ); - let _team_reqs = self.team_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); - let _world_req = self.world_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); - // println!( - // "[{:?}] multi update counter team {} world {}", - // std::thread::current().id(), - // _team_reqs - 1, - // _world_req - 1 - // ); - if let Some(tg_outstanding_reqs) = self.tg_outstanding_reqs.clone() { - tg_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); - } - } -} - -impl LamellarMultiRequestHandle { - //#[tracing::instrument(skip_all)] - fn process_result(&self, data: InternalResult) -> T { - match data { - InternalResult::Local(x) => { - if let Ok(result) = x.downcast::() { - *result - } else { - panic!("unexpected local result of type "); - } - } - InternalResult::Remote(x, darcs) => { - if let Ok(result) = x.deserialize_data::() { - // we need to appropraiately set the reference counts if the returned data contains any Darcs - // we "cheat" in that we dont actually care what the Darc wraps (hence the cast to ()) we just care - // that the reference count is updated. 
- for darc in darcs { - match darc { - RemotePtr::NetworkDarc(darc) => { - let temp: Darc<()> = darc.into(); - temp.des(Ok(0)); - temp.inc_local_cnt(1); //we drop temp decreasing local count, but need to account for the actual real darc (and we unfourtunately cannot enforce the T: DarcSerde bound, or at least I havent figured out how to yet) - } - RemotePtr::NetMemRegionHandle(mr) => { - let temp: Arc = mr.into(); - temp.local_ref.fetch_add(2, Ordering::SeqCst); // Need to increase by two, 1 for temp, 1 for result - } - } - } - result - } else { - panic!("unexpected remote result of type "); - } - } - InternalResult::Unit => { - if let Ok(result) = (Box::new(()) as Box).downcast::() { - *result - } else { - panic!("unexpected unit result of type "); - } - } - } - } -} - -#[async_trait] -impl LamellarMultiRequest for LamellarMultiRequestHandle { - type Output = T; - //#[tracing::instrument(skip_all)] - async fn into_future(mut self: Box) -> Vec { - while self.inner.cnt.load(Ordering::SeqCst) > 0 { - async_std::task::yield_now().await; - } - let mut res = vec![]; - let mut data = self.inner.data.lock(); - // println!("data len{:?}", data.len()); - for pe in 0..data.len() { - res.push(self.process_result(data.remove(&pe).expect("result should exist"))); - } - res - } - //#[tracing::instrument(skip_all)] - fn get(&self) -> Vec { - while self.inner.cnt.load(Ordering::SeqCst) > 0 { - // std::thread::yield_now(); - self.inner.scheduler.exec_task(); - } - let mut res = vec![]; - let mut data = self.inner.data.lock(); - for pe in 0..data.len() { - res.push(self.process_result(data.remove(&pe).expect("result should exist"))); - } - res - } -} - -pub(crate) struct LamellarLocalRequestHandleInner { - // pub(crate) ready: AtomicBool, - pub(crate) ready: (Mutex, Condvar), - pub(crate) waker: Mutex>, - pub(crate) data: Cell>, //we only issue a single request, which the runtime will update, but the user also has a handle so we need a way to mutate - pub(crate) team_outstanding_reqs: Arc, - pub(crate) world_outstanding_reqs: Arc, - pub(crate) tg_outstanding_reqs: Option>, - pub(crate) scheduler: Arc, - pub(crate) user_handle: AtomicBool, //we can use this flag to optimize what happens when the request returns -} - -impl std::fmt::Debug for LamellarLocalRequestHandleInner { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "LamellarLocalRequestHandleInner {{ ready: {:?}, team_outstanding_reqs {:?}, world_outstanding_reqs {:?}, tg_outstanding_reqs{:?}, user_handle {:?}}}", self.ready.0.lock(), self.team_outstanding_reqs.load(Ordering::SeqCst), self.world_outstanding_reqs.load(Ordering::SeqCst), self.tg_outstanding_reqs.as_ref().map(|x| x.load(Ordering::SeqCst)), self.user_handle.load(Ordering::SeqCst)) - } -} - -// we use the ready bool to protect access to the data field -unsafe impl Sync for LamellarLocalRequestHandleInner {} - -#[doc(hidden)] -#[derive(Debug)] -pub struct LamellarLocalRequestHandle { - pub(crate) inner: Arc, - pub(crate) _phantom: std::marker::PhantomData, -} - -impl Drop for LamellarLocalRequestHandle { - //#[tracing::instrument(skip_all)] - fn drop(&mut self) { - self.inner.user_handle.store(false, Ordering::SeqCst); - } -} - -impl LamellarRequestAddResult for LamellarLocalRequestHandleInner { - //#[tracing::instrument(skip_all)] - fn user_held(&self) -> bool { - self.user_handle.load(Ordering::SeqCst) - } - //#[tracing::instrument(skip_all)] - fn add_result(&self, _pe: usize, _sub_id: usize, data: InternalResult) { - // for a single request this is only 
called one time by a single runtime thread so use of the cell is safe - match data { - InternalResult::Local(x) => self.data.set(Some(x)), - InternalResult::Remote(_, _) => panic!("unexpected local result of type "), - InternalResult::Unit => self.data.set(Some(Box::new(()) as LamellarAny)), - } - - // self.ready.store(true, Ordering::SeqCst); - *self.ready.0.lock() = true; - self.ready.1.notify_one(); - if let Some(waker) = self.waker.lock().take() { - waker.wake(); - } - } - //#[tracing::instrument(skip_all)] - fn update_counters(&self) { - let _team_reqs = self.team_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); - let _world_req = self.world_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); - // println!( - // "[{:?}] local update counter team {} world {}", - // std::thread::current().id(), - // _team_reqs - 1, - // _world_req - 1 - // ); - if let Some(tg_outstanding_reqs) = self.tg_outstanding_reqs.clone() { - tg_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); - } - } -} - -impl LamellarLocalRequestHandle { - //#[tracing::instrument(skip_all)] - fn process_result(&self, data: LamellarAny) -> T { - if let Ok(result) = data.downcast::() { - *result - } else { - panic!("unexpected local result of type "); + pub(crate) fn add_result(&self, pe: usize, sub_id: usize, data: InternalResult) -> bool { + match self { + Self::Am(req) => Self::add_result_inner(req, pe, sub_id, data), + Self::MultiAm(req) => Self::add_result_inner(req, pe, sub_id, data), + Self::TgAm(req) => Self::add_result_inner(req, pe, sub_id, data), + Self::TgMultiAm(req) => Self::add_result_inner(req, pe, sub_id, data), } } } -#[async_trait] -impl LamellarRequest for LamellarLocalRequestHandle { - type Output = T; - //#[tracing::instrument(skip_all)] - async fn into_future(mut self: Box) -> Self::Output { - while !*self.inner.ready.0.lock() { - async_std::task::yield_now().await; - } - self.process_result(self.inner.data.replace(None).expect("result should exist")) - } - //#[tracing::instrument(skip_all)] - fn get(&self) -> T { - // let mut ready_lock = self.inner.ready.0.lock(); - // while !*ready_lock { - while !*self.inner.ready.0.lock() { - // std::thread::yield_now(); - // self.inner.ready.1.wait(&mut ready_lock); - self.inner.scheduler.exec_task(); - } - self.process_result(self.inner.data.replace(None).expect("result should exist")) - } - - fn ready(&self) -> bool { - let ready = *self.inner.ready.0.lock(); - // println!("ready: {}", ready); - ready - } - - fn set_waker(&mut self, waker: Waker) { - *self.inner.waker.lock() = Some(waker); - } -} +// #[derive(Debug)] +// pub struct LamellarHandle { +// pub(crate) inner: Arc>, +// } + +// impl Drop for LamellarHandle { +// fn drop(&mut self) { +// self.inner.user_handle.store(false, Ordering::SeqCst); +// } +// } + +// impl LamellarHandle { +// fn process_result(&self, data: InternalResult) -> T { +// match data { +// InternalResult::Local(x) => { +// if let Ok(result) = x.downcast::() { +// *result +// } else { +// panic!("unexpected local result of type "); +// } +// } +// InternalResult::Remote(x, darcs) => { +// if let Ok(result) = x.deserialize_data::() { +// // we need to appropraiately set the reference counts if the returned data contains any Darcs +// // we "cheat" in that we dont actually care what the Darc wraps (hence the cast to ()) we just care +// // that the reference count is updated. 
+// for darc in darcs { +// match darc { +// RemotePtr::NetworkDarc(darc) => { +// let temp: Darc<()> = darc.into(); +// temp.des(Ok(0)); +// temp.inc_local_cnt(1); //we drop temp decreasing local count, but need to account for the actual real darc (and we unfourtunately cannot enforce the T: DarcSerde bound, or at least I havent figured out how to yet) +// } +// RemotePtr::NetMemRegionHandle(mr) => { +// let temp: Arc = mr.into(); +// temp.local_ref.fetch_add(2, Ordering::SeqCst); // Need to increase by two, 1 for temp, 1 for result +// } +// } +// } + +// result +// } else { +// panic!("unexpected remote result of type "); +// } +// } +// InternalResult::Unit => { +// if let Ok(result) = (Box::new(()) as Box).downcast::() { +// *result +// } else { +// panic!("unexpected unit result of type "); +// } +// } +// } +// } +// } + +// impl private::LamellarRequestSealed for LamellarHandle { +// fn set_waker(&mut self, waker: &Waker) { +// self.inner.set_waker(waker); +// } + +// fn val(&self) -> Self::Output { +// self.inner.val(); +// } +// } + +// impl LamellarRequest for LamellarHandle { +// fn blocking_wait(&self) -> T { +// self.inner.blocking_wait() +// } + +// fn ready(&self) -> bool { +// self.inner.read() +// } +// } + +// impl Future for LamellarHandle { +// type Output = T; +// fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { +// self.inner.poll(cx) +// } +// } + +// pub(crate) struct LamellarRequestHandleInner { +// pub(crate) ready: AtomicBool, +// pub(crate) waker: Mutex>, +// pub(crate) data: Cell>, //we only issue a single request, which the runtime will update, but the user also has a handle so we need a way to mutate +// pub(crate) team_outstanding_reqs: Arc, +// pub(crate) world_outstanding_reqs: Arc, +// pub(crate) tg_outstanding_reqs: Option>, +// pub(crate) scheduler: Arc, +// pub(crate) user_handle: AtomicBool, //we can use this flag to optimize what happens when the request returns +// } +// impl std::fmt::Debug for LamellarRequestHandleInner { +// fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +// write!(f, "LamellarRequestHandleInner {{ ready: {:?}, team_outstanding_reqs: {:?} world_outstanding_reqs {:?} tg_outstanding_reqs {:?} user_handle{:?} }}", self.ready.load(Ordering::Relaxed), self.team_outstanding_reqs.load(Ordering::Relaxed), self.world_outstanding_reqs.load(Ordering::Relaxed), self.tg_outstanding_reqs.as_ref().map(|x| x.load(Ordering::Relaxed)), self.user_handle.load(Ordering::Relaxed)) +// } +// } +// // we use the ready bool to protect access to the data field +// unsafe impl Sync for LamellarRequestHandleInner {} + +// #[doc(hidden)] +// #[derive(Debug)] +// pub struct LamellarRequestHandle { +// pub(crate) inner: Arc, +// pub(crate) _phantom: std::marker::PhantomData, +// } + +// impl Drop for LamellarRequestHandle { +// //#[tracing::instrument(skip_all)] +// fn drop(&mut self) { +// self.inner.user_handle.store(false, Ordering::SeqCst); +// } +// } + +// impl LamellarRequestAddResult for LamellarRequestHandleInner { +// //#[tracing::instrument(skip_all)] +// fn user_held(&self) -> bool { +// self.user_handle.load(Ordering::SeqCst) +// } +// //#[tracing::instrument(skip_all)] +// fn add_result(&self, _pe: usize, _sub_id: usize, data: InternalResult) { +// // for a single request this is only called one time by a single runtime thread so use of the cell is safe +// self.data.set(Some(data)); +// self.ready.store(true, Ordering::SeqCst); +// if let Some(waker) = self.waker.lock().take() { +// waker.wake(); +// } +// } 
+// //#[tracing::instrument(skip_all)] +// fn update_counters(&self) { +// let _team_reqs = self.team_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); +// let _world_req = self.world_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); +// // println!( +// // "[{:?}] update counter team {} world {}", +// // std::thread::current().id(), +// // _team_reqs - 1, +// // _world_req - 1 +// // ); +// if let Some(tg_outstanding_reqs) = self.tg_outstanding_reqs.clone() { +// tg_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); +// } +// } +// } + +// impl LamellarRequestHandle { +// //#[tracing::instrument(skip_all)] +// fn process_result(&self, data: InternalResult) -> T { +// match data { +// InternalResult::Local(x) => { +// if let Ok(result) = x.downcast::() { +// *result +// } else { +// panic!("unexpected local result of type "); +// } +// } +// InternalResult::Remote(x, darcs) => { +// if let Ok(result) = x.deserialize_data::() { +// // we need to appropraiately set the reference counts if the returned data contains any Darcs +// // we "cheat" in that we dont actually care what the Darc wraps (hence the cast to ()) we just care +// // that the reference count is updated. +// for darc in darcs { +// match darc { +// RemotePtr::NetworkDarc(darc) => { +// let temp: Darc<()> = darc.into(); +// temp.des(Ok(0)); +// temp.inc_local_cnt(1); //we drop temp decreasing local count, but need to account for the actual real darc (and we unfourtunately cannot enforce the T: DarcSerde bound, or at least I havent figured out how to yet) +// } +// RemotePtr::NetMemRegionHandle(mr) => { +// let temp: Arc = mr.into(); +// temp.local_ref.fetch_add(2, Ordering::SeqCst); // Need to increase by two, 1 for temp, 1 for result +// } +// } +// } + +// result +// } else { +// panic!("unexpected remote result of type "); +// } +// } +// InternalResult::Unit => { +// if let Ok(result) = (Box::new(()) as Box).downcast::() { +// *result +// } else { +// panic!("unexpected unit result of type "); +// } +// } +// } +// } +// } + +// #[async_trait] +// impl LamellarRequest for LamellarRequestHandle { +// type Output = T; +// //#[tracing::instrument(skip_all)] +// async fn into_future(mut self: Box) -> Self::Output { +// while !self.inner.ready.load(Ordering::SeqCst) { +// async_std::task::yield_now().await; +// } +// self.process_result(self.inner.data.replace(None).expect("result should exist")) +// } +// //#[tracing::instrument(skip_all)] +// fn blocking_wait(&self) -> T { +// while !self.inner.ready.load(Ordering::SeqCst) { +// // std::thread::yield_now(); +// self.inner.scheduler.exec_task(); +// } +// self.process_result(self.inner.data.replace(None).expect("result should exist")) +// } + +// fn ready(&self) -> bool { +// self.inner.ready.load(Ordering::SeqCst) +// } + +// fn set_waker(&mut self, waker: &Waker) { +// *self.inner.waker.lock() = Some(waker); +// } +// } + +// #[derive(Debug)] +// pub(crate) struct LamellarMultiRequestHandleInner { +// pub(crate) cnt: AtomicUsize, +// pub(crate) arch: Arc, +// pub(crate) data: Mutex>, +// pub(crate) waker: Mutex>, +// pub(crate) team_outstanding_reqs: Arc, +// pub(crate) world_outstanding_reqs: Arc, +// pub(crate) tg_outstanding_reqs: Option>, +// pub(crate) scheduler: Arc, +// pub(crate) user_handle: AtomicBool, //we can use this flag to optimize what happens when the request returns +// } + +// #[doc(hidden)] +// #[derive(Debug)] +// pub struct LamellarMultiRequestHandle { +// pub(crate) inner: Arc, +// pub(crate) _phantom: std::marker::PhantomData, +// } + +// impl Drop for 
LamellarMultiRequestHandle { +// //#[tracing::instrument(skip_all)] +// fn drop(&mut self) { +// self.inner.user_handle.store(false, Ordering::SeqCst); +// } +// } + +// impl LamellarRequestAddResult for LamellarMultiRequestHandleInner { +// //#[tracing::instrument(skip_all)] +// fn user_held(&self) -> bool { +// self.user_handle.load(Ordering::SeqCst) +// } +// //#[tracing::instrument(skip_all)] +// fn add_result(&self, pe: usize, _sub_id: usize, data: InternalResult) { +// let pe = self.arch.team_pe(pe).expect("pe does not exist on team"); +// self.data.lock().insert(pe, data); +// self.cnt.fetch_sub(1, Ordering::SeqCst); +// if self.cnt.load(Ordering::SeqCst) == 0 { +// if let Some(waker) = self.waker.lock().take() { +// waker.wake(); +// } +// } +// } +// //#[tracing::instrument(skip_all)] +// fn update_counters(&self) { +// // println!( +// // "update counter {:?} {:?}", +// // self.team_outstanding_reqs.load(Ordering::SeqCst), +// // self.world_outstanding_reqs.load(Ordering::SeqCst) +// // ); +// let _team_reqs = self.team_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); +// let _world_req = self.world_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); +// // println!( +// // "[{:?}] multi update counter team {} world {}", +// // std::thread::current().id(), +// // _team_reqs - 1, +// // _world_req - 1 +// // ); +// if let Some(tg_outstanding_reqs) = self.tg_outstanding_reqs.clone() { +// tg_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); +// } +// } +// } + +// impl LamellarMultiRequestHandle { +// //#[tracing::instrument(skip_all)] +// fn process_result(&self, data: InternalResult) -> T { +// match data { +// InternalResult::Local(x) => { +// if let Ok(result) = x.downcast::() { +// *result +// } else { +// panic!("unexpected local result of type "); +// } +// } +// InternalResult::Remote(x, darcs) => { +// if let Ok(result) = x.deserialize_data::() { +// // we need to appropraiately set the reference counts if the returned data contains any Darcs +// // we "cheat" in that we dont actually care what the Darc wraps (hence the cast to ()) we just care +// // that the reference count is updated. 
+// for darc in darcs { +// match darc { +// RemotePtr::NetworkDarc(darc) => { +// let temp: Darc<()> = darc.into(); +// temp.des(Ok(0)); +// temp.inc_local_cnt(1); //we drop temp decreasing local count, but need to account for the actual real darc (and we unfourtunately cannot enforce the T: DarcSerde bound, or at least I havent figured out how to yet) +// } +// RemotePtr::NetMemRegionHandle(mr) => { +// let temp: Arc = mr.into(); +// temp.local_ref.fetch_add(2, Ordering::SeqCst); // Need to increase by two, 1 for temp, 1 for result +// } +// } +// } +// result +// } else { +// panic!("unexpected remote result of type "); +// } +// } +// InternalResult::Unit => { +// if let Ok(result) = (Box::new(()) as Box).downcast::() { +// *result +// } else { +// panic!("unexpected unit result of type "); +// } +// } +// } +// } +// } + +// #[async_trait] +// impl LamellarMultiRequest for LamellarMultiRequestHandle { +// type Output = T; +// //#[tracing::instrument(skip_all)] +// async fn into_future(mut self: Box) -> Vec { +// while self.inner.cnt.load(Ordering::SeqCst) > 0 { +// async_std::task::yield_now().await; +// } +// let mut res = vec![]; +// let mut data = self.inner.data.lock(); +// // println!("data len{:?}", data.len()); +// for pe in 0..data.len() { +// res.push(self.process_result(data.remove(&pe).expect("result should exist"))); +// } +// res +// } +// //#[tracing::instrument(skip_all)] +// fn blocking_wait(&self) -> Vec { +// while self.inner.cnt.load(Ordering::SeqCst) > 0 { +// // std::thread::yield_now(); +// self.inner.scheduler.exec_task(); +// } +// let mut res = vec![]; +// let mut data = self.inner.data.lock(); +// for pe in 0..data.len() { +// res.push(self.process_result(data.remove(&pe).expect("result should exist"))); +// } +// res +// } +// } + +// pub(crate) struct LamellarLocalRequestHandleInner { +// // pub(crate) ready: AtomicBool, +// pub(crate) ready: (Mutex, Condvar), +// pub(crate) waker: Mutex>, +// pub(crate) data: Cell>, //we only issue a single request, which the runtime will update, but the user also has a handle so we need a way to mutate +// pub(crate) team_outstanding_reqs: Arc, +// pub(crate) world_outstanding_reqs: Arc, +// pub(crate) tg_outstanding_reqs: Option>, +// pub(crate) scheduler: Arc, +// pub(crate) user_handle: AtomicBool, //we can use this flag to optimize what happens when the request returns +// } + +// impl std::fmt::Debug for LamellarLocalRequestHandleInner { +// fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +// write!(f, "LamellarLocalRequestHandleInner {{ ready: {:?}, team_outstanding_reqs {:?}, world_outstanding_reqs {:?}, tg_outstanding_reqs{:?}, user_handle {:?}}}", self.ready.0.lock(), self.team_outstanding_reqs.load(Ordering::SeqCst), self.world_outstanding_reqs.load(Ordering::SeqCst), self.tg_outstanding_reqs.as_ref().map(|x| x.load(Ordering::SeqCst)), self.user_handle.load(Ordering::SeqCst)) +// } +// } + +// // we use the ready bool to protect access to the data field +// unsafe impl Sync for LamellarLocalRequestHandleInner {} + +// #[doc(hidden)] +// #[derive(Debug)] +// pub struct LamellarLocalRequestHandle { +// pub(crate) inner: Arc, +// pub(crate) _phantom: std::marker::PhantomData, +// } + +// impl Drop for LamellarLocalRequestHandle { +// //#[tracing::instrument(skip_all)] +// fn drop(&mut self) { +// self.inner.user_handle.store(false, Ordering::SeqCst); +// } +// } + +// impl LamellarRequestAddResult for LamellarLocalRequestHandleInner { +// //#[tracing::instrument(skip_all)] +// fn 
user_held(&self) -> bool { +// self.user_handle.load(Ordering::SeqCst) +// } +// //#[tracing::instrument(skip_all)] +// fn add_result(&self, _pe: usize, _sub_id: usize, data: InternalResult) { +// // for a single request this is only called one time by a single runtime thread so use of the cell is safe +// match data { +// InternalResult::Local(x) => self.data.set(Some(x)), +// InternalResult::Remote(_, _) => panic!("unexpected local result of type "), +// InternalResult::Unit => self.data.set(Some(Box::new(()) as LamellarAny)), +// } + +// // self.ready.store(true, Ordering::SeqCst); +// *self.ready.0.lock() = true; +// self.ready.1.notify_one(); +// if let Some(waker) = self.waker.lock().take() { +// waker.wake(); +// } +// } +// //#[tracing::instrument(skip_all)] +// fn update_counters(&self) { +// let _team_reqs = self.team_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); +// let _world_req = self.world_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); +// // println!( +// // "[{:?}] local update counter team {} world {}", +// // std::thread::current().id(), +// // _team_reqs - 1, +// // _world_req - 1 +// // ); +// if let Some(tg_outstanding_reqs) = self.tg_outstanding_reqs.clone() { +// tg_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); +// } +// } +// } + +// impl LamellarLocalRequestHandle { +// //#[tracing::instrument(skip_all)] +// fn process_result(&self, data: LamellarAny) -> T { +// if let Ok(result) = data.downcast::() { +// *result +// } else { +// panic!("unexpected local result of type "); +// } +// } +// } + +// #[async_trait] +// impl LamellarRequest for LamellarLocalRequestHandle { +// type Output = T; +// //#[tracing::instrument(skip_all)] +// async fn into_future(mut self: Box) -> Self::Output { +// while !*self.inner.ready.0.lock() { +// async_std::task::yield_now().await; +// } +// self.process_result(self.inner.data.replace(None).expect("result should exist")) +// } +// //#[tracing::instrument(skip_all)] +// fn blocking_wait(&self) -> T { +// // let mut ready_lock = self.inner.ready.0.lock(); +// // while !*ready_lock { +// while !*self.inner.ready.0.lock() { +// // std::thread::yield_now(); +// // self.inner.ready.1.wait(&mut ready_lock); +// self.inner.scheduler.exec_task(); +// } +// self.process_result(self.inner.data.replace(None).expect("result should exist")) +// } + +// fn ready(&self) -> bool { +// let ready = *self.inner.ready.0.lock(); +// // println!("ready: {}", ready); +// ready +// } + +// fn set_waker(&mut self, waker: &Waker) { +// *self.inner.waker.lock() = Some(waker); +// } +// } diff --git a/src/lamellar_task_group.rs b/src/lamellar_task_group.rs index 5190aa6a..b718cabb 100644 --- a/src/lamellar_task_group.rs +++ b/src/lamellar_task_group.rs @@ -1,30 +1,31 @@ +use crate::active_messaging::registered_active_message::{AmId, AMS_EXECS, AMS_IDS, AM_ID_START}; use crate::active_messaging::*; use crate::lamellae::Des; use crate::lamellar_arch::LamellarArchRT; +use crate::lamellar_request::LamellarRequest; use crate::lamellar_request::*; use crate::lamellar_team::{IntoLamellarTeam, LamellarTeam, LamellarTeamRT}; use crate::memregion::one_sided::MemRegionHandleInner; use crate::scheduler::{ReqId, Scheduler}; use crate::Darc; -use crate::active_messaging::registered_active_message::{AmId, AMS_EXECS, AMS_IDS, AM_ID_START}; - -use async_trait::async_trait; - // use crossbeam::utils::CachePadded; -use futures::Future; -use futures::StreamExt; +// use futures_util::StreamExt; + +use futures_util::{Future, StreamExt}; use parking_lot::Mutex; +use 
pin_project::{pin_project, pinned_drop}; use std::collections::{BTreeMap, HashMap}; use std::marker::PhantomData; use std::pin::Pin; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; -use std::task::Waker; +use std::task::{Context, Poll, Waker}; use std::time::Instant; #[derive(Debug)] -pub(crate) struct TaskGroupRequestHandleInner { + +pub(crate) struct TaskGroupAmHandleInner { cnt: Arc, data: Mutex>, // wakers: Mutex>, @@ -36,19 +37,21 @@ pub(crate) struct TaskGroupRequestHandleInner { #[doc(hidden)] #[derive(Debug)] -pub struct TaskGroupRequestHandle { - inner: Arc, +#[pin_project(PinnedDrop)] +pub struct TaskGroupAmHandle { + inner: Arc, sub_id: usize, _phantom: std::marker::PhantomData, } -impl Drop for TaskGroupRequestHandle { - fn drop(&mut self) { +#[pinned_drop] +impl PinnedDrop for TaskGroupAmHandle { + fn drop(self: Pin<&mut Self>) { self.inner.cnt.fetch_sub(1, Ordering::SeqCst); } } -impl LamellarRequestAddResult for TaskGroupRequestHandleInner { +impl LamellarRequestAddResult for TaskGroupAmHandleInner { fn user_held(&self) -> bool { self.cnt.load(Ordering::SeqCst) > 0 } @@ -68,7 +71,7 @@ impl LamellarRequestAddResult for TaskGroupRequestHandleInner { } } -impl TaskGroupRequestHandle { +impl TaskGroupAmHandle { fn process_result(&self, data: InternalResult) -> T { match data { InternalResult::Local(x) => { @@ -112,39 +115,71 @@ impl TaskGroupRequestHandle { } } -#[async_trait] -impl LamellarRequest for TaskGroupRequestHandle { - type Output = T; - async fn into_future(mut self: Box) -> Self::Output { +impl LamellarRequest for TaskGroupAmHandle { + fn blocking_wait(self) -> Self::Output { let mut res = self.inner.data.lock().remove(&self.sub_id); while res.is_none() { - async_std::task::yield_now().await; + self.inner.scheduler.exec_task(); res = self.inner.data.lock().remove(&self.sub_id); } self.process_result(res.expect("result should exist")) } - fn get(&self) -> Self::Output { - let mut res = self.inner.data.lock().remove(&self.sub_id); - while res.is_none() { - // std::thread::yield_now(); - self.inner.scheduler.exec_task(); - res = self.inner.data.lock().remove(&self.sub_id); + fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { + let data = self.inner.data.lock(); + if data.contains_key(&self.sub_id) { + true + } else { + self.inner.wakers.lock().insert(self.sub_id, waker.clone()); + self.inner + .wakers + .lock() + .entry(self.sub_id) + .and_modify(|w| { + if !w.will_wake(waker) { + println!("WARNING: overwriting waker {:?}", w); + w.wake_by_ref(); + } + w.clone_from(waker); + }) + .or_insert(waker.clone()); + false } - self.process_result(res.expect("result should exist")) } - fn ready(&self) -> bool { - self.inner.data.lock().contains_key(&self.sub_id) + fn val(&self) -> Self::Output { + let res = self + .inner + .data + .lock() + .remove(&self.sub_id) + .expect("result should exist"); + self.process_result(res) } +} - fn set_waker(&mut self, waker: std::task::Waker) { - self.inner.wakers.lock().insert(self.sub_id, waker); +impl Future for TaskGroupAmHandle { + type Output = T; + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let mut this = self.as_mut(); + if this.ready_or_set_waker(cx.waker()) { + Poll::Ready( + this.process_result( + this.inner + .data + .lock() + .remove(&this.sub_id) + .expect("result should exist"), + ), + ) + } else { + Poll::Pending + } } } #[derive(Debug)] -pub(crate) struct TaskGroupMultiRequestHandleInner { +pub(crate) struct TaskGroupMultiAmHandleInner { cnt: Arc, arch: Arc, data: Mutex>>, //> 
@@ -157,19 +192,21 @@ pub(crate) struct TaskGroupMultiRequestHandleInner { #[doc(hidden)] #[derive(Debug)] -pub struct TaskGroupMultiRequestHandle { - inner: Arc, +#[pin_project(PinnedDrop)] +pub struct TaskGroupMultiAmHandle { + inner: Arc, sub_id: usize, _phantom: std::marker::PhantomData, } -impl Drop for TaskGroupMultiRequestHandle { - fn drop(&mut self) { +#[pinned_drop] +impl PinnedDrop for TaskGroupMultiAmHandle { + fn drop(self: Pin<&mut Self>) { self.inner.cnt.fetch_sub(1, Ordering::SeqCst); } } -impl LamellarRequestAddResult for TaskGroupMultiRequestHandleInner { +impl LamellarRequestAddResult for TaskGroupMultiAmHandleInner { fn user_held(&self) -> bool { self.cnt.load(Ordering::SeqCst) > 0 } @@ -194,7 +231,7 @@ impl LamellarRequestAddResult for TaskGroupMultiRequestHandleInner { } } -impl TaskGroupMultiRequestHandle { +impl TaskGroupMultiAmHandle { fn process_result(&self, data: InternalResult) -> T { match data { InternalResult::Local(x) => { @@ -238,12 +275,10 @@ impl TaskGroupMultiRequestHandle { } } -#[async_trait] -impl LamellarMultiRequest for TaskGroupMultiRequestHandle { - type Output = T; - async fn into_future(mut self: Box) -> Vec { +impl LamellarRequest for TaskGroupMultiAmHandle { + fn blocking_wait(self) -> Self::Output { while !self.inner.data.lock().contains_key(&self.sub_id) { - async_std::task::yield_now().await; + self.inner.scheduler.exec_task(); } while self .inner @@ -254,7 +289,7 @@ impl LamellarMultiRequest for TaskGroupMultiRequestHandle { .len() < self.inner.arch.num_pes() { - async_std::task::yield_now().await; + self.inner.scheduler.exec_task(); } let mut sub_id_map = self .inner @@ -269,23 +304,29 @@ impl LamellarMultiRequest for TaskGroupMultiRequestHandle { res } - fn get(&self) -> Vec { - while !self.inner.data.lock().contains_key(&self.sub_id) { - self.inner.scheduler.exec_task(); - // std::thread::yield_now(); - } - while self - .inner - .data - .lock() - .get(&self.sub_id) - .expect("req sub id should exist") - .len() - < self.inner.arch.num_pes() - { - self.inner.scheduler.exec_task(); - // std::thread::yield_now(); + fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { + let data = self.inner.data.lock(); + if let Some(req) = data.get(&self.sub_id) { + req.len() == self.inner.arch.num_pes() + } else { + self.inner.wakers.lock().insert(self.sub_id, waker.clone()); + self.inner + .wakers + .lock() + .entry(self.sub_id) + .and_modify(|w| { + if !w.will_wake(waker) { + println!("WARNING: overwriting waker {:?}", w); + w.wake_by_ref(); + } + w.clone_from(waker); + }) + .or_insert(waker.clone()); + false } + } + + fn val(&self) -> Self::Output { let mut sub_id_map = self .inner .data @@ -300,93 +341,118 @@ impl LamellarMultiRequest for TaskGroupMultiRequestHandle { } } -#[derive(Debug)] -pub(crate) struct TaskGroupLocalRequestHandleInner { - cnt: Arc, - data: Mutex>, // - wakers: Mutex>, - team_outstanding_reqs: Arc, - world_outstanding_reqs: Arc, - tg_outstanding_reqs: Option>, - pub(crate) scheduler: Arc, +impl Future for TaskGroupMultiAmHandle { + type Output = Vec; + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let mut this = self.as_mut(); + if this.ready_or_set_waker(cx.waker()) { + let mut sub_id_map = this + .inner + .data + .lock() + .remove(&this.sub_id) + .expect("req sub id should exist"); + let mut res = Vec::new(); + for pe in 0..sub_id_map.len() { + res.push(this.process_result(sub_id_map.remove(&pe).unwrap())); + } + Poll::Ready(res) + } else { + Poll::Pending + } + } } #[doc(hidden)] #[derive(Debug)] 
-pub struct TaskGroupLocalRequestHandle { - inner: Arc, +#[pin_project(PinnedDrop)] +pub struct TaskGroupLocalAmHandle { + inner: Arc, sub_id: usize, _phantom: std::marker::PhantomData, } -impl Drop for TaskGroupLocalRequestHandle { - fn drop(&mut self) { +#[pinned_drop] +impl PinnedDrop for TaskGroupLocalAmHandle { + fn drop(self: Pin<&mut Self>) { self.inner.cnt.fetch_sub(1, Ordering::SeqCst); } } -impl LamellarRequestAddResult for TaskGroupLocalRequestHandleInner { - fn user_held(&self) -> bool { - self.cnt.load(Ordering::SeqCst) > 0 - } - fn add_result(&self, _pe: usize, sub_id: usize, data: InternalResult) { +impl TaskGroupLocalAmHandle { + fn process_result(&self, data: InternalResult) -> T { match data { - InternalResult::Local(x) => self.data.lock().insert(sub_id, x), - InternalResult::Remote(_, _) => panic!("unexpected result type"), - InternalResult::Unit => self.data.lock().insert(sub_id, Box::new(()) as LamellarAny), - }; - if let Some(waker) = self.wakers.lock().remove(&sub_id) { - waker.wake(); - } - } - fn update_counters(&self) { - let _team_reqs = self.team_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); - let _world_req = self.world_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); - // println!("tg update counter team {} world {}",_team_reqs-1,_world_req-1); - if let Some(tg_outstanding_reqs) = self.tg_outstanding_reqs.clone() { - tg_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); - } - } -} - -impl TaskGroupLocalRequestHandle { - fn process_result(&self, data: LamellarAny) -> T { - if let Ok(result) = data.downcast::() { - *result - } else { - panic!("unexpected result type"); + InternalResult::Local(x) => { + if let Ok(result) = x.downcast::() { + *result + } else { + panic!("unexpected result type"); + } + } + InternalResult::Remote(_result, _darcs) => { + panic!("unexpected remote result of type within local am handle"); + } + InternalResult::Unit => { + if let Ok(result) = (Box::new(()) as Box).downcast::() { + *result + } else { + panic!("unexpected unit result of type "); + } + } } } } -#[async_trait] -impl LamellarRequest for TaskGroupLocalRequestHandle { - type Output = T; - async fn into_future(mut self: Box) -> Self::Output { +impl LamellarRequest for TaskGroupLocalAmHandle { + fn blocking_wait(self) -> Self::Output { let mut res = self.inner.data.lock().remove(&self.sub_id); while res.is_none() { - async_std::task::yield_now().await; + self.inner.scheduler.exec_task(); res = self.inner.data.lock().remove(&self.sub_id); } - self.process_result(res.unwrap()) + self.process_result(res.expect("result should exist")) } - fn get(&self) -> Self::Output { - let mut res = self.inner.data.lock().remove(&self.sub_id); - while res.is_none() { - self.inner.scheduler.exec_task(); - // std::thread::yield_now(); - res = self.inner.data.lock().remove(&self.sub_id); + fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { + let data = self.inner.data.lock(); + if data.contains_key(&self.sub_id) { + true + } else { + //this can probably be optimized similar to set_waker of MultiAmHandle + // where we check if the waker already exists and if it wakes to same task + self.inner.wakers.lock().insert(self.sub_id, waker.clone()); + false } - self.process_result(res.unwrap()) } - fn ready(&self) -> bool { - self.inner.data.lock().contains_key(&self.sub_id) + fn val(&self) -> Self::Output { + let res = self + .inner + .data + .lock() + .remove(&self.sub_id) + .expect("result should exist"); + self.process_result(res) } +} - fn set_waker(&mut self, waker: futures::task::Waker) { - 
self.inner.wakers.lock().insert(self.sub_id, waker); +impl Future for TaskGroupLocalAmHandle { + type Output = T; + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let mut this = self.as_mut(); + if this.ready_or_set_waker(cx.waker()) { + Poll::Ready( + this.process_result( + this.inner + .data + .lock() + .remove(&this.sub_id) + .expect("result should exist"), + ), + ) + } else { + Poll::Pending + } } } @@ -448,9 +514,9 @@ pub struct LamellarTaskGroup { cnt: Arc, // handle reference count, so that we don't need to worry about storing results if all handles are dropped pub(crate) counters: AMCounters, //these are cloned and returned to user for each request - req: Arc, - multi_req: Arc, - local_req: Arc, + req: Arc, + multi_req: Arc, + local_req: Arc, //these are cloned and passed to RT for each request (they wrap the above requests) rt_req: Arc, //for exec_pe requests rt_multi_req: Arc, //for exec_all requests @@ -458,6 +524,10 @@ pub struct LamellarTaskGroup { } impl ActiveMessaging for LamellarTaskGroup { + type SinglePeAmHandle = TaskGroupAmHandle; + type MultiAmHandle = TaskGroupMultiAmHandle; + type LocalAmHandle = TaskGroupLocalAmHandle; + //#[tracing::instrument(skip_all)] fn wait_all(&self) { self.wait_all(); @@ -473,28 +543,28 @@ impl ActiveMessaging for LamellarTaskGroup { } //#[tracing::instrument(skip_all)] - fn exec_am_all(&self, am: F) -> Pin> + Send>> + fn exec_am_all(&self, am: F) -> Self::MultiAmHandle where F: RemoteActiveMessage + LamellarAM + Serde + AmDist, { // trace!("[{:?}] team exec am all request", self.team.world_pe); - self.exec_am_all_inner(am).into_future() + self.exec_am_all_inner(am) } //#[tracing::instrument(skip_all)] - fn exec_am_pe(&self, pe: usize, am: F) -> Pin + Send>> + fn exec_am_pe(&self, pe: usize, am: F) -> Self::SinglePeAmHandle where F: RemoteActiveMessage + LamellarAM + Serde + AmDist, { - self.exec_am_pe_inner(pe, am).into_future() + self.exec_am_pe_inner(pe, am) } //#[tracing::instrument(skip_all)] - fn exec_am_local(&self, am: F) -> Pin + Send>> + fn exec_am_local(&self, am: F) -> Self::LocalAmHandle where F: LamellarActiveMessage + LocalAM + 'static, { - self.exec_am_local_inner(am).into_future() + self.exec_am_local_inner(am) } fn block_on(&self, f: F) -> F::Output @@ -522,7 +592,7 @@ impl LamellarTaskGroup { let team = team.into().team.clone(); let counters = AMCounters::new(); let cnt = Arc::new(AtomicUsize::new(1)); //this lamellarTaskGroup instance represents 1 handle (even though we maintain a single and multi req handle) - let req = Arc::new(TaskGroupRequestHandleInner { + let req = Arc::new(TaskGroupAmHandleInner { cnt: cnt.clone(), data: Mutex::new(HashMap::new()), wakers: Mutex::new(HashMap::new()), @@ -531,8 +601,8 @@ impl LamellarTaskGroup { tg_outstanding_reqs: Some(counters.outstanding_reqs.clone()), scheduler: team.scheduler.clone(), }); - let rt_req = Arc::new(LamellarRequestResult { req: req.clone() }); - let multi_req = Arc::new(TaskGroupMultiRequestHandleInner { + let rt_req = Arc::new(LamellarRequestResult::TgAm(req.clone())); + let multi_req = Arc::new(TaskGroupMultiAmHandleInner { cnt: cnt.clone(), arch: team.arch.clone(), data: Mutex::new(HashMap::new()), @@ -542,10 +612,8 @@ impl LamellarTaskGroup { tg_outstanding_reqs: Some(counters.outstanding_reqs.clone()), scheduler: team.scheduler.clone(), }); - let rt_multi_req = Arc::new(LamellarRequestResult { - req: multi_req.clone(), - }); - let local_req = Arc::new(TaskGroupLocalRequestHandleInner { + let rt_multi_req = 
Arc::new(LamellarRequestResult::TgMultiAm(multi_req.clone())); + let local_req = Arc::new(TaskGroupAmHandleInner { cnt: cnt.clone(), data: Mutex::new(HashMap::new()), wakers: Mutex::new(HashMap::new()), @@ -554,9 +622,7 @@ impl LamellarTaskGroup { tg_outstanding_reqs: Some(counters.outstanding_reqs.clone()), scheduler: team.scheduler.clone(), }); - let rt_local_req = Arc::new(LamellarRequestResult { - req: local_req.clone(), - }); + let rt_local_req = Arc::new(LamellarRequestResult::TgAm(local_req.clone())); LamellarTaskGroup { team: team.clone(), id: Arc::as_ptr(&rt_req) as usize, @@ -615,10 +681,7 @@ impl LamellarTaskGroup { } } - pub(crate) fn exec_am_all_inner( - &self, - am: F, - ) -> Box> + pub(crate) fn exec_am_all_inner(&self, am: F) -> TaskGroupMultiAmHandle where F: RemoteActiveMessage + LamellarAM + Serde + AmDist, { @@ -655,18 +718,14 @@ impl LamellarTaskGroup { }; // println!("[{:?}] task group am all", std::thread::current().id()); self.team.scheduler.submit_am(Am::All(req_data, func)); - Box::new(TaskGroupMultiRequestHandle { + TaskGroupMultiAmHandle { inner: self.multi_req.clone(), sub_id: req_id.sub_id, _phantom: PhantomData, - }) + } } - pub(crate) fn exec_am_pe_inner( - &self, - pe: usize, - am: F, - ) -> Box> + pub(crate) fn exec_am_pe_inner(&self, pe: usize, am: F) -> TaskGroupAmHandle where F: RemoteActiveMessage + LamellarAM + Serde + AmDist, { @@ -699,17 +758,14 @@ impl LamellarTaskGroup { }; // println!("[{:?}] task group am pe", std::thread::current().id()); self.team.scheduler.submit_am(Am::Remote(req_data, func)); - Box::new(TaskGroupRequestHandle { + TaskGroupAmHandle { inner: self.req.clone(), sub_id: req_id.sub_id, _phantom: PhantomData, - }) + } } - pub(crate) fn exec_am_local_inner( - &self, - am: F, - ) -> Box> + pub(crate) fn exec_am_local_inner(&self, am: F) -> TaskGroupLocalAmHandle where F: LamellarActiveMessage + LocalAM + 'static, { @@ -719,7 +775,7 @@ impl LamellarTaskGroup { pub(crate) fn exec_arc_am_local_inner( &self, func: LamellarArcLocalAm, - ) -> Box> { + ) -> TaskGroupLocalAmHandle { // println!("task group exec am local"); self.team.team_counters.add_send_req(1); self.team.world_counters.add_send_req(1); @@ -748,11 +804,16 @@ impl LamellarTaskGroup { }; // println!("[{:?}] task group am local", std::thread::current().id()); self.team.scheduler.submit_am(Am::Local(req_data, func)); - Box::new(TaskGroupLocalRequestHandle { + // Box::new(TaskGroupLocalAmHandle { + // inner: self.local_req.clone(), + // sub_id: req_id.sub_id, + // _phantom: PhantomData, + // }) + TaskGroupLocalAmHandle { inner: self.local_req.clone(), sub_id: req_id.sub_id, _phantom: PhantomData, - }) + } } } @@ -866,7 +927,7 @@ impl LamellarActiveMessage for AmGroupAm { __lamellar_team.clone(), ) }) - .collect::>() + .collect::>() .collect::>() .await; // for am in self.ams[self.si..self.ei].iter() { @@ -1225,15 +1286,14 @@ impl AmGroup { if *pe == self.team.num_pes { reqs_all.push( self.team - .exec_arc_am_all::>>(Arc::new(tg_am), None) - .into_future(), + .exec_arc_am_all::>>(Arc::new(tg_am), None), ); } else { - reqs.push( - self.team - .exec_arc_am_pe::>>(*pe, Arc::new(tg_am), None) - .into_future(), - ); + reqs.push(self.team.exec_arc_am_pe::>>( + *pe, + Arc::new(tg_am), + None, + )); } send = false; start_i = i; @@ -1250,15 +1310,14 @@ impl AmGroup { if *pe == self.team.num_pes { reqs_all.push( self.team - .exec_arc_am_all::>>(Arc::new(tg_am), None) - .into_future(), + .exec_arc_am_all::>>(Arc::new(tg_am), None), ); } else { - reqs.push( - self.team - 
.exec_arc_am_pe::>>(*pe, Arc::new(tg_am), None) - .into_future(), - ); + reqs.push(self.team.exec_arc_am_pe::>>( + *pe, + Arc::new(tg_am), + None, + )); } } } else { @@ -1271,14 +1330,12 @@ impl AmGroup { if *pe == self.team.num_pes { reqs_all.push( self.team - .exec_arc_am_all::>>(Arc::new(tg_am), None) - .into_future(), + .exec_arc_am_all::>>(Arc::new(tg_am), None), ); } else { reqs.push( self.team - .exec_arc_am_pe::>>(*pe, Arc::new(tg_am), None) - .into_future(), + .exec_arc_am_pe::>>(*pe, Arc::new(tg_am), None), ); } } @@ -1291,8 +1348,8 @@ impl AmGroup { // reqs.len(), // reqs_all.len() // ); - futures::future::join_all(reqs).await; - futures::future::join_all(reqs_all).await; + futures_util::future::join_all(reqs).await; + futures_util::future::join_all(reqs_all).await; // if let Some(req) = all_req{ // req.await; // } @@ -1474,16 +1531,16 @@ impl<'a, T> Iterator for TypedAmGroupResultIter<'a, T> { /// This enum is used to specify the type of AmGroup request pub enum BaseAmGroupReq { /// This request will execute on a single PE and return the unit value - SinglePeUnit(std::pin::Pin + Send>>), + SinglePeUnit(AmHandle), /// This request will return a single value of type T from a single PE - SinglePeVal(std::pin::Pin> + Send>>), + SinglePeVal(AmHandle>), /// This request will execute on all PEs and return a vec of unit values - AllPeUnit(std::pin::Pin> + Send>>), + AllPeUnit(MultiAmHandle), /// This request will execute on all PEs and return a vec of values of type T for each PE - AllPeVal(std::pin::Pin>> + Send>>), + AllPeVal(MultiAmHandle>), } -impl BaseAmGroupReq { +impl BaseAmGroupReq { async fn into_result(self) -> BaseAmGroupResult { match self { BaseAmGroupReq::SinglePeUnit(reqs) => BaseAmGroupResult::SinglePeUnit(reqs.await), @@ -1523,7 +1580,7 @@ pub struct TypedAmGroupBatchResult { reqs: BaseAmGroupResult, } -impl TypedAmGroupBatchReq { +impl TypedAmGroupBatchReq { /// Create a new TypedAmGroupBatchReq for PE with the assoicated IDs and individual Requests pub fn new(pe: usize, ids: Vec, reqs: BaseAmGroupReq) -> Self { Self { pe, ids, reqs } diff --git a/src/lamellar_team.rs b/src/lamellar_team.rs index 401e6f29..145ba091 100644 --- a/src/lamellar_team.rs +++ b/src/lamellar_team.rs @@ -1,3 +1,4 @@ +use crate::active_messaging::handle::AmHandleInner; use crate::active_messaging::*; use crate::barrier::Barrier; use crate::lamellae::{AllocationType, Lamellae, LamellaeComm, LamellaeRDMA}; @@ -18,8 +19,8 @@ use std::collections::hash_map::DefaultHasher; use std::hash::{Hash, Hasher}; // use std::any; use core::pin::Pin; -use futures::Future; -use parking_lot::{Condvar, Mutex, RwLock}; +use futures_util::Future; +use parking_lot::{Mutex, RwLock}; use std::collections::HashMap; use std::marker::PhantomPinned; use std::sync::atomic::{AtomicBool, AtomicU8, AtomicUsize, Ordering}; @@ -386,25 +387,21 @@ impl LamellarTeam { } #[doc(hidden)] - pub fn exec_am_group_pe( - &self, - pe: usize, - am: F, - ) -> Pin + Send>> + pub fn exec_am_group_pe(&self, pe: usize, am: F) -> AmHandle where F: RemoteActiveMessage + LamellarAM + crate::Serialize + 'static, O: AmDist + 'static, { - self.team.am_group_exec_am_pe_tg(pe, am, None).into_future() + self.team.am_group_exec_am_pe_tg(pe, am, None) } #[doc(hidden)] - pub fn exec_am_group_all(&self, am: F) -> Pin> + Send>> + pub fn exec_am_group_all(&self, am: F) -> MultiAmHandle where F: RemoteActiveMessage + LamellarAM + crate::Serialize + 'static, O: AmDist + 'static, { - self.team.am_group_exec_am_all_tg(am, None).into_future() + 
self.team.am_group_exec_am_all_tg(am, None) } } @@ -446,35 +443,38 @@ impl std::fmt::Debug for LamellarTeam { } impl ActiveMessaging for Arc { + type SinglePeAmHandle = AmHandle; + type MultiAmHandle = MultiAmHandle; + type LocalAmHandle = LocalAmHandle; //#[tracing::instrument(skip_all)] - fn exec_am_all(&self, am: F) -> Pin> + Send>> + fn exec_am_all(&self, am: F) -> Self::MultiAmHandle where F: RemoteActiveMessage + LamellarAM + Serde + AmDist, { assert!(self.panic.load(Ordering::SeqCst) == 0); // trace!("[{:?}] team exec am all request", self.team.world_pe); - self.team.exec_am_all_tg(am, None).into_future() + self.team.exec_am_all_tg(am, None) } //#[tracing::instrument(skip_all)] - fn exec_am_pe(&self, pe: usize, am: F) -> Pin + Send>> + fn exec_am_pe(&self, pe: usize, am: F) -> AmHandle where F: RemoteActiveMessage + LamellarAM + Serde + AmDist, { assert!(self.panic.load(Ordering::SeqCst) == 0); - self.team.exec_am_pe_tg(pe, am, None).into_future() + self.team.exec_am_pe_tg(pe, am, None) } //#[tracing::instrument(skip_all)] - fn exec_am_local(&self, am: F) -> Pin + Send>> + fn exec_am_local(&self, am: F) -> LocalAmHandle where F: LamellarActiveMessage + LocalAM + 'static, { assert!(self.panic.load(Ordering::SeqCst) == 0); - self.team.exec_am_local_tg(am, None).into_future() + self.team.exec_am_local_tg(am, None) } //#[tracing::instrument(skip_all)] @@ -1361,10 +1361,7 @@ impl LamellarTeamRT { } //#[tracing::instrument(skip_all)] - pub fn exec_am_all( - self: &Pin>, - am: F, - ) -> Box> + pub fn exec_am_all(self: &Pin>, am: F) -> MultiAmHandle where F: RemoteActiveMessage + LamellarAM + AmDist, { @@ -1376,7 +1373,7 @@ impl LamellarTeamRT { self: &Pin>, am: F, task_group_cnts: Option>, - ) -> Box> + ) -> MultiAmHandle where F: RemoteActiveMessage + LamellarAM + crate::Serialize + 'static, { @@ -1390,7 +1387,7 @@ impl LamellarTeamRT { } None => None, }; - let req = Arc::new(LamellarMultiRequestHandleInner { + let req = Arc::new(MultiAmHandleInner { cnt: AtomicUsize::new(self.num_pes), arch: self.arch.clone(), data: Mutex::new(HashMap::new()), @@ -1398,10 +1395,10 @@ impl LamellarTeamRT { team_outstanding_reqs: self.team_counters.outstanding_reqs.clone(), world_outstanding_reqs: self.world_counters.outstanding_reqs.clone(), tg_outstanding_reqs: tg_outstanding_reqs.clone(), - user_handle: AtomicBool::new(true), + user_handle: AtomicU8::new(1), scheduler: self.scheduler.clone(), }); - let req_result = Arc::new(LamellarRequestResult { req: req.clone() }); + let req_result = Arc::new(LamellarRequestResult::MultiAm(req.clone())); let req_ptr = Arc::into_raw(req_result); for _ in 0..(self.num_pes - 1) { // -1 because of the arc we turned into raw @@ -1436,10 +1433,10 @@ impl LamellarTeamRT { // event!(Level::TRACE, "submitting request to scheduler"); // println!("[{:?}] team exec all", std::thread::current().id()); self.scheduler.submit_am(Am::All(req_data, func)); - Box::new(LamellarMultiRequestHandle { + MultiAmHandle { inner: req, _phantom: PhantomData, - }) + } } //#[tracing::instrument(skip_all)] @@ -1447,7 +1444,7 @@ impl LamellarTeamRT { self: &Pin>, am: F, task_group_cnts: Option>, - ) -> Box> + ) -> MultiAmHandle where F: RemoteActiveMessage + LamellarAM + crate::Serialize + 'static, O: AmDist + 'static, @@ -1462,7 +1459,7 @@ impl LamellarTeamRT { } None => None, }; - let req = Arc::new(LamellarMultiRequestHandleInner { + let req = Arc::new(MultiAmHandleInner { cnt: AtomicUsize::new(self.num_pes), arch: self.arch.clone(), data: Mutex::new(HashMap::new()), @@ -1470,10 +1467,10 @@ impl 
LamellarTeamRT { team_outstanding_reqs: self.team_counters.outstanding_reqs.clone(), world_outstanding_reqs: self.world_counters.outstanding_reqs.clone(), tg_outstanding_reqs: tg_outstanding_reqs.clone(), - user_handle: AtomicBool::new(true), + user_handle: AtomicU8::new(1), scheduler: self.scheduler.clone(), }); - let req_result = Arc::new(LamellarRequestResult { req: req.clone() }); + let req_result = Arc::new(LamellarRequestResult::MultiAm(req.clone())); let req_ptr = Arc::into_raw(req_result); for _ in 0..(self.num_pes - 1) { // -1 because of the arc we turned into raw @@ -1508,18 +1505,14 @@ impl LamellarTeamRT { // event!(Level::TRACE, "submitting request to scheduler"); // println!("[{:?}] team am group exec all", std::thread::current().id()); self.scheduler.submit_am(Am::All(req_data, func)); - Box::new(LamellarMultiRequestHandle { + MultiAmHandle { inner: req, _phantom: PhantomData, - }) + } } //#[tracing::instrument(skip_all)] - pub fn exec_am_pe( - self: &Pin>, - pe: usize, - am: F, - ) -> Box> + pub fn exec_am_pe(self: &Pin>, pe: usize, am: F) -> AmHandle where F: RemoteActiveMessage + LamellarAM + AmDist, { @@ -1532,7 +1525,7 @@ impl LamellarTeamRT { pe: usize, am: F, task_group_cnts: Option>, - ) -> Box> + ) -> AmHandle where F: RemoteActiveMessage + LamellarAM + crate::Serialize + 'static, { @@ -1546,17 +1539,17 @@ impl LamellarTeamRT { }; assert!(pe < self.arch.num_pes()); - let req = Arc::new(LamellarRequestHandleInner { + let req = Arc::new(AmHandleInner { ready: AtomicBool::new(false), data: Cell::new(None), waker: Mutex::new(None), team_outstanding_reqs: self.team_counters.outstanding_reqs.clone(), world_outstanding_reqs: self.world_counters.outstanding_reqs.clone(), tg_outstanding_reqs: tg_outstanding_reqs.clone(), - user_handle: AtomicBool::new(true), + user_handle: AtomicU8::new(1), scheduler: self.scheduler.clone(), }); - let req_result = Arc::new(LamellarRequestResult { req: req.clone() }); + let req_result = Arc::new(LamellarRequestResult::Am(req.clone())); let req_ptr = Arc::into_raw(req_result); // Arc::increment_strong_count(req_ptr); //we would need to do this for the exec_all command let id = ReqId { @@ -1593,10 +1586,15 @@ impl LamellarTeamRT { // println!("[{:?}] team exec am pe tg", std::thread::current().id()); self.scheduler.submit_am(Am::Remote(req_data, func)); - Box::new(LamellarRequestHandle { + // Box::new(LamellarRequestHandle { + // inner: req, + // _phantom: PhantomData, + // }) + AmHandle { inner: req, _phantom: PhantomData, - }) + } + .into() } //#[tracing::instrument(skip_all)] @@ -1605,7 +1603,7 @@ impl LamellarTeamRT { pe: usize, am: F, task_group_cnts: Option>, - ) -> Box> + ) -> AmHandle where F: RemoteActiveMessage + LamellarAM + crate::Serialize + 'static, O: AmDist + 'static, @@ -1620,17 +1618,17 @@ impl LamellarTeamRT { }; assert!(pe < self.arch.num_pes()); - let req = Arc::new(LamellarRequestHandleInner { + let req = Arc::new(AmHandleInner { ready: AtomicBool::new(false), data: Cell::new(None), waker: Mutex::new(None), team_outstanding_reqs: self.team_counters.outstanding_reqs.clone(), world_outstanding_reqs: self.world_counters.outstanding_reqs.clone(), tg_outstanding_reqs: tg_outstanding_reqs.clone(), - user_handle: AtomicBool::new(true), + user_handle: AtomicU8::new(1), scheduler: self.scheduler.clone(), }); - let req_result = Arc::new(LamellarRequestResult { req: req.clone() }); + let req_result = Arc::new(LamellarRequestResult::Am(req.clone())); let req_ptr = Arc::into_raw(req_result); // Arc::increment_strong_count(req_ptr); 
//we would need to do this for the exec_all command let id = ReqId { @@ -1670,10 +1668,14 @@ impl LamellarTeamRT { // ); self.scheduler.submit_am(Am::Remote(req_data, func)); - Box::new(LamellarRequestHandle { + // Box::new(LamellarRequestHandle { + // inner: req, + // _phantom: PhantomData, + // }) + AmHandle { inner: req, _phantom: PhantomData, - }) + } } //#[tracing::instrument(skip_all)] @@ -1681,7 +1683,7 @@ impl LamellarTeamRT { self: &Pin>, am: LamellarArcAm, task_group_cnts: Option>, - ) -> Box> + ) -> MultiAmHandle where F: AmDist, { @@ -1693,7 +1695,7 @@ impl LamellarTeamRT { } None => None, }; - let req = Arc::new(LamellarMultiRequestHandleInner { + let req = Arc::new(MultiAmHandleInner { cnt: AtomicUsize::new(self.num_pes), arch: self.arch.clone(), waker: Mutex::new(None), @@ -1701,10 +1703,10 @@ impl LamellarTeamRT { team_outstanding_reqs: self.team_counters.outstanding_reqs.clone(), world_outstanding_reqs: self.world_counters.outstanding_reqs.clone(), tg_outstanding_reqs: tg_outstanding_reqs.clone(), - user_handle: AtomicBool::new(true), + user_handle: AtomicU8::new(1), scheduler: self.scheduler.clone(), }); - let req_result = Arc::new(LamellarRequestResult { req: req.clone() }); + let req_result = Arc::new(LamellarRequestResult::MultiAm(req.clone())); let req_ptr = Arc::into_raw(req_result); for _ in 0..(self.num_pes - 1) { // -1 because of the arc we turned into raw @@ -1739,10 +1741,10 @@ impl LamellarTeamRT { // ); self.scheduler.submit_am(Am::All(req_data, am)); - Box::new(LamellarMultiRequestHandle { + MultiAmHandle { inner: req, _phantom: PhantomData, - }) + } } //#[tracing::instrument(skip_all)] @@ -1751,7 +1753,7 @@ impl LamellarTeamRT { pe: usize, am: LamellarArcAm, task_group_cnts: Option>, - ) -> Box> + ) -> AmHandle where F: AmDist, { @@ -1764,17 +1766,17 @@ impl LamellarTeamRT { None => None, }; assert!(pe < self.arch.num_pes()); - let req = Arc::new(LamellarRequestHandleInner { + let req = Arc::new(AmHandleInner { ready: AtomicBool::new(false), data: Cell::new(None), waker: Mutex::new(None), team_outstanding_reqs: self.team_counters.outstanding_reqs.clone(), world_outstanding_reqs: self.world_counters.outstanding_reqs.clone(), tg_outstanding_reqs: tg_outstanding_reqs.clone(), - user_handle: AtomicBool::new(true), + user_handle: AtomicU8::new(1), scheduler: self.scheduler.clone(), }); - let req_result = Arc::new(LamellarRequestResult { req: req.clone() }); + let req_result = Arc::new(LamellarRequestResult::Am(req.clone())); let req_ptr = Arc::into_raw(req_result); let id = ReqId { id: req_ptr as usize, @@ -1802,10 +1804,15 @@ impl LamellarTeamRT { // println!("[{:?}] team arc exec am pe", std::thread::current().id()); self.scheduler.submit_am(Am::Remote(req_data, am)); - Box::new(LamellarRequestHandle { + // Box::new(LamellarRequestHandle { + // inner: req, + // _phantom: PhantomData, + // }) + AmHandle { inner: req, _phantom: PhantomData, - }) + } + .into() } //#[tracing::instrument(skip_all)] @@ -1833,7 +1840,7 @@ impl LamellarTeamRT { // team_outstanding_reqs: self.team_counters.outstanding_reqs.clone(), // world_outstanding_reqs: self.world_counters.outstanding_reqs.clone(), // tg_outstanding_reqs: tg_outstanding_reqs.clone(), - // user_handle: AtomicBool::new(true), + // user_handle: AtomicU8::new(1), // scheduler: self.scheduler.clone(), // }); // let req_result = Arc::new(LamellarRequestResult { req: req.clone() }); @@ -1874,10 +1881,7 @@ impl LamellarTeamRT { // } //#[tracing::instrument(skip_all)] - pub fn exec_am_local( - self: &Pin>, - am: F, - ) -> 
Box> + pub fn exec_am_local(self: &Pin>, am: F) -> LocalAmHandle where F: LamellarActiveMessage + LocalAM + 'static, { @@ -1889,7 +1893,7 @@ impl LamellarTeamRT { self: &Pin>, am: F, task_group_cnts: Option>, - ) -> Box> + ) -> LocalAmHandle where F: LamellarActiveMessage + LocalAM + 'static, { @@ -1901,17 +1905,17 @@ impl LamellarTeamRT { } None => None, }; - let req = Arc::new(LamellarLocalRequestHandleInner { - ready: (Mutex::new(false), Condvar::new()), + let req = Arc::new(AmHandleInner { + ready: AtomicBool::new(false), data: Cell::new(None), waker: Mutex::new(None), team_outstanding_reqs: self.team_counters.outstanding_reqs.clone(), world_outstanding_reqs: self.world_counters.outstanding_reqs.clone(), tg_outstanding_reqs: tg_outstanding_reqs.clone(), - user_handle: AtomicBool::new(true), + user_handle: AtomicU8::new(1), scheduler: self.scheduler.clone(), }); - let req_result = Arc::new(LamellarRequestResult { req: req.clone() }); + let req_result = Arc::new(LamellarRequestResult::Am(req.clone())); let req_ptr = Arc::into_raw(req_result); let id = ReqId { id: req_ptr as usize, @@ -1941,10 +1945,14 @@ impl LamellarTeamRT { // println!("[{:?}] team exec am local", std::thread::current().id()); self.scheduler.submit_am(Am::Local(req_data, func)); - Box::new(LamellarLocalRequestHandle { + // Box::new(LamellarLocalRequestHandle { + // inner: req, + // _phantom: PhantomData, + // }) + LocalAmHandle { inner: req, _phantom: PhantomData, - }) + } } /// allocate a shared memory region from the asymmetric heap /// diff --git a/src/lamellar_world.rs b/src/lamellar_world.rs index eec53b60..2ac4f6ad 100644 --- a/src/lamellar_world.rs +++ b/src/lamellar_world.rs @@ -11,11 +11,10 @@ use crate::scheduler::{create_scheduler, ExecutorType}; //use tracing::*; -use futures::Future; +use futures_util::Future; use parking_lot::RwLock; use pin_weak::sync::PinWeak; use std::collections::HashMap; -use std::pin::Pin; use std::sync::atomic::{AtomicBool, AtomicU8, AtomicUsize, Ordering}; use std::sync::Arc; @@ -44,15 +43,18 @@ pub struct LamellarWorld { } impl ActiveMessaging for LamellarWorld { + type SinglePeAmHandle = AmHandle; + type MultiAmHandle = MultiAmHandle; + type LocalAmHandle = LocalAmHandle; //#[tracing::instrument(skip_all)] - fn exec_am_all(&self, am: F) -> Pin> + Send>> + fn exec_am_all(&self, am: F) -> Self::MultiAmHandle where F: RemoteActiveMessage + LamellarAM + Serde + AmDist, { self.team.exec_am_all(am) } //#[tracing::instrument(skip_all)] - fn exec_am_pe(&self, pe: usize, am: F) -> Pin + Send>> + fn exec_am_pe(&self, pe: usize, am: F) -> Self::SinglePeAmHandle where F: RemoteActiveMessage + LamellarAM + Serde + AmDist, { @@ -60,7 +62,7 @@ impl ActiveMessaging for LamellarWorld { self.team.exec_am_pe(pe, am) } //#[tracing::instrument(skip_all)] - fn exec_am_local(&self, am: F) -> Pin + Send>> + fn exec_am_local(&self, am: F) -> Self::LocalAmHandle where F: LamellarActiveMessage + LocalAM + 'static, { diff --git a/src/lib.rs b/src/lib.rs index 099e09e5..088c1758 100755 --- a/src/lib.rs +++ b/src/lib.rs @@ -253,7 +253,7 @@ pub use parking_lot; pub use async_trait; #[doc(hidden)] -pub use futures; +pub use futures_util; pub mod active_messaging; #[doc(hidden)] @@ -285,7 +285,6 @@ pub use utils::*; pub use crate::lamellae::Backend; pub use crate::lamellar_arch::{BlockedArch, IdError, LamellarArch, StridedArch}; #[doc(hidden)] -pub use crate::lamellar_request::LamellarRequest; pub use crate::lamellar_task_group::{ AmGroup, AmGroupResult, BaseAmGroupReq, LamellarTaskGroup, TypedAmGroupBatchReq, 
TypedAmGroupBatchResult, TypedAmGroupResult, diff --git a/src/scheduler.rs b/src/scheduler.rs index 5758d315..82f44cf6 100755 --- a/src/scheduler.rs +++ b/src/scheduler.rs @@ -6,7 +6,7 @@ use crate::active_messaging::*; use crate::lamellae::{Des, Lamellae, SerializedData}; use enum_dispatch::enum_dispatch; -use futures::Future; +use futures_util::Future; use std::sync::atomic::{AtomicU8, AtomicUsize, Ordering}; use std::sync::Arc; diff --git a/src/scheduler/tokio_executor.rs b/src/scheduler/tokio_executor.rs index 814e57db..de44e311 100644 --- a/src/scheduler/tokio_executor.rs +++ b/src/scheduler/tokio_executor.rs @@ -2,7 +2,7 @@ use crate::scheduler::LamellarExecutor; use tokio::runtime::Runtime; -use futures::Future; +use futures_util::Future; #[derive(Debug)] pub(crate) struct TokioRt { @@ -17,7 +17,7 @@ impl LamellarExecutor for TokioRt { F::Output: Send, { // trace_span!("submit_task").in_scope(|| { - self.rt.spawn(async move { task.await }); + self.rt.spawn(async move { task.await }); // }); } @@ -27,13 +27,13 @@ impl LamellarExecutor for TokioRt { F::Output: Send, { // trace_span!("submit_task").in_scope(|| { - self.rt.spawn(async move { task.await }); + self.rt.spawn(async move { task.await }); // }); } fn block_on(&self, task: F) -> F::Output { - // trace_span!("block_on").in_scope(|| - self.rt.block_on(task) + // trace_span!("block_on").in_scope(|| + self.rt.block_on(task) // ) } diff --git a/src/scheduler/work_stealing.rs b/src/scheduler/work_stealing.rs index a5a6738c..b53651d9 100644 --- a/src/scheduler/work_stealing.rs +++ b/src/scheduler/work_stealing.rs @@ -5,13 +5,16 @@ use crate::scheduler::{LamellarExecutor, SchedulerStatus}; use async_task::{Builder, Runnable}; use core_affinity::CoreId; use crossbeam::deque::Worker; -use futures::Future; -use futures_lite::FutureExt; +use futures_util::Future; use rand::prelude::*; use std::panic; +use std::pin::Pin; use std::process; use std::sync::atomic::{AtomicU8, AtomicUsize, Ordering}; -use std::sync::Arc; //, Weak}; +use std::sync::Arc; +use std::task::Context; +use std::task::Poll; +//, Weak}; use std::thread; static TASK_ID: AtomicUsize = AtomicUsize::new(0); @@ -160,22 +163,22 @@ impl LamellarExecutor for WorkStealing { // }); } - fn block_on(&self, task: F) -> F::Output { + fn block_on(&self, fut: F) -> F::Output { // trace_span!("block_on").in_scope(|| { let work_inj = self.work_inj.clone(); let schedule = move |runnable| work_inj.push(runnable); let (runnable, mut task) = unsafe { Builder::new() .metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)) - .spawn_unchecked(move |_task_id| async move { task.await }, schedule) + .spawn_unchecked(move |_task_id| async move { fut.await }, schedule) }; let waker = runnable.waker(); runnable.run(); //try to run immediately while !task.is_finished() { self.exec_task(); //try to execute another task while this one is not ready } - let cx = &mut async_std::task::Context::from_waker(&waker); - if let async_std::task::Poll::Ready(output) = task.poll(cx) { + let cx = &mut Context::from_waker(&waker); + if let Poll::Ready(output) = Pin::new(&mut task).poll(cx) { output } else { println!( diff --git a/tests/array/arithmetic_ops/add_test.rs b/tests/array/arithmetic_ops/add_test.rs index 9f73175a..a7e58549 100644 --- a/tests/array/arithmetic_ops/add_test.rs +++ b/tests/array/arithmetic_ops/add_test.rs @@ -100,7 +100,7 @@ macro_rules! 
add_test{ let val = *elem; check_val!($array,val,max_val,success); if !success{ - println!("full_0 {:?} {:?} {:?}",i,val,max_val); + eprintln!("full_0 {:?} {:?} {:?}",i,val,max_val); } } if !success{ @@ -128,7 +128,7 @@ macro_rules! add_test{ let val = *elem; check_val!($array,val,max_val,success); if !success{ - println!("full_1 {:?} {:?} {:?}",i,val,max_val); + eprintln!("full_1 {:?} {:?} {:?}",i,val,max_val); } } if !success{ @@ -164,7 +164,7 @@ macro_rules! add_test{ let val = *elem; check_val!($array,val,max_val,success); if !success{ - println!("half_0 {:?} {:?} {:?}",i,val,max_val); + eprintln!("half_0 {:?} {:?} {:?}",i,val,max_val); } } array.wait_all(); @@ -189,7 +189,7 @@ macro_rules! add_test{ let val = *elem; check_val!($array,val,max_val,success); if !success{ - println!("half_1 {:?} {:?} {:?}",i,val,max_val); + eprintln!("half_1 {:?} {:?} {:?}",i,val,max_val); } } if !success{ @@ -226,7 +226,7 @@ macro_rules! add_test{ let val = *elem; check_val!($array,val,max_val,success); if !success{ - println!("small_0 {:?} {:?} {:?}",i,val,max_val); + eprintln!("small_0 {:?} {:?} {:?}",i,val,max_val); } } array.wait_all(); @@ -251,7 +251,7 @@ macro_rules! add_test{ let val = *elem; check_val!($array,val,max_val,success); if !success{ - println!("small_1 {:?} {:?} {:?}",i,val,max_val); + eprintln!("small_1 {:?} {:?} {:?}",i,val,max_val); } } if !success{ @@ -287,11 +287,12 @@ macro_rules! check_results { let val = *elem; check_val!($array_ty, val, $num_pes, success); if !success { - println!("input {:?}: {:?} {:?} {:?}", $test, i, val, $num_pes); + eprintln!("input {:?}: {:?} {:?} {:?}", $test, i, val, $num_pes); } } if !success { - $array.print(); + eprintln!("failed test {:?}", $test); + // $array.print(); } $array.barrier(); let init_val = 0; @@ -317,10 +318,16 @@ macro_rules! input_test{ #[allow(unused_unsafe)] unsafe { if $dist == lamellar::array::Distribution::Block{ - let _ = input_array.dist_iter_mut().enumerate().for_each(move |(i,x)| {println!("i: {:?}",i);*x = i%array_total_len}); + let _ = input_array.dist_iter_mut().enumerate().for_each(move |(i,x)| { + // println!("i: {:?}",i); + *x = i%array_total_len} + ); } else{ - let _ = input_array.dist_iter_mut().enumerate().for_each(move |(i,x)| {println!("i: {:?}",i);*x = i/num_pes}); + let _ = input_array.dist_iter_mut().enumerate().for_each(move |(i,x)| { + //println!("i: {:?}",i); + *x = i/num_pes} + ); } } input_array.wait_all(); diff --git a/tests/array/arithmetic_ops/div_test.rs b/tests/array/arithmetic_ops/div_test.rs index 47948887..04968867 100644 --- a/tests/array/arithmetic_ops/div_test.rs +++ b/tests/array/arithmetic_ops/div_test.rs @@ -90,7 +90,7 @@ macro_rules! div_test{ let val = *elem; check_val!($array,val,one,success); if !success{ - println!("full {:?} {:?} {:?}",i,val,one); + eprintln!("full {:?} {:?} {:?}",i,val,one); } } @@ -116,7 +116,7 @@ macro_rules! div_test{ let val = *elem; check_val!($array,val,one,success); if !success{ - println!("half {:?} {:?} {:?}",i,val,one); + eprintln!("half {:?} {:?} {:?}",i,val,one); } } sub_array.barrier(); @@ -142,7 +142,7 @@ macro_rules! 
div_test{ let val = *elem; check_val!($array,val,one,success); if !success{ - println!("pe {:?} {:?} {:?}",i,val,one); + eprintln!("pe {:?} {:?} {:?}",i,val,one); } } sub_array.barrier(); diff --git a/tests/array/arithmetic_ops/fetch_add_test.rs b/tests/array/arithmetic_ops/fetch_add_test.rs index 80ce4761..5c3edf99 100644 --- a/tests/array/arithmetic_ops/fetch_add_test.rs +++ b/tests/array/arithmetic_ops/fetch_add_test.rs @@ -4,6 +4,8 @@ use lamellar::memregion::prelude::*; use rand::distributions::Distribution; use rand::distributions::Uniform; +use std::ops::Deref; + macro_rules! initialize_array { (UnsafeArray,$array:ident,$init_val:ident) => { let _ = unsafe { $array.dist_iter_mut().for_each(move |x| *x = $init_val) }; @@ -11,10 +13,10 @@ macro_rules! initialize_array { $array.barrier(); }; (AtomicArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter().enumerate().for_each(move |(_i, x)| { - // println!("{:?} {:?}", i, x.load()); - x.store($init_val) - }); + let _ = $array + .dist_iter() + .enumerate() + .for_each(move |(_i, x)| x.store($init_val)); $array.wait_all(); $array.barrier(); }; @@ -110,7 +112,7 @@ macro_rules! fetch_add_test{ for req in reqs{ let val = world.block_on(req) as u128; if ! insert_prev!($array,val,prevs){ - println!("full 1: {:?} {:?}",val,prevs); + eprintln!("full 1: {:?} {:?}",val,prevs); success = false; } } @@ -122,7 +124,7 @@ macro_rules! fetch_add_test{ let val = *elem; check_val!($array,val,max_val,success); if !success{ - println!("full 2: {:?} {:?} {:?}",i,val,max_val); + eprintln!("full 2: {:?} {:?} {:?}",i,val,max_val); } } array.barrier(); @@ -146,7 +148,7 @@ macro_rules! fetch_add_test{ let tot_updates = num_updates * num_pes; check_val!($array,sum,tot_updates,success); if !success{ - println!("full 4: {:?} {:?}",sum,tot_updates); + eprintln!("full 4: {:?} {:?}",sum,tot_updates); } world.wait_all(); world.barrier(); @@ -171,7 +173,7 @@ macro_rules! fetch_add_test{ for req in reqs{ let val = world.block_on(req) as u128; if ! insert_prev!($array,val,prevs){ - println!("half 1: {:?} {:?}",val,prevs); + eprintln!("half 1: {:?} {:?}",val,prevs); success = false; } } @@ -183,7 +185,7 @@ macro_rules! fetch_add_test{ let val = *elem; check_val!($array,val,max_val,success); if !success{ - println!("half 2: {:?} {:?} {:?}",i,val,max_val); + eprintln!("half 2: {:?} {:?} {:?}",i,val,max_val); } } array.barrier(); @@ -206,7 +208,7 @@ macro_rules! fetch_add_test{ let tot_updates = num_updates * num_pes; check_val!($array,sum,tot_updates,success); if !success{ - println!("half 4: {:?} {:?}",sum,tot_updates); + eprintln!("half 4: {:?} {:?}",sum,tot_updates); } array.wait_all(); array.barrier(); @@ -234,7 +236,7 @@ macro_rules! fetch_add_test{ for req in reqs{ let val = world.block_on(req) as u128; if ! insert_prev!($array,val,prevs){ - println!("pe 1: {:?} {:?}",val,prevs); + eprintln!("pe 1: {:?} {:?}",val,prevs); success = false; } } @@ -246,7 +248,7 @@ macro_rules! fetch_add_test{ let val = *elem; check_val!($array,val,max_val,success); if !success{ - println!("pe 2 {:?} {:?} {:?}",i,val,max_val); + eprintln!("pe 2 {:?} {:?} {:?}",i,val,max_val); } } sub_array.barrier(); @@ -269,7 +271,7 @@ macro_rules! fetch_add_test{ let tot_updates = num_updates * num_pes; check_val!($array,sum,tot_updates,success); if !success{ - println!("pe 4 {:?} {:?}",sum,tot_updates); + eprintln!("pe 4 {:?} {:?}",sum,tot_updates); } sub_array.wait_all(); sub_array.barrier(); @@ -345,19 +347,18 @@ macro_rules! 
check_results { else{ $num_pes + $real_val }; - // println!("return i: {:?} j: {:?} check_val: {:?} val: {:?}, real_val: {:?}", i, j, check_val, res, $real_val); if !(res >= &0 && res < &check_val) { success = false; - println!("return i: {:?} j: {:?} check_val: {:?} val: {:?}, real_val: {:?}", i, j, check_val, res, $real_val); - break; + eprintln!("return i: {:?} j: {:?} check_val: {:?} val: {:?}, real_val: {:?}", i, j, check_val, res, $real_val); + // break; } req_cnt+=1; } } // println!("here"); #[allow(unused_unsafe)] - for (i, elem) in unsafe { $array.onesided_iter().into_iter().enumerate() }{ + for (i, elem) in unsafe { $array.buffered_onesided_iter($array.len()).into_iter().enumerate() }{ let val = *elem; let real_val = if $real_val == 0 { i + $num_pes @@ -374,14 +375,11 @@ macro_rules! check_results { // println!("val {:?} real_val {:?}", val, real_val); check_val!($array_ty, real_val, val, success); if !success { - println!("input {:?}: {:?} {:?} {:?}", $test, i, val, real_val); + eprintln!("input {:?}: {:?} {:?} {:?}", $test, i, val, real_val); break; } } // println!("here2"); - if !success { - $array.print(); - } $array.barrier(); // let init_val = 0; initialize_array2!($array_ty, $array, init_val); @@ -515,6 +513,7 @@ macro_rules! input_test{ // ReadOnlyArray------------------------------ // let mut reqs = vec![]; let input_array = input_array.into_read_only(); + // println!("read only array len: {:?}", input_array.len()); // reqs.push(array.fetch_add(input_array.clone(),1)); // check_results!($array,array,num_pes,reqs,"ReadOnlyArray"); // ReadOnlyArray------------------------------ @@ -525,6 +524,7 @@ macro_rules! input_test{ // AtomicArray------------------------------ // let mut reqs = vec![]; let input_array = input_array.into_atomic(); + // println!("atomic array len: {:?}", input_array.len()); // reqs.push(array.fetch_add(input_array.clone(),1)); // check_results!($array,array,num_pes,reqs,"AtomicArray"); // AtomicArray------------------------------ @@ -535,16 +535,21 @@ macro_rules! input_test{ // LocalLockArray------------------------------ // let mut reqs = vec![]; let input_array = input_array.into_local_lock(); + // println!("local lock array len: {:?}", input_array.len()); // reqs.push(array.fetch_add(input_array.clone(),1)); // check_results!($array,array,num_pes,reqs,"LocalLockArray"); // LocalLockArray------------------------------ let mut reqs = vec![]; - reqs.push(array.batch_fetch_add(&input_array.blocking_read_local_data(),1)); + let local_data = input_array.blocking_read_local_data(); + // println!("local lock array len: {:?}", local_data.deref()); + reqs.push(array.batch_fetch_add(&local_data,1)); + drop(local_data); check_results!($array,array,num_pes,reqs,"&LocalLockArray"); // GlobalLockArray------------------------------ // let mut reqs = vec![]; let input_array = input_array.into_global_lock(); + // println!("global lock array len: {:?}", input_array.len()); // reqs.push(array.fetch_add(input_array.clone(),1)); // check_results!($array,array,num_pes,reqs,"GlobalLockArray"); // GlobalLockArray------------------------------ diff --git a/tests/array/arithmetic_ops/fetch_div_test.rs b/tests/array/arithmetic_ops/fetch_div_test.rs index 855c5072..bf3e379a 100644 --- a/tests/array/arithmetic_ops/fetch_div_test.rs +++ b/tests/array/arithmetic_ops/fetch_div_test.rs @@ -113,7 +113,7 @@ macro_rules! fetch_div_test{ for req in reqs{ let val = world.block_on(req) as u128; if ! 
insert_prev!($array,val,prevs){ - println!("full 1: {:?} {:?} {:?}",init_val,val,prevs); + eprintln!("full 1: {:?} {:?} {:?}",init_val,val,prevs); success = false; break; } @@ -127,7 +127,7 @@ macro_rules! fetch_div_test{ let val = *elem; check_val!($array,val,one,success); if !success{ - println!("{:?} {:?} {:?}",i,val,one); + eprintln!("{:?} {:?} {:?}",i,val,one); break; } } @@ -152,7 +152,7 @@ macro_rules! fetch_div_test{ for req in reqs{ let val = world.block_on(req) as u128; if ! insert_prev!($array,val,prevs){ - println!("half 1: {:?} {:?}",val,prevs); + eprintln!("half 1: {:?} {:?}",val,prevs); success = false; break; } @@ -165,7 +165,7 @@ macro_rules! fetch_div_test{ let val = *elem; check_val!($array,val,one,success); if !success{ - println!("{:?} {:?} {:?}",i,val,one); + eprintln!("{:?} {:?} {:?}",i,val,one); break; } } @@ -190,7 +190,7 @@ macro_rules! fetch_div_test{ for req in reqs{ let val = world.block_on(req) as u128; if ! insert_prev!($array,val,prevs){ - println!("pe 1: {:?} {:?}",val,prevs); + eprintln!("pe 1: {:?} {:?}",val,prevs); success = false; break; } @@ -203,7 +203,7 @@ macro_rules! fetch_div_test{ let val = *elem; check_val!($array,val,one,success); if !success{ - println!("{:?} {:?} {:?}",i,val,one); + eprintln!("{:?} {:?} {:?}",i,val,one); break; } } diff --git a/tests/array/arithmetic_ops/fetch_mul_test.rs b/tests/array/arithmetic_ops/fetch_mul_test.rs index 94bc1c55..c789d97c 100644 --- a/tests/array/arithmetic_ops/fetch_mul_test.rs +++ b/tests/array/arithmetic_ops/fetch_mul_test.rs @@ -109,7 +109,7 @@ macro_rules! fetch_mul_test{ for req in reqs{ let val = world.block_on(req) as u128; if ! insert_prev!($array,val,prevs){ - println!("full 1: {:?} {:?} {:?}",init_val,val,prevs); + eprintln!("full 1: {:?} {:?} {:?}",init_val,val,prevs); success = false; break; } @@ -123,7 +123,7 @@ macro_rules! fetch_mul_test{ let val = *elem; check_val!($array,val,max_val,success); if !success{ - println!("{:?} {:?} {:?}",i,val,max_val); + eprintln!("{:?} {:?} {:?}",i,val,max_val); } } @@ -147,7 +147,7 @@ macro_rules! fetch_mul_test{ for req in reqs{ let val = world.block_on(req) as u128; if ! insert_prev!($array,val,prevs){ - println!("half 1: {:?} {:?}",val,prevs); + eprintln!("half 1: {:?} {:?}",val,prevs); success = false; break; } @@ -160,7 +160,7 @@ macro_rules! fetch_mul_test{ let val = *elem; check_val!($array,val,max_val,success); if !success{ - println!("{:?} {:?} {:?}",i,val,max_val); + eprintln!("{:?} {:?} {:?}",i,val,max_val); } } sub_array.barrier(); @@ -183,7 +183,7 @@ macro_rules! fetch_mul_test{ for req in reqs{ let val = world.block_on(req) as u128; if ! insert_prev!($array,val,prevs){ - println!("pe 1: {:?} {:?}",val,prevs); + eprintln!("pe 1: {:?} {:?}",val,prevs); success = false; break; } @@ -196,7 +196,7 @@ macro_rules! fetch_mul_test{ let val = *elem; check_val!($array,val,max_val,success); if !success{ - println!("{:?} {:?} {:?}",i,val,max_val); + eprintln!("{:?} {:?} {:?}",i,val,max_val); } } sub_array.barrier(); diff --git a/tests/array/arithmetic_ops/fetch_rem_test.rs b/tests/array/arithmetic_ops/fetch_rem_test.rs index f66c43e4..32d6d4c1 100644 --- a/tests/array/arithmetic_ops/fetch_rem_test.rs +++ b/tests/array/arithmetic_ops/fetch_rem_test.rs @@ -113,7 +113,7 @@ macro_rules! fetch_rem_test{ for req in reqs{ let val = world.block_on(req) as u128; if ! 
insert_prev!($array,val,prevs){ - println!("full 1: {:?} {:?} {:?}",init_val,val,prevs); + eprintln!("full 1: {:?} {:?} {:?}",init_val,val,prevs); success = false; break; } @@ -127,7 +127,7 @@ macro_rules! fetch_rem_test{ let val = *elem; check_val!($array,val,one,success); if !success{ - println!("{:?} {:?} {:?}",i,val,one); + eprintln!("{:?} {:?} {:?}",i,val,one); break; } } @@ -152,7 +152,7 @@ macro_rules! fetch_rem_test{ for req in reqs{ let val = world.block_on(req) as u128; if ! insert_prev!($array,val,prevs){ - println!("half 1: {:?} {:?}",val,prevs); + eprintln!("half 1: {:?} {:?}",val,prevs); success = false; break; } @@ -165,7 +165,7 @@ macro_rules! fetch_rem_test{ let val = *elem; check_val!($array,val,one,success); if !success{ - println!("{:?} {:?} {:?}",i,val,one); + eprintln!("{:?} {:?} {:?}",i,val,one); break; } } @@ -190,7 +190,7 @@ macro_rules! fetch_rem_test{ for req in reqs{ let val = world.block_on(req) as u128; if ! insert_prev!($array,val,prevs){ - println!("pe 1: {:?} {:?}",val,prevs); + eprintln!("pe 1: {:?} {:?}",val,prevs); success = false; break; } @@ -203,7 +203,7 @@ macro_rules! fetch_rem_test{ let val = *elem; check_val!($array,val,one,success); if !success{ - println!("{:?} {:?} {:?}",i,val,one); + eprintln!("{:?} {:?} {:?}",i,val,one); break; } } diff --git a/tests/array/arithmetic_ops/fetch_sub_test.rs b/tests/array/arithmetic_ops/fetch_sub_test.rs index a4d7f340..a1615ce1 100644 --- a/tests/array/arithmetic_ops/fetch_sub_test.rs +++ b/tests/array/arithmetic_ops/fetch_sub_test.rs @@ -109,7 +109,7 @@ macro_rules! fetch_sub_test{ for req in reqs{ let val = world.block_on(req) as u128; if ! insert_prev!($array,val,prevs){ - println!("full 1: {:?} {:?} {:?}",init_val,val,prevs); + eprintln!("full 1: {:?} {:?} {:?}",init_val,val,prevs); success = false; break; } @@ -121,7 +121,7 @@ macro_rules! fetch_sub_test{ let val = *elem; check_val!($array,val,zero,success); if !success{ - println!("{:?} {:?} {:?}",i,val,max_val); + eprintln!("{:?} {:?} {:?}",i,val,max_val); } } array.barrier(); @@ -148,7 +148,7 @@ macro_rules! fetch_sub_test{ let calced_sum = tot_updates as usize * (array.len()-1); check_val!($array,sum,calced_sum,success); if !success{ - println!("{:?} {:?} {:?}",sum,calced_sum,(array.len()-1)); + eprintln!("{:?} {:?} {:?}",sum,calced_sum,(array.len()-1)); } world.wait_all(); world.barrier(); @@ -172,7 +172,7 @@ macro_rules! fetch_sub_test{ for req in reqs{ let val = world.block_on(req) as u128; if ! insert_prev!($array,val,prevs){ - println!("half 1: {:?} {:?}",val,prevs); + eprintln!("half 1: {:?} {:?}",val,prevs); success = false; break; } @@ -184,7 +184,7 @@ macro_rules! fetch_sub_test{ let val = *elem; check_val!($array,val,zero,success); if !success{ - println!("{:?} {:?} {:?}",i,val,max_val); + eprintln!("{:?} {:?} {:?}",i,val,max_val); } } sub_array.barrier(); @@ -209,7 +209,7 @@ macro_rules! fetch_sub_test{ let calced_sum = tot_updates as usize * (sub_array.len()-1); check_val!($array,sum,calced_sum,success); if !success{ - println!("{:?} {:?} {:?}",sum,calced_sum,(sub_array.len()-1)); + eprintln!("{:?} {:?} {:?}",sum,calced_sum,(sub_array.len()-1)); } sub_array.wait_all(); sub_array.barrier(); @@ -234,7 +234,7 @@ macro_rules! fetch_sub_test{ for req in reqs{ let val = world.block_on(req) as u128; if ! insert_prev!($array,val,prevs){ - println!("pe 1: {:?} {:?}",val,prevs); + eprintln!("pe 1: {:?} {:?}",val,prevs); success = false; break; } @@ -246,7 +246,7 @@ macro_rules! 
fetch_sub_test{ let val = *elem; check_val!($array,val,zero,success); if !success{ - println!("{:?} {:?} {:?}",i,val,max_val); + eprintln!("{:?} {:?} {:?}",i,val,max_val); } } sub_array.barrier(); @@ -271,7 +271,7 @@ macro_rules! fetch_sub_test{ let calced_sum = tot_updates as usize * (sub_array.len()-1); check_val!($array,sum,calced_sum,success); if !success{ - println!("{:?} {:?} {:?}",sum,calced_sum,(sub_array.len()-1)); + eprintln!("{:?} {:?} {:?}",sum,calced_sum,(sub_array.len()-1)); } sub_array.wait_all(); sub_array.barrier(); diff --git a/tests/array/arithmetic_ops/mul_test.rs b/tests/array/arithmetic_ops/mul_test.rs index 690861aa..1f62f176 100644 --- a/tests/array/arithmetic_ops/mul_test.rs +++ b/tests/array/arithmetic_ops/mul_test.rs @@ -97,7 +97,7 @@ macro_rules! mul_test{ let val = *elem; check_val!($array,val,max_val,success); if !success{ - println!("{:?} {:?} {:?}",i,val,max_val); + eprintln!("{:?} {:?} {:?}",i,val,max_val); } } @@ -123,7 +123,7 @@ macro_rules! mul_test{ let val = *elem; check_val!($array,val,max_val,success); if !success{ - println!("{:?} {:?} {:?}",i,val,max_val); + eprintln!("{:?} {:?} {:?}",i,val,max_val); } } sub_array.barrier(); @@ -149,7 +149,7 @@ macro_rules! mul_test{ let val = *elem; check_val!($array,val,max_val,success); if !success{ - println!("{:?} {:?} {:?}",i,val,max_val); + eprintln!("{:?} {:?} {:?}",i,val,max_val); } } sub_array.barrier(); diff --git a/tests/array/arithmetic_ops/rem_test.rs b/tests/array/arithmetic_ops/rem_test.rs index 4095cb80..62faabc8 100644 --- a/tests/array/arithmetic_ops/rem_test.rs +++ b/tests/array/arithmetic_ops/rem_test.rs @@ -90,7 +90,7 @@ macro_rules! rem_test{ let val = *elem; check_val!($array,val,one,success); if !success{ - println!("full {:?} {:?} {:?}",i,val,one); + eprintln!("full {:?} {:?} {:?}",i,val,one); } } @@ -116,7 +116,7 @@ macro_rules! rem_test{ let val = *elem; check_val!($array,val,one,success); if !success{ - println!("half {:?} {:?} {:?}",i,val,one); + eprintln!("half {:?} {:?} {:?}",i,val,one); } } sub_array.barrier(); @@ -142,7 +142,7 @@ macro_rules! rem_test{ let val = *elem; check_val!($array,val,one,success); if !success{ - println!("pe {:?} {:?} {:?}",i,val,one); + eprintln!("pe {:?} {:?} {:?}",i,val,one); } } sub_array.barrier(); diff --git a/tests/array/arithmetic_ops/sub_test.rs b/tests/array/arithmetic_ops/sub_test.rs index 2ce5405f..8bb3b8bc 100644 --- a/tests/array/arithmetic_ops/sub_test.rs +++ b/tests/array/arithmetic_ops/sub_test.rs @@ -95,7 +95,7 @@ macro_rules! sub_test{ let val = *elem; check_val!($array,val,zero,success); if !success{ - println!("{:?} {:?} {:?}",i,val,max_val); + eprintln!("{:?} {:?} {:?}",i,val,max_val); } } array.barrier(); @@ -116,7 +116,7 @@ macro_rules! sub_test{ let calced_sum = tot_updates as usize * (array.len()-1); check_val!($array,sum,calced_sum,success); if !success{ - println!("{:?} {:?} {:?}",sum,calced_sum,(array.len()-1)); + eprintln!("{:?} {:?} {:?}",sum,calced_sum,(array.len()-1)); } world.wait_all(); world.barrier(); @@ -143,7 +143,7 @@ macro_rules! sub_test{ let val = *elem; check_val!($array,val,zero,success); if !success{ - println!("{:?} {:?} {:?}",i,val,max_val); + eprintln!("{:?} {:?} {:?}",i,val,max_val); } } sub_array.barrier(); @@ -164,7 +164,7 @@ macro_rules! 
sub_test{ let calced_sum = tot_updates as usize * (sub_array.len()-1); check_val!($array,sum,calced_sum,success); if !success{ - println!("{:?} {:?} {:?}",sum,calced_sum,(sub_array.len()-1)); + eprintln!("{:?} {:?} {:?}",sum,calced_sum,(sub_array.len()-1)); } sub_array.wait_all(); sub_array.barrier(); @@ -191,7 +191,7 @@ macro_rules! sub_test{ let val = *elem; check_val!($array,val,zero,success); if !success{ - println!("{:?} {:?} {:?}",i,val,max_val); + eprintln!("{:?} {:?} {:?}",i,val,max_val); } } sub_array.barrier(); @@ -212,7 +212,7 @@ macro_rules! sub_test{ let calced_sum = tot_updates as usize * (sub_array.len()-1); check_val!($array,sum,calced_sum,success); if !success{ - println!("{:?} {:?} {:?}",sum,calced_sum,(sub_array.len()-1)); + eprintln!("{:?} {:?} {:?}",sum,calced_sum,(sub_array.len()-1)); } sub_array.wait_all(); sub_array.barrier(); diff --git a/tests/array/atomic_ops/compare_exchange_test.rs b/tests/array/atomic_ops/compare_exchange_test.rs index c991aa4c..8c5ad445 100644 --- a/tests/array/atomic_ops/compare_exchange_test.rs +++ b/tests/array/atomic_ops/compare_exchange_test.rs @@ -74,7 +74,7 @@ macro_rules! compare_exchange_test{ Ok(val) => { check_val!($array,val,init_val,success); if !success{ - println!("{:?} {:?} {:?}",idx,val,init_val); + eprintln!("{:?} {:?} {:?}",idx,val,init_val); } } Err(val) => { @@ -121,7 +121,7 @@ macro_rules! compare_exchange_test{ Ok(val) => { check_val!($array,val,init_val,success); if !success{ - println!("{:?} {:?} {:?}",idx,val,init_val); + eprintln!("{:?} {:?} {:?}",idx,val,init_val); } } Err(val) => { @@ -170,7 +170,7 @@ macro_rules! compare_exchange_test{ Ok(val) => { check_val!($array,val,init_val,success); if !success{ - println!("{:?} {:?} {:?}",idx,val,init_val); + eprintln!("{:?} {:?} {:?}",idx,val,init_val); } } Err(val) => { @@ -237,7 +237,7 @@ macro_rules! compare_exchange_epsilon_test{ Ok(val) => { check_val!($array,val,init_val,success); if !success{ - println!("{:?} {:?} {:?}",idx,val,init_val); + eprintln!("{:?} {:?} {:?}",idx,val,init_val); } } Err(val) => { @@ -284,7 +284,7 @@ macro_rules! compare_exchange_epsilon_test{ Ok(val) => { check_val!($array,val,init_val,success); if !success{ - println!("{:?} {:?} {:?}",idx,val,init_val); + eprintln!("{:?} {:?} {:?}",idx,val,init_val); } } Err(val) => { @@ -333,7 +333,7 @@ macro_rules! compare_exchange_epsilon_test{ Ok(val) => { check_val!($array,val,init_val,success); if !success{ - println!("{:?} {:?} {:?}",idx,val,init_val); + eprintln!("{:?} {:?} {:?}",idx,val,init_val); } } Err(val) => { @@ -386,11 +386,11 @@ macro_rules! check_input { for (i, r) in res.drain(..).enumerate() { if i % $num_pes == $my_pe { if let Err(val) = r { - println!("error i: {i} val: {val:?}"); + eprintln!("error i: {i} val: {val:?}"); } } else { match r { - Ok(val) => println!("error i: {i} val: {val:?}"), + Ok(val) => println!("ok i: {i} val: {val:?}"), Err(val) => { if val != i { println!("error i: {i} val: {val:?}"); diff --git a/tests/array/atomic_ops/load_store_test.rs b/tests/array/atomic_ops/load_store_test.rs index 0dd5150d..1bdbfd9c 100644 --- a/tests/array/atomic_ops/load_store_test.rs +++ b/tests/array/atomic_ops/load_store_test.rs @@ -79,7 +79,7 @@ macro_rules! load_store_test{ let val = val; check_val!($array,val,check_val,success); if !success{ - println!("{:?} {:?} {:?}",idx,val,check_val); + eprintln!("{:?} {:?} {:?}",idx,val,check_val); } } @@ -113,7 +113,7 @@ macro_rules! 
load_store_test{ let val = val; check_val!($array,val,check_val,success); if !success{ - println!("{:?} {:?} {:?}",idx,val,check_val); + eprintln!("{:?} {:?} {:?}",idx,val,check_val); } } @@ -149,7 +149,7 @@ macro_rules! load_store_test{ let val = val; check_val!($array,val,check_val,success); if !success{ - println!("{:?} {:?} {:?}",idx,val,check_val); + eprintln!("{:?} {:?} {:?}",idx,val,check_val); } } diff --git a/tests/array/atomic_ops/swap_test.rs b/tests/array/atomic_ops/swap_test.rs index daec218a..969f30a6 100644 --- a/tests/array/atomic_ops/swap_test.rs +++ b/tests/array/atomic_ops/swap_test.rs @@ -73,7 +73,7 @@ macro_rules! swap{ let val = world.block_on(req); check_val!($array,val,init_val,success); if !success{ - println!("{:?} {:?} {:?}",idx,val,init_val); + eprintln!("{:?} {:?} {:?}",idx,val,init_val); } } @@ -90,7 +90,7 @@ macro_rules! swap{ let val = val; check_val!($array,val,check_val,success); if !success{ - println!("{:?} {:?} {:?}",idx,val,check_val); + eprintln!("{:?} {:?} {:?}",idx,val,check_val); } } @@ -117,7 +117,7 @@ macro_rules! swap{ let val = world.block_on(req); check_val!($array,val,init_val,success); if !success{ - println!("{:?} {:?} {:?}",idx,val,init_val); + eprintln!("{:?} {:?} {:?}",idx,val,init_val); } } @@ -134,7 +134,7 @@ macro_rules! swap{ let val = val; check_val!($array,val,check_val,success); if !success{ - println!("{:?} {:?} {:?}",idx,val,check_val); + eprintln!("{:?} {:?} {:?}",idx,val,check_val); } } @@ -163,7 +163,7 @@ macro_rules! swap{ let val = world.block_on(req); check_val!($array,val,init_val,success); if !success{ - println!("{:?} {:?} {:?}",idx,val,init_val); + eprintln!("{:?} {:?} {:?}",idx,val,init_val); } } @@ -180,7 +180,7 @@ macro_rules! swap{ let val = val; check_val!($array,val,check_val,success); if !success{ - println!("{:?} {:?} {:?}",idx,val,check_val); + eprintln!("{:?} {:?} {:?}",idx,val,check_val); } } diff --git a/tests/array/bitwise_ops/and_test.rs b/tests/array/bitwise_ops/and_test.rs index 3de097ec..6b3bb5b8 100644 --- a/tests/array/bitwise_ops/and_test.rs +++ b/tests/array/bitwise_ops/and_test.rs @@ -78,7 +78,7 @@ macro_rules! and_test{ let val = *elem; check_val!($array,val,final_val,success); if !success{ - println!("{:?} {:x} {:x} {:x}",i,my_val,val,final_val); + eprintln!("{:?} {:x} {:x} {:x}",i,my_val,val,final_val); } } array.barrier(); @@ -105,7 +105,7 @@ macro_rules! and_test{ let val = *elem; check_val!($array,val,final_val,success); if !success{ - println!("{:?} {:x} {:x} {:x}",i,my_val,val,final_val); + eprintln!("{:?} {:x} {:x} {:x}",i,my_val,val,final_val); } } sub_array.barrier(); @@ -133,7 +133,7 @@ macro_rules! and_test{ let val = *elem; check_val!($array,val,final_val,success); if !success{ - println!("{:?} {:x} {:x} {:x}",i,my_val,val,final_val); + eprintln!("{:?} {:x} {:x} {:x}",i,my_val,val,final_val); } } sub_array.barrier(); diff --git a/tests/array/bitwise_ops/fetch_and_test.rs b/tests/array/bitwise_ops/fetch_and_test.rs index 253b495c..41b3faad 100644 --- a/tests/array/bitwise_ops/fetch_and_test.rs +++ b/tests/array/bitwise_ops/fetch_and_test.rs @@ -74,7 +74,7 @@ macro_rules! fetch_and_test{ for (req,idx) in reqs{ let val = world.block_on(req); if (val & !my_val) != !my_val{ - println!("1. {:?} {:x} {:x} {:x} {:x}",idx,my_val,!my_val,val,(val & !my_val)); + eprintln!("1. {:?} {:x} {:x} {:x} {:x}",idx,my_val,!my_val,val,(val & !my_val)); success = false; } } @@ -87,7 +87,7 @@ macro_rules! fetch_and_test{ let val = *elem; check_val!($array,val,final_val,success); if !success{ - println!("2. 
{:?} {:x} {:x} {:x} {:x}",i,my_val,!my_val,val,final_val); + eprintln!("2. {:?} {:x} {:x} {:x} {:x}",i,my_val,!my_val,val,final_val); } } array.barrier(); @@ -111,7 +111,7 @@ macro_rules! fetch_and_test{ for (req,idx) in reqs{ let val = world.block_on(req); if (val & !my_val) != !my_val{ - println!("{:?} {:x} {:x} {:x}",idx,my_val,val,(val & !my_val)); + eprintln!("{:?} {:x} {:x} {:x}",idx,my_val,val,(val & !my_val)); success = false; } } @@ -124,7 +124,7 @@ macro_rules! fetch_and_test{ let val = *elem; check_val!($array,val,final_val,success); if !success{ - println!("{:?} {:x} {:x} {:x}",i,my_val,val,final_val); + eprintln!("{:?} {:x} {:x} {:x}",i,my_val,val,final_val); } } sub_array.barrier(); @@ -149,7 +149,7 @@ macro_rules! fetch_and_test{ for (req,idx) in reqs{ let val = world.block_on(req); if (val & !my_val) != !my_val{ - println!("{:?} {:x} {:x} {:x}",idx,my_val,val,(val & !my_val)); + eprintln!("{:?} {:x} {:x} {:x}",idx,my_val,val,(val & !my_val)); success = false; } } @@ -162,7 +162,7 @@ macro_rules! fetch_and_test{ let val = *elem; check_val!($array,val,final_val,success); if !success{ - println!("{:?} {:x} {:x} {:x}",i,my_val,val,final_val); + eprintln!("{:?} {:x} {:x} {:x}",i,my_val,val,final_val); } } sub_array.barrier(); diff --git a/tests/array/bitwise_ops/fetch_or_test.rs b/tests/array/bitwise_ops/fetch_or_test.rs index 12673c6a..5b648f42 100644 --- a/tests/array/bitwise_ops/fetch_or_test.rs +++ b/tests/array/bitwise_ops/fetch_or_test.rs @@ -74,7 +74,7 @@ macro_rules! fetch_or_test{ for (req,idx) in reqs{ let val = world.block_on(req); if (val & my_val) != 0 { - println!("{:?} {:x} {:x} ",idx,my_val,val); + eprintln!("{:?} {:x} {:x} ",idx,my_val,val); success = false; } } @@ -87,7 +87,7 @@ macro_rules! fetch_or_test{ let val = *elem; check_val!($array,val,final_val,success); if !success{ - println!("{:?} {:x} {:x} {:x}",i,my_val,val,final_val); + eprintln!("{:?} {:x} {:x} {:x}",i,my_val,val,final_val); } } array.barrier(); @@ -111,7 +111,7 @@ macro_rules! fetch_or_test{ for (req,idx) in reqs{ let val = world.block_on(req); if (val & my_val) != 0 { - println!("{:?} {:x} {:x} ",idx,my_val,val); + eprintln!("{:?} {:x} {:x} ",idx,my_val,val); success = false; } } @@ -124,7 +124,7 @@ macro_rules! fetch_or_test{ let val = *elem; check_val!($array,val,final_val,success); if !success{ - println!("{:?} {:x} {:x} {:x}",i,my_val,val,final_val); + eprintln!("{:?} {:x} {:x} {:x}",i,my_val,val,final_val); } } sub_array.barrier(); @@ -149,7 +149,7 @@ macro_rules! fetch_or_test{ for (req,idx) in reqs{ let val = world.block_on(req); if (val & my_val) != 0 { - println!("{:?} {:x} {:x} ",idx,my_val,val); + eprintln!("{:?} {:x} {:x} ",idx,my_val,val); success = false; } } @@ -162,7 +162,7 @@ macro_rules! fetch_or_test{ let val = *elem; check_val!($array,val,final_val,success); if !success{ - println!("{:?} {:x} {:x} {:x}",i,my_val,val,final_val); + eprintln!("{:?} {:x} {:x} {:x}",i,my_val,val,final_val); } } sub_array.barrier(); diff --git a/tests/array/bitwise_ops/fetch_xor_test.rs b/tests/array/bitwise_ops/fetch_xor_test.rs index a8824169..edcfe642 100644 --- a/tests/array/bitwise_ops/fetch_xor_test.rs +++ b/tests/array/bitwise_ops/fetch_xor_test.rs @@ -74,7 +74,7 @@ macro_rules! fetch_xor_test{ for (req,idx) in reqs{ let val = world.block_on(req); if (val & my_val) != 0 { - println!("{:?} {:x} {:x} ",idx,my_val,val); + eprintln!("{:?} {:x} {:x} ",idx,my_val,val); success = false; } } @@ -87,7 +87,7 @@ macro_rules! 
fetch_xor_test{ let val = *elem; check_val!($array,val,final_val,success); if !success{ - println!("{:?} {:x} {:x} {:x}",i,my_val,val,final_val); + eprintln!("{:?} {:x} {:x} {:x}",i,my_val,val,final_val); } } array.barrier(); @@ -111,7 +111,7 @@ macro_rules! fetch_xor_test{ for (req,idx) in reqs{ let val = world.block_on(req); if (val & my_val) != 0 { - println!("{:?} {:x} {:x} ",idx,my_val,val); + eprintln!("{:?} {:x} {:x} ",idx,my_val,val); success = false; } } @@ -124,7 +124,7 @@ macro_rules! fetch_xor_test{ let val = *elem; check_val!($array,val,final_val,success); if !success{ - println!("{:?} {:x} {:x} {:x}",i,my_val,val,final_val); + eprintln!("{:?} {:x} {:x} {:x}",i,my_val,val,final_val); } } sub_array.barrier(); @@ -149,7 +149,7 @@ macro_rules! fetch_xor_test{ for (req,idx) in reqs{ let val = world.block_on(req); if (val & my_val) != 0 { - println!("{:?} {:x} {:x} ",idx,my_val,val); + eprintln!("{:?} {:x} {:x} ",idx,my_val,val); success = false; } } @@ -162,7 +162,7 @@ macro_rules! fetch_xor_test{ let val = *elem; check_val!($array,val,final_val,success); if !success{ - println!("{:?} {:x} {:x} {:x}",i,my_val,val,final_val); + eprintln!("{:?} {:x} {:x} {:x}",i,my_val,val,final_val); } } sub_array.barrier(); diff --git a/tests/array/bitwise_ops/or_test.rs b/tests/array/bitwise_ops/or_test.rs index 3c3b37fb..5d8a8f25 100644 --- a/tests/array/bitwise_ops/or_test.rs +++ b/tests/array/bitwise_ops/or_test.rs @@ -78,7 +78,7 @@ macro_rules! or_test{ let val = *elem; check_val!($array,val,final_val,success); if !success{ - println!("{:?} {:x} {:x} {:x}",i,my_val,val,final_val); + eprintln!("{:?} {:x} {:x} {:x}",i,my_val,val,final_val); } } array.barrier(); @@ -105,7 +105,7 @@ macro_rules! or_test{ let val = *elem; check_val!($array,val,final_val,success); if !success{ - println!("{:?} {:x} {:x} {:x}",i,my_val,val,final_val); + eprintln!("{:?} {:x} {:x} {:x}",i,my_val,val,final_val); } } sub_array.barrier(); @@ -133,7 +133,7 @@ macro_rules! or_test{ let val = *elem; check_val!($array,val,final_val,success); if !success{ - println!("{:?} {:x} {:x} {:x}",i,my_val,val,final_val); + eprintln!("{:?} {:x} {:x} {:x}",i,my_val,val,final_val); } } sub_array.barrier(); diff --git a/tests/array/bitwise_ops/xor_test.rs b/tests/array/bitwise_ops/xor_test.rs index cb4820a1..6d93284a 100644 --- a/tests/array/bitwise_ops/xor_test.rs +++ b/tests/array/bitwise_ops/xor_test.rs @@ -78,7 +78,7 @@ macro_rules! xor_test{ let val = *elem; check_val!($array,val,final_val,success); if !success{ - println!("{:?} {:x} {:x} {:x}",i,my_val,val,final_val); + eprintln!("{:?} {:x} {:x} {:x}",i,my_val,val,final_val); } } array.barrier(); @@ -105,7 +105,7 @@ macro_rules! xor_test{ let val = *elem; check_val!($array,val,final_val,success); if !success{ - println!("{:?} {:x} {:x} {:x}",i,my_val,val,final_val); + eprintln!("{:?} {:x} {:x} {:x}",i,my_val,val,final_val); } } sub_array.barrier(); @@ -133,7 +133,7 @@ macro_rules! xor_test{ let val = *elem; check_val!($array,val,final_val,success); if !success{ - println!("{:?} {:x} {:x} {:x}",i,my_val,val,final_val); + eprintln!("{:?} {:x} {:x} {:x}",i,my_val,val,final_val); } } sub_array.barrier(); diff --git a/tests/array/rdma/blocking_get_test.rs b/tests/array/rdma/blocking_get_test.rs index ea32b569..74217ca8 100644 --- a/tests/array/rdma/blocking_get_test.rs +++ b/tests/array/rdma/blocking_get_test.rs @@ -146,7 +146,7 @@ macro_rules! 
blocking_get_test{ unsafe{ for (i,elem) in shared_mem_region.as_slice().unwrap().iter().enumerate().take( num_txs * tx_size){ if ((i as $t - *elem) as f32).abs() > 0.0001 { - println!("{:?} {:?} {:?}",i as $t,*elem,((i as $t - *elem) as f32).abs()); + eprintln!("{:?} {:?} {:?}",i as $t,*elem,((i as $t - *elem) as f32).abs()); success = false; } } @@ -182,7 +182,7 @@ macro_rules! blocking_get_test{ unsafe{ for (i,elem) in shared_mem_region.as_slice().unwrap().iter().enumerate().take( num_txs * tx_size){ if ((i as $t - *elem) as f32).abs() > 0.0001 { - println!("{:?} {:?} {:?}",i as $t,*elem,((i as $t - *elem) as f32).abs()); + eprintln!("{:?} {:?} {:?}",i as $t,*elem,((i as $t - *elem) as f32).abs()); success = false; } } @@ -221,7 +221,7 @@ macro_rules! blocking_get_test{ unsafe{ for (i,elem) in shared_mem_region.as_slice().unwrap().iter().enumerate().take( num_txs * tx_size){ if ((i as $t - *elem) as f32).abs() > 0.0001 { - println!("{:?} {:?} {:?}",i as $t,*elem,((i as $t - *elem) as f32).abs()); + eprintln!("{:?} {:?} {:?}",i as $t,*elem,((i as $t - *elem) as f32).abs()); success = false; } } diff --git a/tests/array/rdma/get_test.rs b/tests/array/rdma/get_test.rs index 18641567..886ecd6a 100644 --- a/tests/array/rdma/get_test.rs +++ b/tests/array/rdma/get_test.rs @@ -150,7 +150,7 @@ macro_rules! get_test{ unsafe{ for (i,elem) in shared_mem_region.as_slice().unwrap().iter().enumerate().take( num_txs * tx_size){ if ((i as $t - *elem) as f32).abs() > 0.0001 { - println!("{:?} {:?} {:?}",i as $t,*elem,((i as $t - *elem) as f32).abs()); + eprintln!("{:?} {:?} {:?}",i as $t,*elem,((i as $t - *elem) as f32).abs()); success = false; } } @@ -190,7 +190,7 @@ macro_rules! get_test{ unsafe{ for (i,elem) in shared_mem_region.as_slice().unwrap().iter().enumerate().take( num_txs * tx_size){ if ((i as $t - *elem) as f32).abs() > 0.0001 { - println!("{:?} {:?} {:?}",i as $t,*elem,((i as $t - *elem) as f32).abs()); + eprintln!("{:?} {:?} {:?}",i as $t,*elem,((i as $t - *elem) as f32).abs()); success = false; } } @@ -234,7 +234,7 @@ macro_rules! get_test{ unsafe{ for (i,elem) in shared_mem_region.as_slice().unwrap().iter().enumerate().take( num_txs * tx_size){ if ((i as $t - *elem) as f32).abs() > 0.0001 { - println!("{:?} {:?} {:?}",i as $t,*elem,((i as $t - *elem) as f32).abs()); + eprintln!("{:?} {:?} {:?}",i as $t,*elem,((i as $t - *elem) as f32).abs()); success = false; } } diff --git a/tests/array/rdma/put_test.rs b/tests/array/rdma/put_test.rs index ff66fd62..22aa8185 100644 --- a/tests/array/rdma/put_test.rs +++ b/tests/array/rdma/put_test.rs @@ -64,7 +64,7 @@ macro_rules! put_test{ #[allow(unused_unsafe)] for (i,elem) in unsafe { array.onesided_iter().into_iter().enumerate().take( num_txs * tx_size) }{ if ((i as $t - *elem) as f32).abs() > 0.0001 { - println!("{:?} {:?} {:?}",i as $t,*elem,((i as $t - *elem) as f32).abs()); + eprintln!("{:?} {:?} {:?}",i as $t,*elem,((i as $t - *elem) as f32).abs()); success = false; } } @@ -98,7 +98,7 @@ macro_rules! put_test{ #[allow(unused_unsafe)] for (i,elem) in unsafe {sub_array.onesided_iter().into_iter().enumerate().take( num_txs * tx_size)}{ if ((i as $t - *elem) as f32).abs() > 0.0001 { - println!("{:?} {:?} {:?}",i as $t,*elem,((i as $t - *elem) as f32).abs()); + eprintln!("{:?} {:?} {:?}",i as $t,*elem,((i as $t - *elem) as f32).abs()); success = false; } } @@ -135,7 +135,7 @@ macro_rules! 
put_test{ #[allow(unused_unsafe)] for (i,elem) in unsafe {sub_array.onesided_iter().into_iter().enumerate().take( num_txs * tx_size)}{ if ((i as $t - *elem) as f32).abs() > 0.0001 { - println!("{:?} {:?} {:?}",i as $t,*elem,((i as $t - *elem) as f32).abs()); + eprintln!("{:?} {:?} {:?}",i as $t,*elem,((i as $t - *elem) as f32).abs()); success = false; } } From 3c64aeef5d1683e8f85b8b436c619cdcc91b5cd0 Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Fri, 29 Mar 2024 15:00:33 -0700 Subject: [PATCH 017/116] adding async_std executor, experimenting with alternative work stealing --- Cargo.toml | 1 + examples/hello_world/hello_world_array.rs | 36 +- .../batching/simple_batcher.rs | 8 +- .../batching/team_am_batcher.rs | 8 +- .../registered_active_message.rs | 2 +- src/darc.rs | 68 ++- src/lamellae/command_queues.rs | 28 +- src/lamellae/rofi_lamellae.rs | 6 +- src/lamellar_world.rs | 6 + src/scheduler.rs | 123 ++++ src/scheduler/async_std_executor.rs | 85 +++ src/scheduler/tokio_executor.rs | 9 + src/scheduler/work_stealing.rs | 17 + src/scheduler/work_stealing2.rs | 524 ++++++++++++++++++ src/scheduler/work_stealing3.rs | 383 +++++++++++++ 15 files changed, 1244 insertions(+), 60 deletions(-) create mode 100644 src/scheduler/async_std_executor.rs create mode 100644 src/scheduler/work_stealing2.rs create mode 100644 src/scheduler/work_stealing3.rs diff --git a/Cargo.toml b/Cargo.toml index f521bbdc..6af7798b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -45,6 +45,7 @@ glob = "0.3.0" thread_local = "1.1.4" tokio = { version = "1.35.1", features = ["full"] , optional = true} libc = { version = "0.2.137", optional = true } +async-global-executor = "2.4.1" diff --git a/examples/hello_world/hello_world_array.rs b/examples/hello_world/hello_world_array.rs index ff81ea29..0241029b 100644 --- a/examples/hello_world/hello_world_array.rs +++ b/examples/hello_world/hello_world_array.rs @@ -4,28 +4,50 @@ /// -------------------------------------------------------------------- use lamellar::array::prelude::*; +use lamellar::RemoteMemoryRegion; + fn main() { + let timer = std::time::Instant::now(); let world = lamellar::LamellarWorldBuilder::new().build(); let num_pes = world.num_pes(); let _my_pe = world.my_pe(); - let local_length = 10; //if you want to ensure each thread processes data make this >= LAMELLAR_THREADS environment variable + let local_length = 1_000_000_000; //if you want to ensure each thread processes data make this >= LAMELLAR_THREADS environment variable let global_length = num_pes * local_length; + let init_time = timer.elapsed(); + println!("init_time: {:?}", init_time); + + let timer = std::time::Instant::now(); + let local_vec = vec![0usize; local_length]; + let local_vec_time = timer.elapsed(); + println!("local_vec_time: {:?}", local_vec_time); + let timer = std::time::Instant::now(); let array = AtomicArray::::new(world.team(), global_length, Distribution::Block); + let array_time = timer.elapsed(); + println!("array_time: {:?}", array_time); + + let timer = std::time::Instant::now(); + let one_sided = world.alloc_one_sided_mem_region::(local_length); + let one_sided_time = timer.elapsed(); + println!("one_sided_time: {:?}", one_sided_time); //print local data on each pe - array.print(); - println!(""); + // array.print(); + // println!(""); + let timer = std::time::Instant::now(); //add 1 to each element of array - for i in 0..global_length { - let _ = array.add(i, 1); - } + // for i in 0..global_length { + let _ = array.batch_add(0, &local_vec[0..100]); + // } //wait for all 
the local add operations to finish array.wait_all(); //wait for all the PEs to finish array.barrier(); + let add_time = timer.elapsed(); + println!("add_time: {:?}", add_time); + //print local data on each PE (should now be equal to num_pes) - array.print(); + // array.print(); } diff --git a/src/active_messaging/batching/simple_batcher.rs b/src/active_messaging/batching/simple_batcher.rs index 7f7a709b..2be42094 100644 --- a/src/active_messaging/batching/simple_batcher.rs +++ b/src/active_messaging/batching/simple_batcher.rs @@ -93,7 +93,7 @@ impl Batcher for SimpleBatcher { // "[{:?}] add_remote_am_to_batch submit task", // std::thread::current().id() // ); - self.executor.submit_task(async move { + self.executor.submit_io_task(async move { while stall_mark != cur_stall_mark.load(Ordering::SeqCst) && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE && batch_id == batch.batch_id.load(Ordering::SeqCst) @@ -148,7 +148,7 @@ impl Batcher for SimpleBatcher { // "[{:?}] add_rerturn_am_to_batch submit task", // std::thread::current().id() // ); - self.executor.submit_task(async move { + self.executor.submit_io_task(async move { while stall_mark != cur_stall_mark.load(Ordering::SeqCst) && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE && batch_id == batch.batch_id.load(Ordering::SeqCst) @@ -205,7 +205,7 @@ impl Batcher for SimpleBatcher { // "[{:?}] add_data_am_to_batch submit task", // std::thread::current().id() // ); - self.executor.submit_task(async move { + self.executor.submit_io_task(async move { while stall_mark != cur_stall_mark.load(Ordering::SeqCst) && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE && batch_id == batch.batch_id.load(Ordering::SeqCst) @@ -249,7 +249,7 @@ impl Batcher for SimpleBatcher { // "[{:?}] add_unit_am_to_batch submit task", // std::thread::current().id() // ); - self.executor.submit_task(async move { + self.executor.submit_io_task(async move { while stall_mark != cur_stall_mark.load(Ordering::SeqCst) && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE && batch_id == batch.batch_id.load(Ordering::SeqCst) diff --git a/src/active_messaging/batching/team_am_batcher.rs b/src/active_messaging/batching/team_am_batcher.rs index 356ac6bb..c6b25481 100644 --- a/src/active_messaging/batching/team_am_batcher.rs +++ b/src/active_messaging/batching/team_am_batcher.rs @@ -205,7 +205,7 @@ impl Batcher for TeamAmBatcher { // std::thread::current().id() // ); let cur_stall_mark = self.stall_mark.clone(); - self.executor.submit_task(async move { + self.executor.submit_io_task(async move { while stall_mark != cur_stall_mark.load(Ordering::SeqCst) && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE && batch_id == batch.batch_id.load(Ordering::SeqCst) @@ -267,7 +267,7 @@ impl Batcher for TeamAmBatcher { // std::thread::current().id() // ); let cur_stall_mark = self.stall_mark.clone(); - self.executor.submit_task(async move { + self.executor.submit_io_task(async move { while stall_mark != cur_stall_mark.load(Ordering::SeqCst) && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE && batch_id == batch.batch_id.load(Ordering::SeqCst) @@ -336,7 +336,7 @@ impl Batcher for TeamAmBatcher { // std::thread::current().id() // ); let cur_stall_mark = self.stall_mark.clone(); - self.executor.submit_task(async move { + self.executor.submit_io_task(async move { while stall_mark != cur_stall_mark.load(Ordering::SeqCst) && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE && batch_id == batch.batch_id.load(Ordering::SeqCst) @@ -387,7 +387,7 @@ impl Batcher for TeamAmBatcher { // 
std::thread::current().id() // ); let cur_stall_mark = self.stall_mark.clone(); - self.executor.submit_task(async move { + self.executor.submit_io_task(async move { while stall_mark != cur_stall_mark.load(Ordering::SeqCst) && batch.size.load(Ordering::SeqCst) < MAX_BATCH_SIZE && batch_id == batch.batch_id.load(Ordering::SeqCst) diff --git a/src/active_messaging/registered_active_message.rs b/src/active_messaging/registered_active_message.rs index e16755d7..15a23040 100644 --- a/src/active_messaging/registered_active_message.rs +++ b/src/active_messaging/registered_active_message.rs @@ -114,7 +114,7 @@ impl ActiveMessageEngine for RegisteredActiveMessages { let ame = self.clone(); let req_data_clone = req_data.clone(); let am_clone = am.clone(); - self.executor.submit_task(async move { + self.executor.submit_io_task(async move { //spawn a task so that we can the execute the local am immediately // println!(" {} {} {}, {}, {}",req_data.team.lamellae.backend() != Backend::Local,req_data.team.num_pes() > 1, req_data.team.team_pe_id().is_err(),(req_data.team.num_pes() > 1 || req_data.team.team_pe_id().is_err()),req_data.team.lamellae.backend() != Backend::Local && (req_data.team.num_pes() > 1 || req_data.team.team_pe_id().is_err()) ); if am_size < crate::active_messaging::BATCH_AM_SIZE && !immediate { diff --git a/src/darc.rs b/src/darc.rs index f476cdf2..d5ad86c4 100644 --- a/src/darc.rs +++ b/src/darc.rs @@ -46,6 +46,7 @@ ///``` use async_lock::RwLock; use core::marker::PhantomData; +use futures_util::future::join_all; use serde::{Deserialize, Deserializer}; use std::cmp::PartialEq; use std::fmt; @@ -458,11 +459,17 @@ impl DarcInner { while outstanding_refs { outstanding_refs = false; + // these hopefully all get set to non zero later otherwise we still need to wait + for id in &mut *barrier_slice { + *id = 0; + } let old_barrier_id = barrier_id; //we potentially will set barrier_id to 0 but want to maintiain the previously highest value while inner.local_cnt.load(Ordering::SeqCst) > 1 + extra_cnt { async_std::task::yield_now().await; } - inner.send_finished(); + join_all(inner.send_finished()).await; + let barrier_fut = unsafe { inner.barrier.as_ref().unwrap().async_barrier() }; + barrier_fut.await; let mut old_ref_cnts = ref_cnts_slice.to_vec(); let old_local_cnt = inner.total_local_cnt.load(Ordering::SeqCst); @@ -563,6 +570,7 @@ impl DarcInner { inner.mode_barrier_addr + inner.my_pe * std::mem::size_of::(), ); } + //maybe we need to change the above to a get? 
rdma.flush(); let barrier_fut = unsafe { inner.barrier.as_ref().unwrap().async_barrier() }; barrier_fut.await; @@ -580,15 +588,17 @@ impl DarcInner { // dist_cnts_changed: {dist_cnts_changed:?} barrier_sum: {barrier_sum:?} old_barrier_id: {old_barrier_id:?} ", std::thread::current().id(), inner.total_local_cnt.load(Ordering::SeqCst), inner.total_dist_cnt.load(Ordering::SeqCst)); // } barrier_id = old_barrier_id + 1; - if outstanding_refs { - // println!( - // "[{:?}] still outstanding, exec a task!", - // std::thread::current().id() - // ); - // team.scheduler.exec_task(); - async_std::task::yield_now().await; - } + // if outstanding_refs { + // // println!( + // // "[{:?}] still outstanding, exec a task!", + // // std::thread::current().id() + // // ); + // // team.scheduler.exec_task(); + // async_std::task::yield_now().await; + // } prev_ref_cnts = old_ref_cnts; + let barrier_fut = unsafe { inner.barrier.as_ref().unwrap().async_barrier() }; + barrier_fut.await; } // println!( // "[{:?}] {rel_addr:x} all outstanding refs are resolved", @@ -613,7 +623,7 @@ impl DarcInner { let mut timer = std::time::Instant::now(); while *pe != state as u8 { if inner.local_cnt.load(Ordering::SeqCst) == 1 + extra_cnt { - inner.send_finished(); + join_all(inner.send_finished()).await; } if timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { let ref_cnts_slice = unsafe { @@ -622,22 +632,23 @@ impl DarcInner { inner.num_pes, ) }; - println!("[{:?}][WARNING] -- Potential deadlock detected.\n\ - The runtime is currently waiting for all remaining references to this distributed object to be dropped.\n\ - The object is likely a {:?} with {:?} remaining local references and {:?} remaining remote references, ref cnts by pe {ref_cnts_slice:?}\n\ - An example where this can occur can be found at https://docs.rs/lamellar/latest/lamellar/array/struct.ReadOnlyArray.html#method.into_local_lock\n\ - The deadlock timeout can be set via the LAMELLAR_DEADLOCK_TIMEOUT environment variable, the current timeout is {} seconds\n\ - To view backtrace set RUST_LIB_BACKTRACE=1\n\ - {}", - std::thread::current().id(), - unsafe { - &std::slice::from_raw_parts_mut(inner.mode_addr as *mut DarcMode, inner.num_pes) - }, - inner.local_cnt.load(Ordering::SeqCst), - inner.dist_cnt.load(Ordering::SeqCst), - *crate::DEADLOCK_TIMEOUT, - std::backtrace::Backtrace::capture() - ); + println!("[{:?}][{:?}][WARNING] -- Potential deadlock detected.\n\ + The runtime is currently waiting for all remaining references to this distributed object to be dropped.\n\ + The object is likely a {:?} with {:?} remaining local references and {:?} remaining remote references, ref cnts by pe {ref_cnts_slice:?}\n\ + An example where this can occur can be found at https://docs.rs/lamellar/latest/lamellar/array/struct.ReadOnlyArray.html#method.into_local_lock\n\ + The deadlock timeout can be set via the LAMELLAR_DEADLOCK_TIMEOUT environment variable, the current timeout is {} seconds\n\ + To view backtrace set RUST_LIB_BACKTRACE=1\n\ + {}", + inner.my_pe, + std::thread::current().id(), + unsafe { + &std::slice::from_raw_parts_mut(inner.mode_addr as *mut DarcMode, inner.num_pes) + }, + inner.local_cnt.load(Ordering::SeqCst), + inner.dist_cnt.load(Ordering::SeqCst), + *crate::DEADLOCK_TIMEOUT, + std::backtrace::Backtrace::capture() + ); timer = std::time::Instant::now(); } async_std::task::yield_now().await; @@ -1393,7 +1404,7 @@ impl LamellarAM for DroppedWaitAM { async_std::task::yield_now().await; if wrapped.local_cnt.load(Ordering::SeqCst) == 0 { - 
wrapped.send_finished(); + join_all(wrapped.send_finished()).await; } if timeout.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { @@ -1434,7 +1445,8 @@ impl LamellarAM for DroppedWaitAM { || wrapped.local_cnt.load(Ordering::SeqCst) != 0 { if wrapped.local_cnt.load(Ordering::SeqCst) == 0 { - wrapped.send_finished(); + // wrapped.send_finished() + join_all(wrapped.send_finished()).await; } if timeout.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { let ref_cnts_slice = std::slice::from_raw_parts_mut( diff --git a/src/lamellae/command_queues.rs b/src/lamellae/command_queues.rs index 56b2897f..acd1f3c6 100644 --- a/src/lamellae/command_queues.rs +++ b/src/lamellae/command_queues.rs @@ -5,6 +5,7 @@ use crate::lamellae::{ use crate::scheduler::Scheduler; use parking_lot::Mutex; +use thread_local::ThreadLocal; use std::collections::HashMap; use std::num::Wrapping; @@ -17,6 +18,10 @@ const CMD_BUF_LEN: usize = 50000; // this is the number of slots for each PE // const NUM_REQ_SLOTS: usize = CMD_Q_LEN; // max requests at any given time -- probably have this be a multiple of num PES const CMD_BUFS_PER_PE: usize = 2; +// lazy_static! { +// static ref CNTS: ThreadLocal = ThreadLocal::new(); +// } + #[repr(C)] #[derive(Clone, Copy)] struct CmdMsg { @@ -701,14 +706,7 @@ impl InnerCQ { //#[tracing::instrument(skip_all)] async fn send(&self, addr: usize, len: usize, dst: usize, hash: usize) { - // if len > 1000000000{ - // println!("wayyyyyy toooooo big!!!"); - // } let mut timer = std::time::Instant::now(); - // while self.active_cnt.load(Ordering::SeqCst) > 4 { - // async_std::task::yield_now().await; - // } - // self.active_cnt.fetch_add(1, Ordering::SeqCst); self.pending_cmds.fetch_add(1, Ordering::SeqCst); while self.active.load(Ordering::SeqCst) != CmdQStatus::Panic as u8 { { @@ -719,12 +717,9 @@ impl InnerCQ { // let mut cmd_buffer = trace_span!("lock").in_scope(|| self.cmd_buffers[dst].lock()); let mut cmd_buffer = self.cmd_buffers[dst].lock(); if cmd_buffer.try_push(addr, len, hash) { - // let data_slice = unsafe{ std::slice::from_raw_parts((addr + self.comm.base_addr()) as *const u8, len) }; self.sent_cnt.fetch_add(1, Ordering::SeqCst); self.put_amt.fetch_add(len, Ordering::Relaxed); let _cnt = self.pending_cmds.fetch_sub(1, Ordering::SeqCst); - // println!("pushed {:?} {:?} {:?} {:?}", addr, len, hash, _cnt); //, data_slice); - // println!("cmd_buffer {:?}", cmd_buffer); break; } // let span1 = trace_span!("send loop 1.1"); @@ -1365,7 +1360,7 @@ impl CommandQueue { // scheduler.force_shutdown(); panic!("received panic from other PE"); } - // println!("leaving alloc_task task {:?}", scheduler.active()); + // println!("leaving panic_task task {:?}", scheduler.active()); } //#[tracing::instrument(skip_all)] @@ -1377,6 +1372,8 @@ impl CommandQueue { || !self.cq.empty() || scheduler.active() { + // CNTS.get_or(|| AtomicUsize::new(0)) + // .fetch_add(1, Ordering::Relaxed); for src in 0..num_pes { if src != my_pe { if let Some(cmd_buf_cmd) = self.cq.ready(src) { @@ -1450,7 +1447,7 @@ impl CommandQueue { // "[{:?}] recv_data submitting get command task", // std::thread::current().id(), // ); - scheduler1.submit_task(task); + scheduler1.submit_io_task(task); i += 1; } else { panic!( @@ -1466,7 +1463,7 @@ impl CommandQueue { // "[{:?}] recv_data submitting tx task", // std::thread::current().id() // ); - scheduler.submit_task(task); + scheduler.submit_io_task(task); } } } @@ -1497,6 +1494,11 @@ impl CommandQueue { // ); self.active .store(CmdQStatus::Finished as u8, Ordering::SeqCst); + // 
println!("recv_data thread shutting down"); + // for cnt in CNTS.iter() { + // print!("{:?} ", cnt.load(Ordering::Relaxed)); + // } + // println!(""); } //#[tracing::instrument(skip_all)] diff --git a/src/lamellae/rofi_lamellae.rs b/src/lamellae/rofi_lamellae.rs index de09a278..fdd33ec5 100644 --- a/src/lamellae/rofi_lamellae.rs +++ b/src/lamellae/rofi_lamellae.rs @@ -47,19 +47,19 @@ impl LamellaeInit for RofiBuilder { let rofi = Arc::new(Lamellae::Rofi(rofi)); let rofi_clone = rofi.clone(); // println!("Submitting Rofi Tasks"); - scheduler.submit_task(async move { + scheduler.submit_io_task(async move { // println!("ROFI RECV DATA TASK"); cq_clone .recv_data(scheduler_clone.clone(), rofi_clone.clone()) .await; // println!("ROFI RECV DATA DONE"); }); - scheduler.submit_task(async move { + scheduler.submit_io_task(async move { // println!("ROFI ALLOC TASK"); cq_clone2.alloc_task(scheduler_clone2.clone()).await; // println!("ROFI ALLOC DONE"); }); - scheduler.submit_task(async move { + scheduler.submit_io_task(async move { // println!("ROFI PANIC TASK"); cq_clone3.panic_task(scheduler_clone3.clone()).await; // println!("ROFI PANIC DONE"); diff --git a/src/lamellar_world.rs b/src/lamellar_world.rs index 2ac4f6ad..e552578c 100644 --- a/src/lamellar_world.rs +++ b/src/lamellar_world.rs @@ -396,6 +396,12 @@ impl LamellarWorldBuilder { println!("[LAMELLAR WARNING]: tokio-executor selected but it is not enabled, defaulting to lamellar work stealing executor"); ExecutorType::LamellarWorkStealing } + } else if executor == 2 { + ExecutorType::LamellarWorkStealing2 + } else if executor == 3 { + ExecutorType::LamellarWorkStealing3 + } else if executor == 4 { + ExecutorType::AsyncStd } else { println!("[LAMELLAR WARNING]: invalid executor selected defaulting to lamellar work stealing executor"); ExecutorType::LamellarWorkStealing diff --git a/src/scheduler.rs b/src/scheduler.rs index 82f44cf6..d74c2bdb 100755 --- a/src/scheduler.rs +++ b/src/scheduler.rs @@ -9,10 +9,20 @@ use enum_dispatch::enum_dispatch; use futures_util::Future; use std::sync::atomic::{AtomicU8, AtomicUsize, Ordering}; use std::sync::Arc; +use std::thread; pub(crate) mod work_stealing; use work_stealing::WorkStealing; +pub(crate) mod work_stealing2; +use work_stealing2::WorkStealing2; + +pub(crate) mod work_stealing3; +use work_stealing3::WorkStealing3; + +pub(crate) mod async_std_executor; +use async_std_executor::AsyncStdRt; + #[cfg(feature = "tokio-executor")] pub(crate) mod tokio_executor; #[cfg(feature = "tokio-executor")] @@ -35,6 +45,15 @@ pub(crate) enum SchedulerStatus { // pub(crate) mod numa_work_stealing2; // use numa_work_stealing2::{NumaWorkStealing2, NumaWorkStealing2Inner}; +// static AM_SAME_THREAD: AtomicUsize = AtomicUsize::new(0); +// static AM_DIFF_THREAD: AtomicUsize = AtomicUsize::new(0); + +// static TASK_SAME_THREAD: AtomicUsize = AtomicUsize::new(0); +// static TASK_DIFF_THREAD: AtomicUsize = AtomicUsize::new(0); + +// static IO_SAME_THREAD: AtomicUsize = AtomicUsize::new(0); +// static IO_DIFF_THREAD: AtomicUsize = AtomicUsize::new(0); + #[derive( Copy, Clone, @@ -59,6 +78,9 @@ pub(crate) struct ReqId { pub enum ExecutorType { /// The default work stealing executor LamellarWorkStealing, + LamellarWorkStealing2, + LamellarWorkStealing3, + AsyncStd, #[cfg(feature = "tokio-executor")] /// The tokio executor Tokio, @@ -72,6 +94,11 @@ pub(crate) trait LamellarExecutor { F: Future + Send + 'static, F::Output: Send; + fn submit_io_task(&self, future: F) + where + F: Future + Send + 'static, + F::Output: Send; + fn 
submit_immediate_task(&self, future: F) where F: Future + Send + 'static, @@ -96,6 +123,9 @@ pub(crate) trait LamellarExecutor { #[derive(Debug)] pub(crate) enum Executor { WorkStealing(WorkStealing), + WorkStealing2(WorkStealing2), + WorkStealing3(WorkStealing3), + AsyncStd(AsyncStdRt), #[cfg(feature = "tokio-executor")] Tokio(TokioRt), } @@ -139,12 +169,18 @@ impl Scheduler { let am_stall_mark = self.am_stall_mark.fetch_add(1, Ordering::Relaxed); let ame = self.active_message_engine.clone(); let am_future = async move { + // let start_tid = thread::current().id(); num_ams.fetch_add(1, Ordering::Relaxed); let _am_id = max_ams.fetch_add(1, Ordering::Relaxed); // println!("[{:?}] submit work exec req {:?} {:?} TaskId: {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task); // println!("[{:?}] submit_am {:?}", std::thread::current().id(), am_id); ame.process_msg(am, am_stall_mark, false).await; num_ams.fetch_sub(1, Ordering::Relaxed); + // if thread::current().id() != start_tid { + // AM_DIFF_THREAD.fetch_add(1, Ordering::Relaxed); + // } else { + // AM_SAME_THREAD.fetch_add(1, Ordering::Relaxed); + // } // println!( // "[{:?}] submit_am_done {:?}", // std::thread::current().id(), @@ -162,6 +198,7 @@ impl Scheduler { let am_stall_mark = self.am_stall_mark.fetch_add(1, Ordering::Relaxed); let ame = self.active_message_engine.clone(); let am_future = async move { + // let start_tid = thread::current().id(); num_ams.fetch_add(1, Ordering::Relaxed); let _am_id = max_ams.fetch_add(1, Ordering::Relaxed); // println!("[{:?}] submit work exec req {:?} {:?} TaskId: {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task); @@ -172,6 +209,11 @@ impl Scheduler { // ); ame.process_msg(am, am_stall_mark, false).await; num_ams.fetch_sub(1, Ordering::Relaxed); + // if thread::current().id() != start_tid { + // AM_DIFF_THREAD.fetch_add(1, Ordering::Relaxed); + // } else { + // AM_SAME_THREAD.fetch_add(1, Ordering::Relaxed); + // } // println!( // "[{:?}] submit_am_immediate done {:?}", // std::thread::current().id(), @@ -187,6 +229,7 @@ impl Scheduler { let max_ams = self.max_ams.clone(); let ame = self.active_message_engine.clone(); let am_future = async move { + // let start_tid = std::thread::current().id(); num_ams.fetch_add(1, Ordering::Relaxed); let _am_id = max_ams.fetch_add(1, Ordering::Relaxed); // println!("[{:?}] submit work exec req {:?} {:?} TaskId: {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task); @@ -203,6 +246,11 @@ impl Scheduler { panic!("should i be here?"); } num_ams.fetch_sub(1, Ordering::Relaxed); + // if start_tid == std::thread::current().id() { + // AM_SAME_THREAD.fetch_add(1, Ordering::Relaxed); + // } else { + // AM_DIFF_THREAD.fetch_add(1, Ordering::Relaxed); + // } // println!( // "[{:?}] submit_remote_am done {:?}", // std::thread::current().id(), @@ -220,6 +268,7 @@ impl Scheduler { let num_tasks = self.num_tasks.clone(); let max_tasks = self.max_tasks.clone(); let future = async move { + // let start_tid = std::thread::current().id(); num_tasks.fetch_add(1, Ordering::Relaxed); let _task_id = max_tasks.fetch_add(1, Ordering::Relaxed); // println!( @@ -234,6 +283,11 @@ impl Scheduler { // std::thread::current().id(), // task_id // ); + // if start_tid == std::thread::current().id() { + // TASK_SAME_THREAD.fetch_add(1, Ordering::Relaxed); + // } else { + // TASK_DIFF_THREAD.fetch_add(1, 
Ordering::Relaxed); + // } }; self.executor.submit_task(future); } @@ -245,6 +299,7 @@ impl Scheduler { let num_tasks = self.num_tasks.clone(); let max_tasks = self.max_tasks.clone(); let future = async move { + // let start_tid = std::thread::current().id(); num_tasks.fetch_add(1, Ordering::Relaxed); let _task_id = max_tasks.fetch_add(1, Ordering::Relaxed); // println!( @@ -259,10 +314,47 @@ impl Scheduler { // std::thread::current().id(), // task_id // ); + // if start_tid == std::thread::current().id() { + // TASK_SAME_THREAD.fetch_add(1, Ordering::Relaxed); + // } else { + // TASK_DIFF_THREAD.fetch_add(1, Ordering::Relaxed); + // } }; self.executor.submit_immediate_task(future); } + pub(crate) fn submit_io_task(&self, task: F) + where + F: Future + Send + 'static, + { + let num_tasks = self.num_tasks.clone(); + let max_tasks = self.max_tasks.clone(); + let future = async move { + // let start_tid = std::thread::current().id(); + num_tasks.fetch_add(1, Ordering::Relaxed); + let _task_id = max_tasks.fetch_add(1, Ordering::Relaxed); + // println!( + // "[{:?}] execing new task {:?}", + // std::thread::current().id(), + // task_id + // ); + task.await; + num_tasks.fetch_sub(1, Ordering::Relaxed); + // println!( + // "[{:?}] done new task {:?} ", + // std::thread::current().id(), + // task_id + // ); + // if start_tid == std::thread::current().id() { + // IO_SAME_THREAD.fetch_add(1, Ordering::Relaxed); + // } else { + // IO_DIFF_THREAD.fetch_add(1, Ordering::Relaxed); + // } + }; + + self.executor.submit_io_task(future); + } + pub(crate) fn exec_task(&self) { // if std::thread::current().id() == *crate::MAIN_THREAD { self.executor.exec_task(); @@ -290,6 +382,14 @@ impl Scheduler { } pub(crate) fn active(&self) -> bool { + // if self.status.load(Ordering::SeqCst) == SchedulerStatus::Finished as u8 { + // println!( + // "active: {:?} {:?}", + // self.status.load(Ordering::SeqCst), + // self.num_tasks.load(Ordering::SeqCst) + // ); + // } + self.status.load(Ordering::SeqCst) == SchedulerStatus::Active as u8 || self.num_tasks.load(Ordering::SeqCst) > 3 // the Lamellae Comm Task, Lamellae Alloc Task, Lamellar Error Task } @@ -317,6 +417,21 @@ impl Scheduler { std::thread::yield_now() } self.executor.shutdown(); + // println!( + // "AM_SAME: {:?} AM_DIFF: {:?}", + // AM_SAME_THREAD.load(Ordering::Relaxed), + // AM_DIFF_THREAD.load(Ordering::Relaxed) + // ); + // println!( + // "TASK_SAME: {:?} TASK_DIFF: {:?}", + // TASK_SAME_THREAD.load(Ordering::Relaxed), + // TASK_DIFF_THREAD.load(Ordering::Relaxed) + // ); + // println!( + // "IO_SAME: {:?} IO_DIFF: {:?}", + // IO_SAME_THREAD.load(Ordering::Relaxed), + // IO_DIFF_THREAD.load(Ordering::Relaxed) + // ); } pub(crate) fn force_shutdown(&self) { self.status @@ -337,6 +452,14 @@ pub(crate) fn create_scheduler( ExecutorType::LamellarWorkStealing => { WorkStealing::new(num_workers, status.clone(), panic.clone()).into() } + ExecutorType::LamellarWorkStealing2 => { + WorkStealing2::new(num_workers, status.clone(), panic.clone()).into() + } + ExecutorType::LamellarWorkStealing3 => { + WorkStealing3::new(num_workers, status.clone(), panic.clone()).into() + } + ExecutorType::AsyncStd => AsyncStdRt::new(num_workers).into(), + #[cfg(feature = "tokio-executor")] ExecutorType::Tokio => TokioRt::new(num_workers).into(), }); diff --git a/src/scheduler/async_std_executor.rs b/src/scheduler/async_std_executor.rs new file mode 100644 index 00000000..1de78415 --- /dev/null +++ b/src/scheduler/async_std_executor.rs @@ -0,0 +1,85 @@ +use 
crate::scheduler::LamellarExecutor; + +use async_std::task; + +use futures_util::Future; + +#[derive(Debug)] +pub(crate) struct AsyncStdRt { + max_num_threads: usize, +} + +impl LamellarExecutor for AsyncStdRt { + fn submit_task(&self, task: F) + where + F: Future + Send + 'static, + F::Output: Send, + { + // trace_span!("submit_task").in_scope(|| { + task::spawn(async move { task.await }); + // }); + } + fn submit_io_task(&self, task: F) + where + F: Future + Send + 'static, + F::Output: Send, + { + // trace_span!("submit_task").in_scope(|| { + task::spawn(async move { task.await }); + // }); + } + + fn submit_immediate_task(&self, task: F) + where + F: Future + Send + 'static, + F::Output: Send, + { + // trace_span!("submit_task").in_scope(|| { + task::spawn(async move { task.await }); + // }); + } + + fn block_on(&self, task: F) -> F::Output { + // trace_span!("block_on").in_scope(|| + task::block_on(task) + // ) + } + + // #[tracing::instrument(skip_all)] + fn shutdown(&self) { + // i think we just let tokio do this on drop + } + + // #[tracing::instrument(skip_all)] + fn force_shutdown(&self) { + // i think we just let tokio do this on drop + } + + // #[tracing::instrument(skip_all)] + fn exec_task(&self) { + // I dont think tokio has a way to do this + } + + fn set_max_workers(&mut self, num_workers: usize) { + self.max_num_threads = num_workers; + } + + fn num_workers(&self) -> usize { + self.max_num_threads + } +} + +impl AsyncStdRt { + pub(crate) fn new(num_workers: usize) -> AsyncStdRt { + // println!("New TokioRT with {} workers", num_workers); + async_global_executor::init_with_config( + async_global_executor::GlobalExecutorConfig::default() + .with_min_threads(num_workers) + .with_max_threads(num_workers) + .with_thread_name_fn(Box::new(|| "lamellar_worker".to_string())), + ); + Self { + max_num_threads: num_workers + 1, + } + } +} diff --git a/src/scheduler/tokio_executor.rs b/src/scheduler/tokio_executor.rs index de44e311..de5f3b86 100644 --- a/src/scheduler/tokio_executor.rs +++ b/src/scheduler/tokio_executor.rs @@ -20,6 +20,15 @@ impl LamellarExecutor for TokioRt { self.rt.spawn(async move { task.await }); // }); } + fn submit_io_task(&self, task: F) + where + F: Future + Send + 'static, + F::Output: Send, + { + // trace_span!("submit_task").in_scope(|| { + self.rt.spawn(async move { task.await }); + // }); + } fn submit_immediate_task(&self, task: F) where diff --git a/src/scheduler/work_stealing.rs b/src/scheduler/work_stealing.rs index b53651d9..8a078558 100644 --- a/src/scheduler/work_stealing.rs +++ b/src/scheduler/work_stealing.rs @@ -146,6 +146,23 @@ impl LamellarExecutor for WorkStealing { // }); } + fn submit_io_task(&self, task: F) + where + F: Future + Send + 'static, + F::Output: Send, + { + // trace_span!("submit_task").in_scope(|| { + let work_inj = self.work_inj.clone(); + let schedule = move |runnable| work_inj.push(runnable); + let (runnable, task) = Builder::new() + .metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)) + .spawn(move |_task_id| async move { task.await }, schedule); + + runnable.schedule(); + task.detach(); + // }); + } + fn submit_immediate_task(&self, task: F) where F: Future + Send + 'static, diff --git a/src/scheduler/work_stealing2.rs b/src/scheduler/work_stealing2.rs new file mode 100644 index 00000000..33115003 --- /dev/null +++ b/src/scheduler/work_stealing2.rs @@ -0,0 +1,524 @@ +use crate::scheduler::{LamellarExecutor, SchedulerStatus}; +use crate::MAIN_THREAD; + +//use tracing::*; + +use async_task::{Builder, Runnable}; +use 
core_affinity::CoreId; +use crossbeam::deque::{Injector, Stealer, Worker}; +use futures_util::Future; +use rand::distributions::Uniform; +use rand::prelude::*; +use std::collections::HashMap; +use std::panic; +use std::pin::Pin; +use std::process; +use std::sync::atomic::{AtomicU8, AtomicUsize, Ordering}; +use std::sync::Arc; +use std::task::Context; +use std::task::Poll; +//, Weak}; +use std::thread::{self, ThreadId}; + +static TASK_ID: AtomicUsize = AtomicUsize::new(0); + +#[derive(Debug)] +struct TaskQueue { + injector: Arc>>, + stealers: Vec>>, + tasks: Worker>, + work_flag: Arc, +} + +impl TaskQueue { + fn get_task(&self, t: &Uniform, rng: &mut ThreadRng) -> Option> { + self.tasks.pop().or_else(|| { + if self + .work_flag + .compare_exchange(0, 1, Ordering::SeqCst, Ordering::Relaxed) + == Ok(0) + { + let ret = self.injector.steal_batch_and_pop(&self.tasks).success(); + self.work_flag.store(0, Ordering::SeqCst); + ret + } else { + self.stealers[t.sample(rng)].steal().success() + } + }) + } + + fn is_empty(&self) -> bool { + self.tasks.is_empty() && self.injector.is_empty() + } +} + +#[derive(Debug)] +pub(crate) struct WorkStealingThread { + imm_inj: Arc>>, + group_queue: TaskQueue, + global_injs: Vec>>>, + status: Arc, + panic: Arc, +} + +impl WorkStealingThread { + //#[tracing::instrument(skip_all)] + fn run( + worker: WorkStealingThread, + active_cnt: Arc, + // num_tasks: Arc, + id: CoreId, + ) -> thread::JoinHandle<()> { + let builder = thread::Builder::new().name("worker_thread".into()); + builder + .spawn(move || { + // println!("TestSchdulerWorker thread running {:?} core: {:?}", std::thread::current().id(), id); + // let _span = trace_span!("WorkStealingThread::run"); + // println!( + // "Woker Thread {:?} core: {:?}, global_injs: {:?}, group_queue.stealers: {:?}", + // std::thread::current().id(), + // id, + // worker.global_injs.len(), + // worker.group_queue.stealers.len() + // ); + core_affinity::set_for_current(id); + active_cnt.fetch_add(1, Ordering::SeqCst); + let mut rng = rand::thread_rng(); + let global_inj_dist = Uniform::new(0, worker.global_injs.len()); + let group_dist = Uniform::new(0, worker.group_queue.stealers.len()); + let mut timer = std::time::Instant::now(); + while worker.panic.load(Ordering::SeqCst) == 0 + && (worker.status.load(Ordering::SeqCst) == SchedulerStatus::Active as u8 + || !(worker.group_queue.is_empty() + && worker.imm_inj.is_empty() + && worker.global_injs.iter().all(|i| i.is_empty()))) + { + let omsg = if !worker.imm_inj.is_empty() { + worker.imm_inj.steal().success() + } else { + worker + .group_queue + .get_task(&group_dist, &mut rng) + .or_else(|| { + let i = global_inj_dist.sample(&mut rng); + worker.global_injs[i] + .steal_batch_and_pop(&(worker.group_queue.tasks)) + .success() + }) + }; + + if let Some(runnable) = omsg { + if worker.status.load(Ordering::SeqCst) == SchedulerStatus::Finished as u8 + && timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT + { + println!("runnable {:?}", runnable); + println!( + "work_q size {:?} work inj size {:?}", // num_tasks {:?}", + worker.group_queue.tasks.len(), + worker.group_queue.injector.len(), + // num_tasks.load(Ordering::SeqCst) + ); + timer = std::time::Instant::now(); + } + runnable.run(); + } + if worker.status.load(Ordering::SeqCst) == SchedulerStatus::Finished as u8 + && timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT + && !worker.group_queue.is_empty() + { + println!( + "work_q size {:?} work inj size {:?} ", // num_tasks {:?}", + worker.group_queue.tasks.len(), + 
worker.group_queue.injector.len(), + // num_tasks.load(Ordering::SeqCst) + ); + timer = std::time::Instant::now(); + } + std::thread::yield_now(); + } + active_cnt.fetch_sub(1, Ordering::SeqCst); + // println!("TestSchdulerWorker thread shutting down"); + }) + .unwrap() + } +} + +#[derive(Debug)] +pub(crate) struct IoThread { + io_inj: Arc>>, + io_q: Worker>, + status: Arc, + panic: Arc, +} + +impl IoThread { + //#[tracing::instrument(skip_all)] + fn run(worker: IoThread, active_cnt: Arc, id: CoreId) -> thread::JoinHandle<()> { + let builder = thread::Builder::new().name("io_thread".into()); + builder + .spawn(move || { + core_affinity::set_for_current(id); + active_cnt.fetch_add(1, Ordering::SeqCst); + let mut timer = std::time::Instant::now(); + while worker.panic.load(Ordering::SeqCst) == 0 + && (worker.status.load(Ordering::SeqCst) == SchedulerStatus::Active as u8 + || !(worker.io_q.is_empty() && worker.io_inj.is_empty())) + { + let io_task = worker + .io_q + .pop() + .or_else(|| worker.io_inj.steal_batch_and_pop(&worker.io_q).success()); + if let Some(runnable) = io_task { + if worker.status.load(Ordering::SeqCst) == SchedulerStatus::Finished as u8 + && timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT + { + println!( + "io_q size {:?} io inj size {:?} ", // num_tasks {:?}", + worker.io_q.len(), + worker.io_inj.len(), + // num_tasks.load(Ordering::SeqCst) + ); + timer = std::time::Instant::now(); + } + runnable.run(); + } + + if worker.status.load(Ordering::SeqCst) == SchedulerStatus::Finished as u8 + && timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT + && (worker.io_q.len() > 0 || worker.io_inj.len() > 0) + { + println!( + "io_q size {:?} io inj size {:?} ", // num_tasks {:?}", + worker.io_q.len(), + worker.io_inj.len(), + // num_tasks.load(Ordering::SeqCst) + ); + timer = std::time::Instant::now(); + } + std::thread::yield_now(); + } + active_cnt.fetch_sub(1, Ordering::SeqCst); + }) + .unwrap() + } +} + +#[derive(Debug)] +pub(crate) struct WorkStealing2 { + max_num_threads: usize, + threads: Vec>, + imm_inj: Arc>>, + io_inj: Arc>>, + work_injs: Vec>>>, + work_stealers: Vec>>, + work_flag: Arc, + status: Arc, + active_cnt: Arc, + panic: Arc, + num_threads_per_group: usize, + cur_inj: Arc, + inj_map: HashMap, +} + +impl LamellarExecutor for WorkStealing2 { + fn submit_task(&self, task: F) + where + F: Future + Send + 'static, + F::Output: Send, + { + // trace_span!("submit_task").in_scope(|| { + let work_inj = self.get_injector(); + let schedule = move |runnable| work_inj.push(runnable); + let (runnable, task) = Builder::new() + .metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)) + .spawn(move |_task_id| async move { task.await }, schedule); + + runnable.schedule(); + task.detach(); + // }); + } + + fn submit_io_task(&self, task: F) + where + F: Future + Send + 'static, + F::Output: Send, + { + // trace_span!("submit_task").in_scope(|| { + let io_inj = self.get_injector(); + let schedule = move |runnable| io_inj.push(runnable); + let (runnable, task) = Builder::new() + .metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)) + .spawn(move |_task_id| async move { task.await }, schedule); + + runnable.schedule(); + task.detach(); + // }); + } + + fn submit_immediate_task(&self, task: F) + where + F: Future + Send + 'static, + F::Output: Send, + { + // trace_span!("submit_task").in_scope(|| { + let imm_inj = self.imm_inj.clone(); + let schedule = move |runnable| imm_inj.push(runnable); + let (runnable, task) = Builder::new() + .metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)) + 
.spawn(move |_task_id| async move { task.await }, schedule); + + runnable.run(); //try to run immediately + task.detach(); + // }); + } + + fn block_on(&self, fut: F) -> F::Output { + // trace_span!("block_on").in_scope(|| { + let work_inj = self.get_injector(); + let schedule = move |runnable| work_inj.push(runnable); + let (runnable, mut task) = unsafe { + Builder::new() + .metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)) + .spawn_unchecked(move |_task_id| async move { fut.await }, schedule) + }; + let waker = runnable.waker(); + runnable.run(); //try to run immediately + while !task.is_finished() { + self.exec_task(); //try to execute another task while this one is not ready + } + let cx = &mut Context::from_waker(&waker); + if let Poll::Ready(output) = Pin::new(&mut task).poll(cx) { + output + } else { + println!( + "[{:?}] work stealing block on failed -- task id{:?}", + std::thread::current().id(), + task.metadata() + ); + panic!("task not ready"); + } + // }) + } + + //#[tracing::instrument(skip_all)] + fn shutdown(&self) { + while self.panic.load(Ordering::SeqCst) == 0 && self.active_cnt.load(Ordering::Relaxed) > 0 + { + //num active threads + self.exec_task(); + std::thread::yield_now() + } + } + + //#[tracing::instrument(skip_all)] + fn force_shutdown(&self) { + // println!("work stealing shuting down {:?}", self.status()); + + // println!("work stealing shuting down {:?}",self.status()); + let my_id = std::thread::current().id(); + if self.threads.iter().any(|e| e.thread().id() == my_id) { + self.active_cnt.fetch_sub(1, Ordering::SeqCst); // I paniced so I wont actually decrement + } else { + while self.active_cnt.load(Ordering::Relaxed) > 0 { + //num active threads + self.exec_task(); + std::thread::yield_now() + } + } + // println!( + // "work stealing shut down {:?} {:?} {:?}", + // self.status(), + // self.active_cnt.load(Ordering::Relaxed), + // self.active_cnt.load(Ordering::Relaxed) + // ); + } + + //#[tracing::instrument(skip_all)] + fn exec_task(&self) { + let mut rng = rand::thread_rng(); + let t = rand::distributions::Uniform::new(0, self.work_stealers.len()); + let ret = if !self.imm_inj.is_empty() { + self.imm_inj.steal().success() + } else { + self.get_injector() + .steal() + .success() + .or_else(|| self.work_stealers[t.sample(&mut rng)].steal().success()) + }; + if let Some(runnable) = ret { + runnable.run(); + } + } + + fn set_max_workers(&mut self, num_workers: usize) { + self.max_num_threads = num_workers; + } + + fn num_workers(&self) -> usize { + self.max_num_threads + } +} + +impl WorkStealing2 { + pub(crate) fn new( + num_workers: usize, + status: Arc, + panic: Arc, + ) -> WorkStealing2 { + // println!("new work stealing queue"); + let mut num_threads_per_group = match std::env::var("LAMELLAR_WS2_THREADS") { + Ok(s) => { + if let Ok(num) = s.parse::() { + num + } else { + 4 + } + } + _ => 4, + }; + if num_threads_per_group > num_workers { + num_threads_per_group = num_workers + } + + let mut ws = WorkStealing2 { + max_num_threads: num_workers, + threads: Vec::new(), + imm_inj: Arc::new(Injector::new()), + io_inj: Arc::new(Injector::new()), + work_injs: Vec::new(), + work_stealers: Vec::new(), + work_flag: Arc::new(AtomicU8::new(0)), + status: status, + active_cnt: Arc::new(AtomicUsize::new(0)), + panic: panic, + num_threads_per_group: num_threads_per_group, + cur_inj: Arc::new(AtomicU8::new(0)), + inj_map: HashMap::new(), + }; + ws.init(); + ws + } + // #[tracing::instrument(skip_all)] + fn init(&mut self) { + let mut num_groups = self.max_num_threads / 
self.num_threads_per_group; + if self.max_num_threads % self.num_threads_per_group != 0 { + num_groups += 1; + } + + for _i in 0..num_groups { + self.work_injs.push(Arc::new(Injector::new())); + } + + let mut work_workers = vec![]; + for _i in 0..self.max_num_threads { + let work_worker: Worker> = Worker::new_fifo(); + self.work_stealers.push(work_worker.stealer()); + work_workers.push(work_worker); + } + + let orig_hook = panic::take_hook(); + panic::set_hook(Box::new(move |panic_info| { + // invoke the default handler and exit the process + orig_hook(panic_info); + process::exit(1); + })); + let core_ids = match core_affinity::get_core_ids() { + Some(core_ids) => core_ids, + None => { + vec![core_affinity::CoreId { id: 0 }] + } + }; + // println!("core_ids: {:?}",core_ids); + + // println!( + // "num threads: {} {} num_groups: {}", + // self.max_num_threads, + // core_ids.len(), + // num_groups + // ); + + let mut thread_cnt = 0; + for (group_id, group_stealers) in self + .work_stealers + .chunks(self.num_threads_per_group) + .enumerate() + { + // println!("init group {} {:?}", group_id, group_stealers.len()); + let work_flag = Arc::new(AtomicU8::new(0)); + for _ in 0..group_stealers.len() { + let group_queue = TaskQueue { + tasks: work_workers.pop().unwrap(), + injector: self.work_injs[group_id].clone(), + stealers: group_stealers.to_vec(), + work_flag: work_flag.clone(), + }; + let mut work_injs = vec![]; + for (i, inj) in self.work_injs.iter().enumerate() { + if i != group_id || num_groups == 1 { + work_injs.push(inj.clone()); + } + } + + let worker = WorkStealingThread { + imm_inj: self.imm_inj.clone(), + group_queue: group_queue, + global_injs: work_injs, + status: self.status.clone(), + panic: self.panic.clone(), + }; + let thread = WorkStealingThread::run( + worker, + self.active_cnt.clone(), + core_ids[thread_cnt % core_ids.len()], + ); + thread_cnt += 1; + self.inj_map.insert(thread.thread().id(), group_id); + self.threads.push(thread); + } + } + + // let io_thread = IoThread { + // io_inj: self.io_inj.clone(), + // io_q: crossbeam::deque::Worker::new_fifo(), + // status: self.status.clone(), + // panic: self.panic.clone(), + // }; + // self.threads.push(IoThread::run( + // io_thread, + // self.active_cnt.clone(), + // core_ids[self.max_num_threads % core_ids.len()], + // )); + while self.active_cnt.load(Ordering::SeqCst) != self.threads.len() { + std::thread::yield_now(); + } + } + + fn get_injector(&self) -> Arc>> { + let tid = thread::current().id(); + if tid == *MAIN_THREAD { + self.work_injs + [self.cur_inj.fetch_add(1, Ordering::Relaxed) as usize % self.work_injs.len()] + .clone() + } else { + self.work_injs[*self + .inj_map + .get(&tid) + .expect("Thread ID Should be registered")] + .clone() + } + } +} + +impl Drop for WorkStealing2 { + //when is this called with respect to world? 
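// Aside: the TaskQueue::get_task logic above is an adaptation of the standard
// crossbeam-deque pattern (try the thread-local queue first, then refill from a
// shared injector, then steal from sibling workers), specialized with a
// work_flag to limit contention on the injector and with stealers restricted to
// the thread's group. A minimal, self-contained sketch of that underlying
// pattern is shown below; the function name `find_task` is illustrative only
// and not part of Lamellar.
use crossbeam::deque::{Injector, Stealer, Worker};
use std::iter;

fn find_task<T>(
    local: &Worker<T>,
    global: &Injector<T>,
    stealers: &[Stealer<T>],
) -> Option<T> {
    // Prefer the thread-local queue, then refill a batch from the global
    // injector, and finally try to steal from a sibling worker.
    local.pop().or_else(|| {
        iter::repeat_with(|| {
            global
                .steal_batch_and_pop(local)
                .or_else(|| stealers.iter().map(|s| s.steal()).collect())
        })
        .find(|s| !s.is_retry())
        .and_then(|s| s.success())
    })
}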
+ //#[tracing::instrument(skip_all)] + fn drop(&mut self) { + // println!("dropping work stealing"); + while let Some(thread) = self.threads.pop() { + if thread.thread().id() != std::thread::current().id() { + let _res = thread.join(); + } + } + // println!("WorkStealing Scheduler Dropped"); + } +} diff --git a/src/scheduler/work_stealing3.rs b/src/scheduler/work_stealing3.rs new file mode 100644 index 00000000..8de0f992 --- /dev/null +++ b/src/scheduler/work_stealing3.rs @@ -0,0 +1,383 @@ +use crate::scheduler::{LamellarExecutor, SchedulerStatus}; +use crate::MAIN_THREAD; + +//use tracing::*; + +use async_task::{Builder, Runnable}; +use core_affinity::CoreId; +use futures_util::Future; +use rand::prelude::*; +use std::panic; +use std::pin::Pin; +use std::process; +use std::sync::atomic::{AtomicU8, AtomicUsize, Ordering}; +use std::sync::Arc; +use std::task::Context; +use std::task::Poll; +//, Weak}; +use std::thread; + +use crossbeam::deque::{Injector, Stealer, Worker}; +use thread_local::ThreadLocal; + +static TASK_ID: AtomicUsize = AtomicUsize::new(0); + +lazy_static! { + static ref WORK_Q: ThreadLocal>> = ThreadLocal::new(); +} + +#[derive(Debug)] +pub(crate) struct WorkStealingThread { + imm_inj: Arc>>, + work_inj: Arc>>, + work_stealers: Vec>>, + // work_q: Arc>>>, + work_flag: Arc, + status: Arc, + panic: Arc, +} + +impl WorkStealingThread { + //#[tracing::instrument(skip_all)] + fn run( + worker: WorkStealingThread, + work_q: Worker>, + active_cnt: Arc, + // num_tasks: Arc, + id: CoreId, + ) -> thread::JoinHandle<()> { + let builder = thread::Builder::new().name("worker_thread".into()); + builder + .spawn(move || { + // println!("TestSchdulerWorker thread running {:?} core: {:?}", std::thread::current().id(), id); + // let _span = trace_span!("WorkStealingThread::run"); + core_affinity::set_for_current(id); + let work_q = WORK_Q.get_or(|| work_q); + active_cnt.fetch_add(1, Ordering::SeqCst); + let mut rng = rand::thread_rng(); + let t = rand::distributions::Uniform::from(0..worker.work_stealers.len()); + let mut timer = std::time::Instant::now(); + while worker.panic.load(Ordering::SeqCst) == 0 + && ( + worker.status.load(Ordering::SeqCst) == SchedulerStatus::Active as u8 + || !(work_q.is_empty() + && worker.work_inj.is_empty() + && worker.imm_inj.is_empty()) + // || num_tasks.load(Ordering::SeqCst) > 1 + ) + { + let omsg = if !worker.imm_inj.is_empty() { + worker.imm_inj.steal().success() + } else { + work_q.pop().or_else(|| { + if worker.work_flag.compare_exchange( + 0, + 1, + Ordering::SeqCst, + Ordering::Relaxed, + ) == Ok(0) + { + let ret = worker.work_inj.steal_batch_and_pop(work_q).success(); + worker.work_flag.store(0, Ordering::SeqCst); + ret + } else { + let pe = t.sample(&mut rng); + if worker.work_stealers[pe].len() > 100 { + worker.work_stealers[t.sample(&mut rng)].steal().success() + } else { + None + } + } + }) + }; + + if let Some(runnable) = omsg { + if worker.status.load(Ordering::SeqCst) == SchedulerStatus::Finished as u8 + && timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT + { + println!("runnable {:?}", runnable); + println!( + "work_q size {:?} work inj size {:?}", // num_tasks {:?}", + work_q.len(), + worker.work_inj.len(), + // num_tasks.load(Ordering::SeqCst) + ); + timer = std::time::Instant::now(); + } + runnable.run(); + } + if worker.status.load(Ordering::SeqCst) == SchedulerStatus::Finished as u8 + && timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT + && (work_q.len() > 0 || worker.work_inj.len() > 0) + { + println!( + "work_q size 
{:?} work inj size {:?} ", // num_tasks {:?}", + work_q.len(), + worker.work_inj.len(), + // num_tasks.load(Ordering::SeqCst) + ); + timer = std::time::Instant::now(); + } + std::thread::yield_now(); + } + active_cnt.fetch_sub(1, Ordering::SeqCst); + // println!("TestSchdulerWorker thread shutting down"); + }) + .unwrap() + } +} + +#[derive(Debug)] +pub(crate) struct WorkStealing3 { + max_num_threads: usize, + threads: Vec>, + imm_inj: Arc>>, + work_inj: Arc>>, + work_stealers: Vec>>, + work_flag: Arc, + status: Arc, + active_cnt: Arc, + panic: Arc, +} + +impl LamellarExecutor for WorkStealing3 { + fn submit_task(&self, task: F) + where + F: Future + Send + 'static, + F::Output: Send, + { + // trace_span!("submit_task").in_scope(|| { + let work_inj = self.work_inj.clone(); + let schedule = move |runnable| { + // if thread::current().id() == *MAIN_THREAD { + work_inj.push(runnable); + // } else { + // WORK_Q.get().unwrap().push(runnable); + // } + }; + let (runnable, task) = Builder::new() + .metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)) + .spawn(move |_task_id| async move { task.await }, schedule); + + runnable.schedule(); + task.detach(); + // }); + } + + fn submit_io_task(&self, task: F) + where + F: Future + Send + 'static, + F::Output: Send, + { + // trace_span!("submit_task").in_scope(|| { + + let work_inj = self.work_inj.clone(); + let schedule = move |runnable| { + if thread::current().id() == *MAIN_THREAD { + work_inj.push(runnable); + } else { + WORK_Q.get().unwrap().push(runnable); + } + }; + let (runnable, task) = Builder::new() + .metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)) + .spawn(move |_task_id| async move { task.await }, schedule); + runnable.schedule(); + task.detach(); + // }); + } + + fn submit_immediate_task(&self, task: F) + where + F: Future + Send + 'static, + F::Output: Send, + { + // trace_span!("submit_task").in_scope(|| { + let imm_inj = self.imm_inj.clone(); + let schedule = move |runnable| imm_inj.push(runnable); + let (runnable, task) = Builder::new() + .metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)) + .spawn(move |_task_id| async move { task.await }, schedule); + + runnable.run(); //try to run immediately + task.detach(); + // }); + } + + fn block_on(&self, fut: F) -> F::Output { + // trace_span!("block_on").in_scope(|| { + let work_inj = self.work_inj.clone(); + let schedule = move |runnable| work_inj.push(runnable); + let (runnable, mut task) = unsafe { + Builder::new() + .metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)) + .spawn_unchecked(move |_task_id| async move { fut.await }, schedule) + }; + let waker = runnable.waker(); + runnable.run(); //try to run immediately + while !task.is_finished() { + self.exec_task(); //try to execute another task while this one is not ready + } + let cx = &mut Context::from_waker(&waker); + if let Poll::Ready(output) = Pin::new(&mut task).poll(cx) { + output + } else { + println!( + "[{:?}] work stealing block on failed -- task id{:?}", + std::thread::current().id(), + task.metadata() + ); + panic!("task not ready"); + } + // }) + } + + //#[tracing::instrument(skip_all)] + fn shutdown(&self) { + while self.panic.load(Ordering::SeqCst) == 0 && self.active_cnt.load(Ordering::Relaxed) > 0 + { + //num active threads + self.exec_task(); + } + } + + //#[tracing::instrument(skip_all)] + fn force_shutdown(&self) { + // println!("work stealing shuting down {:?}", self.status()); + + // println!("work stealing shuting down {:?}",self.status()); + let my_id = std::thread::current().id(); + if self.threads.iter().any(|e| 
e.thread().id() == my_id) { + self.active_cnt.fetch_sub(1, Ordering::SeqCst); // I paniced so I wont actually decrement + } else { + while self.active_cnt.load(Ordering::Relaxed) > 0 { + //num active threads + self.exec_task(); + } + } + // println!( + // "work stealing shut down {:?} {:?} {:?}", + // self.status(), + // self.active_cnt.load(Ordering::Relaxed), + // self.active_cnt.load(Ordering::Relaxed) + // ); + } + + //#[tracing::instrument(skip_all)] + fn exec_task(&self) { + let mut rng = rand::thread_rng(); + let t = rand::distributions::Uniform::from(0..self.work_stealers.len()); + let ret = if !self.imm_inj.is_empty() { + self.imm_inj.steal().success() + } else { + if self + .work_flag + .compare_exchange(0, 1, Ordering::SeqCst, Ordering::Relaxed) + == Ok(0) + { + let ret = self.work_inj.steal().success(); + self.work_flag.store(0, Ordering::SeqCst); + ret + } else { + // self.work_stealers[t.sample(&mut rng)].steal().success() + None + } + }; + if let Some(runnable) = ret { + runnable.run(); + } else { + std::thread::yield_now(); + } + } + + fn set_max_workers(&mut self, num_workers: usize) { + self.max_num_threads = num_workers; + } + + fn num_workers(&self) -> usize { + self.max_num_threads + } +} + +impl WorkStealing3 { + pub(crate) fn new( + num_workers: usize, + status: Arc, + panic: Arc, + ) -> WorkStealing3 { + // println!("new work stealing queue"); + let mut ws = WorkStealing3 { + max_num_threads: num_workers, + threads: Vec::new(), + imm_inj: Arc::new(Injector::new()), + work_inj: Arc::new(Injector::new()), + work_stealers: Vec::new(), + work_flag: Arc::new(AtomicU8::new(0)), + status: status, + active_cnt: Arc::new(AtomicUsize::new(0)), + panic: panic, + }; + ws.init(); + ws + } + // #[tracing::instrument(skip_all)] + fn init(&mut self) { + let mut work_workers: std::vec::Vec>> = vec![]; + for _i in 0..self.max_num_threads { + let work_worker: crossbeam::deque::Worker> = + crossbeam::deque::Worker::new_fifo(); + self.work_stealers.push(work_worker.stealer()); + work_workers.push(work_worker); + } + + let orig_hook = panic::take_hook(); + panic::set_hook(Box::new(move |panic_info| { + // invoke the default handler and exit the process + orig_hook(panic_info); + process::exit(1); + })); + let core_ids = match core_affinity::get_core_ids() { + Some(core_ids) => core_ids, + None => { + vec![core_affinity::CoreId { id: 0 }] + } + }; + // println!("core_ids: {:?}",core_ids); + println!("num threads: {} {}", self.max_num_threads, core_ids.len()); + for i in 0..self.max_num_threads { + let work_worker = work_workers.pop().unwrap(); + let worker = WorkStealingThread { + imm_inj: self.imm_inj.clone(), + work_inj: self.work_inj.clone(), + work_stealers: self.work_stealers.clone(), + work_flag: self.work_flag.clone(), + status: self.status.clone(), + panic: self.panic.clone(), + }; + self.threads.push(WorkStealingThread::run( + worker, + work_worker, + self.active_cnt.clone(), + // self.num_tasks.clone(), + core_ids[i % core_ids.len()], + )); + } + while self.active_cnt.load(Ordering::SeqCst) != self.threads.len() { + std::thread::yield_now(); + } + } +} + +impl Drop for WorkStealing3 { + //when is this called with respect to world? 
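// Aside: both WorkStealing2::block_on and WorkStealing3::block_on above follow
// the same strategy: convert the future into an async-task Runnable whose
// wake-ups push it back onto a work queue, keep executing other queued tasks
// until the Task handle reports completion, then poll the handle once to take
// the output. A stripped-down, self-contained sketch of that strategy follows;
// `simple_block_on` and its single shared Injector are illustrative
// simplifications, not the actual Lamellar implementation.
use async_task::Runnable;
use crossbeam::deque::Injector;
use futures_util::Future;
use std::pin::Pin;
use std::sync::Arc;
use std::task::{Context, Poll};

fn simple_block_on<F>(queue: Arc<Injector<Runnable>>, fut: F) -> F::Output
where
    F: Future + Send + 'static,
    F::Output: Send + 'static,
{
    let q = queue.clone();
    let schedule = move |runnable| q.push(runnable); // wake-ups re-enqueue the task
    let (runnable, mut task) = async_task::spawn(fut, schedule);
    let waker = runnable.waker();
    runnable.run(); // try to make progress immediately
    while !task.is_finished() {
        // Execute whatever else is queued while our future is pending.
        match queue.steal().success() {
            Some(other) => {
                other.run();
            }
            None => std::thread::yield_now(),
        }
    }
    // The task has finished, so a single poll yields its output.
    let mut cx = Context::from_waker(&waker);
    match Pin::new(&mut task).poll(&mut cx) {
        Poll::Ready(output) => output,
        Poll::Pending => unreachable!("finished task returned Pending"),
    }
}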
+ //#[tracing::instrument(skip_all)] + fn drop(&mut self) { + // println!("dropping work stealing"); + while let Some(thread) = self.threads.pop() { + if thread.thread().id() != std::thread::current().id() { + let _res = thread.join(); + } + } + // println!("WorkStealing Scheduler Dropped"); + } +} From f84c082f2df32e0af4b8a59c02acc1488490f509 Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Thu, 4 Apr 2024 17:09:47 -0700 Subject: [PATCH 018/116] refactoring environment variables --- Cargo.toml | 1 + impl/src/gen_am_group.rs | 9 +- src/array/operations.rs | 137 +++++++++++++++++----------- src/array/unsafe.rs | 5 +- src/array/unsafe/operations.rs | 54 ++++++----- src/barrier.rs | 20 ++-- src/darc.rs | 15 +-- src/lamellae.rs | 40 +++++--- src/lamellae/command_queues.rs | 19 ++-- src/lamellae/rofi/rofi_comm.rs | 15 ++- src/lamellae/rofi_lamellae.rs | 8 +- src/lamellae/shmem/shmem_comm.rs | 14 ++- src/lamellae/shmem_lamellae.rs | 9 +- src/lamellar_alloc.rs | 6 +- src/lamellar_task_group.rs | 5 +- src/lamellar_team.rs | 11 ++- src/lamellar_world.rs | 113 +++++++++++++---------- src/lib.rs | 10 +- src/scheduler.rs | 53 +++++++---- src/scheduler/async_std_executor.rs | 2 +- src/scheduler/tokio_executor.rs | 4 +- src/scheduler/work_stealing.rs | 7 +- src/scheduler/work_stealing2.rs | 14 +-- src/scheduler/work_stealing3.rs | 7 +- 24 files changed, 347 insertions(+), 231 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 6af7798b..fa297f34 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -46,6 +46,7 @@ thread_local = "1.1.4" tokio = { version = "1.35.1", features = ["full"] , optional = true} libc = { version = "0.2.137", optional = true } async-global-executor = "2.4.1" +envy = "0.4.2" diff --git a/impl/src/gen_am_group.rs b/impl/src/gen_am_group.rs index c57ae4ad..d2bf9bf4 100644 --- a/impl/src/gen_am_group.rs +++ b/impl/src/gen_am_group.rs @@ -317,10 +317,11 @@ fn impl_am_group_user( quote! 
{ impl #am_user_impl_generics #am_group_name_user #am_user_ty_generics #am_user_where_clause{ pub fn new(team: std::sync::Arc<#lamellar::LamellarTeam>) -> Self { - let num_per_batch = match std::env::var("LAMELLAR_OP_BATCH") { - Ok(n) => n.parse::().unwrap(), - Err(_) => 10000, - }; + let num_per_batch = #lamellar::config().batch_op_size; + // match std::env::var("LAMELLAR_OP_BATCH") { + // Ok(n) => n.parse::().unwrap(), + // Err(_) => 10000, + // }; #am_group_name_user { team: team, batch_cnt: 0, diff --git a/src/array/operations.rs b/src/array/operations.rs index 3577198f..a9fd71ca 100644 --- a/src/array/operations.rs +++ b/src/array/operations.rs @@ -5,6 +5,7 @@ use crate::array::global_lock_atomic::*; use crate::array::local_lock_atomic::*; use crate::array::native_atomic::*; use crate::array::{AmDist, Dist, LamellarEnv, LamellarWriteArray}; +use crate::config; // use crate::lamellar_request::LamellarRequest; // use crate::scheduler::Scheduler; // use crate::LamellarTeamRT; @@ -452,13 +453,17 @@ impl<'a, T: Dist> OpInput<'a, T> for &'a [T] { let num = if len < 1000 { 1 } else { - match std::env::var("LAMELLAR_BATCH_OP_THREADS") { - Ok(n) => n.parse::().unwrap(), - Err(_) => match std::env::var("LAMELLAR_THREADS") { - Ok(n) => std::cmp::max(1, (n.parse::().unwrap()) / 4), - Err(_) => 4, - }, + match config().batch_op_threads { + Some(n) => n, + None => std::cmp::max(1, config().threads / 4), } + // match std::env::var("LAMELLAR_BATCH_OP_THREADS") { + // Ok(n) => n.parse::().unwrap(), + // Err(_) => match std::env::var("LAMELLAR_THREADS") { + // Ok(n) => std::cmp::max(1, (n.parse::().unwrap()) / 4), + // Err(_) => 4, + // }, + // } }; let num_per_batch = len / num; for i in 0..num { @@ -509,15 +514,19 @@ impl<'a, T: Dist> OpInput<'a, T> for &'a mut [T] { let num = if len < 1000 { 1 } else { - match std::env::var("LAMELLAR_BATCH_OP_THREADS") { - Ok(n) => n.parse::().unwrap(), - Err(_) => { - match std::env::var("LAMELLAR_THREADS") { - Ok(n) => std::cmp::max(1, (n.parse::().unwrap()) / 4), - Err(_) => 4, //+ 1 to account for main thread - } - } + match config().batch_op_threads { + Some(n) => n, + None => std::cmp::max(1, config().threads / 4), } + // match std::env::var("LAMELLAR_BATCH_OP_THREADS") { + // Ok(n) => n.parse::().unwrap(), + // Err(_) => { + // match std::env::var("LAMELLAR_THREADS") { + // Ok(n) => std::cmp::max(1, (n.parse::().unwrap()) / 4), + // Err(_) => 4, //+ 1 to account for main thread + // } + // } + // } }; let num_per_batch = len / num; for i in 0..num { @@ -584,15 +593,19 @@ impl<'a, T: Dist> OpInput<'a, T> for Vec { let num = if len < 1000 { 1 } else { - match std::env::var("LAMELLAR_BATCH_OP_THREADS") { - Ok(n) => n.parse::().unwrap(), - Err(_) => { - match std::env::var("LAMELLAR_THREADS") { - Ok(n) => std::cmp::max(1, (n.parse::().unwrap()) / 4), - Err(_) => 4, //+ 1 to account for main thread - } - } + match config().batch_op_threads { + Some(n) => n, + None => std::cmp::max(1, config().threads / 4), } + // match std::env::var("LAMELLAR_BATCH_OP_THREADS") { + // Ok(n) => n.parse::().unwrap(), + // Err(_) => { + // match std::env::var("LAMELLAR_THREADS") { + // Ok(n) => std::cmp::max(1, (n.parse::().unwrap()) / 4), + // Err(_) => 4, //+ 1 to account for main thread + // } + // } + // } }; let num_per_batch = len / num; let iters = self @@ -706,15 +719,19 @@ impl<'a, T: Dist> OpInput<'a, T> for &'a LocalLockLocalData { let num = if len < 1000 { 1 } else { - match std::env::var("LAMELLAR_BATCH_OP_THREADS") { - Ok(n) => n.parse::().unwrap(), - Err(_) => { - 
match std::env::var("LAMELLAR_THREADS") { - Ok(n) => std::cmp::max(1, (n.parse::().unwrap()) / 4), //+ 1 to account for main thread - Err(_) => 4, //+ 1 to account for main thread - } - } + match config().batch_op_threads { + Some(n) => n, + None => std::cmp::max(1, config().threads / 4), } + // match std::env::var("LAMELLAR_BATCH_OP_THREADS") { + // Ok(n) => n.parse::().unwrap(), + // Err(_) => { + // match std::env::var("LAMELLAR_THREADS") { + // Ok(n) => std::cmp::max(1, (n.parse::().unwrap()) / 4), //+ 1 to account for main thread + // Err(_) => 4, //+ 1 to account for main thread + // } + // } + // } }; let num_per_batch = len / num; // println!("num: {} len {:?} npb {:?}", num, len, num_per_batch); @@ -748,15 +765,19 @@ impl<'a, T: Dist> OpInput<'a, T> for &'a GlobalLockLocalData { let num = if len < 1000 { 1 } else { - match std::env::var("LAMELLAR_BATCH_OP_THREADS") { - Ok(n) => n.parse::().unwrap(), - Err(_) => { - match std::env::var("LAMELLAR_THREADS") { - Ok(n) => std::cmp::max(1, (n.parse::().unwrap()) / 4), //+ 1 to account for main thread - Err(_) => 4, //+ 1 to account for main thread - } - } + match config().batch_op_threads { + Some(n) => n, + None => std::cmp::max(1, config().threads / 4), } + // match std::env::var("LAMELLAR_BATCH_OP_THREADS") { + // Ok(n) => n.parse::().unwrap(), + // Err(_) => { + // match std::env::var("LAMELLAR_THREADS") { + // Ok(n) => std::cmp::max(1, (n.parse::().unwrap()) / 4), //+ 1 to account for main thread + // Err(_) => 4, //+ 1 to account for main thread + // } + // } + // } }; let num_per_batch = len / num; // println!("num: {} len {:?} npb {:?}", num, len, num_per_batch); @@ -824,15 +845,19 @@ impl<'a, T: Dist + ElementOps> OpInput<'a, T> for &GenericAtomicLocalData { let num = if len < 1000 { 1 } else { - match std::env::var("LAMELLAR_BATCH_OP_THREADS") { - Ok(n) => n.parse::().unwrap(), - Err(_) => { - match std::env::var("LAMELLAR_THREADS") { - Ok(n) => std::cmp::max(1, (n.parse::().unwrap()) / 4), - Err(_) => 4, //+ 1 to account for main thread - } - } + match config().batch_op_threads { + Some(n) => n, + None => std::cmp::max(1, config().threads / 4), } + // match std::env::var("LAMELLAR_BATCH_OP_THREADS") { + // Ok(n) => n.parse::().unwrap(), + // Err(_) => { + // match std::env::var("LAMELLAR_THREADS") { + // Ok(n) => std::cmp::max(1, (n.parse::().unwrap()) / 4), + // Err(_) => 4, //+ 1 to account for main thread + // } + // } + // } }; let num_per_batch = len / num; for i in 0..num { @@ -871,15 +896,19 @@ impl<'a, T: Dist + ElementOps> OpInput<'a, T> for &NativeAtomicLocalData { let num = if len < 1000 { 1 } else { - match std::env::var("LAMELLAR_BATCH_OP_THREADS") { - Ok(n) => n.parse::().unwrap(), - Err(_) => { - match std::env::var("LAMELLAR_THREADS") { - Ok(n) => std::cmp::max(1, (n.parse::().unwrap()) / 4), - Err(_) => 4, //+ 1 to account for main thread - } - } + match config().batch_op_threads { + Some(n) => n, + None => std::cmp::max(1, config().threads / 4), } + // match std::env::var("LAMELLAR_BATCH_OP_THREADS") { + // Ok(n) => n.parse::().unwrap(), + // Err(_) => { + // match std::env::var("LAMELLAR_THREADS") { + // Ok(n) => std::cmp::max(1, (n.parse::().unwrap()) / 4), + // Err(_) => 4, //+ 1 to account for main thread + // } + // } + // } }; let num_per_batch = len / num; // println!("num: {} len {:?} npb {:?}", num, len, num_per_batch); diff --git a/src/array/unsafe.rs b/src/array/unsafe.rs index 7c86d038..30ef0d8d 100644 --- a/src/array/unsafe.rs +++ b/src/array/unsafe.rs @@ -9,6 +9,7 @@ use 
crate::active_messaging::*; use crate::array::*; use crate::array::{LamellarRead, LamellarWrite}; use crate::darc::{Darc, DarcMode, WeakDarc}; +use crate::env_var::config; use crate::lamellae::AllocationType; use crate::lamellar_team::{IntoLamellarTeam, LamellarTeamRT}; use crate::memregion::{Dist, MemoryRegion}; @@ -452,7 +453,7 @@ impl UnsafeArray { // self.inner.data.team.flush(); // self.inner.data.team.scheduler.exec_task(); //mmight as well do useful work while we wait async_std::task::yield_now().await; - if temp_now.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { + if temp_now.elapsed().as_secs_f64() > config().deadlock_timeout { //|| first{ println!( "in array await_all mype: {:?} cnt: {:?} {:?} {:?}", @@ -946,7 +947,7 @@ impl LamellarArray for UnsafeArray { // std::thread::yield_now(); // self.inner.data.team.flush(); self.inner.data.team.scheduler.exec_task(); //mmight as well do useful work while we wait - if temp_now.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { + if temp_now.elapsed().as_secs_f64() > config().deadlock_timeout { //|| first{ println!( "in array wait_all mype: {:?} cnt: {:?} {:?} {:?}", diff --git a/src/array/unsafe/operations.rs b/src/array/unsafe/operations.rs index 842165dd..b7ecd855 100644 --- a/src/array/unsafe/operations.rs +++ b/src/array/unsafe/operations.rs @@ -2,6 +2,7 @@ use crate::active_messaging::LamellarArcAm; use crate::array::operations::*; use crate::array::r#unsafe::UnsafeArray; use crate::array::{AmDist, Dist, LamellarArray, LamellarByteArray, LamellarEnv}; +use crate::env_var::{config, IndexType}; use futures_util::Future; use parking_lot::Mutex; use std::any::TypeId; @@ -54,16 +55,21 @@ enum IndexSize { impl From for IndexSize { fn from(size: usize) -> Self { - if size <= u8::MAX as usize { - IndexSize::U8 - } else if size <= u16::MAX as usize { - IndexSize::U16 - } else if size <= u32::MAX as usize { - IndexSize::U32 - } else if size <= u64::MAX as usize { - IndexSize::U64 - } else { - IndexSize::Usize + match config().index_size { + IndexType::Dynamic => { + if size <= u8::MAX as usize { + IndexSize::U8 + } else if size <= u16::MAX as usize { + IndexSize::U16 + } else if size <= u32::MAX as usize { + IndexSize::U32 + } else if size <= u64::MAX as usize { + IndexSize::U64 + } else { + IndexSize::Usize + } + } + IndexType::Static => IndexSize::Usize, } } } @@ -160,6 +166,7 @@ impl UnsafeArray { .max() .unwrap(); let index_size = IndexSize::from(max_local_size); + println!("index_size: {:?}", index_size); let data_copied = Arc::new(AtomicBool::new(false)); let res: Pin)>> + Send>> = if v_len == 1 && i_len == 1 { @@ -387,10 +394,11 @@ impl UnsafeArray { index_size: IndexSize, data_copied: Arc, ) -> Pin)>> + Send>> { - let num_per_batch = match std::env::var("LAMELLAR_OP_BATCH") { - Ok(n) => n.parse::().unwrap(), - Err(_) => 10000, - }; + let num_per_batch = config().batch_op_size; + // let num_per_batch = match std::env::var("LAMELLAR_OP_BATCH") { + // Ok(n) => n.parse::().unwrap(), + // Err(_) => 10000, + // }; let num_pes = self.inner.data.team.num_pes(); let cnt = Arc::new(AtomicUsize::new(0)); let futures = Arc::new(Mutex::new(Vec::new())); @@ -517,10 +525,11 @@ impl UnsafeArray { _index_size: IndexSize, data_copied: Arc, ) -> Pin)>> + Send>> { - let num_per_batch = match std::env::var("LAMELLAR_OP_BATCH") { - Ok(n) => n.parse::().unwrap(), //+ 1 to account for main thread - Err(_) => 10000, //+ 1 to account for main thread - }; + let num_per_batch = config().batch_op_size; + // let num_per_batch = match 
std::env::var("LAMELLAR_OP_BATCH") { + // Ok(n) => n.parse::().unwrap(), //+ 1 to account for main thread + // Err(_) => 10000, //+ 1 to account for main thread + // }; // println!("multi_val_one_index"); // let num_pes = self.inner.data.team.num_pes(); let cnt = Arc::new(AtomicUsize::new(0)); @@ -604,10 +613,11 @@ impl UnsafeArray { index_size: IndexSize, data_copied: Arc, ) -> Pin)>> + Send>> { - let num_per_batch = match std::env::var("LAMELLAR_OP_BATCH") { - Ok(n) => n.parse::().unwrap(), //+ 1 to account for main thread - Err(_) => 10000, //+ 1 to account for main thread - }; + let num_per_batch = config().batch_op_size; + // let num_per_batch = match std::env::var("LAMELLAR_OP_BATCH") { + // Ok(n) => n.parse::().unwrap(), //+ 1 to account for main thread + // Err(_) => 10000, //+ 1 to account for main thread + // }; let bytes_per_batch = match index_size { IndexSize::U8 => num_per_batch * std::mem::size_of::>(), IndexSize::U16 => num_per_batch * std::mem::size_of::>(), diff --git a/src/barrier.rs b/src/barrier.rs index d9f3d7a9..cd11aef5 100644 --- a/src/barrier.rs +++ b/src/barrier.rs @@ -1,13 +1,13 @@ +use crate::env_var::config; use crate::lamellae::{AllocationType, Lamellae, LamellaeRDMA}; use crate::lamellar_arch::LamellarArchRT; use crate::memregion::MemoryRegion; use crate::scheduler::Scheduler; + use std::sync::atomic::{AtomicU8, AtomicUsize, Ordering}; use std::sync::Arc; use std::time::Instant; -const DISSEMINATION_FACTOR: usize = 2; - pub(crate) struct Barrier { my_pe: usize, // global pe id num_pes: usize, @@ -32,10 +32,8 @@ impl Barrier { panic: Arc, ) -> Barrier { let num_pes = arch.num_pes; - let mut n = std::env::var("LAMELLAR_BARRIER_DISSEMNATION_FACTOR") - .unwrap_or(DISSEMINATION_FACTOR.to_string()) - .parse::() - .unwrap(); + // let mut n = std::env::var("LAMELLAR_BARRIER_DISSEMNATION_FACTOR") + let mut n = config().barrier_dissemination_factor; let num_rounds = if n > 1 && num_pes > 2 { ((num_pes as f64).log2() / (n as f64).log2()).ceil() as usize } else { @@ -127,7 +125,7 @@ impl Barrier { recv_pe: usize, send_buf_slice: &[usize], ) { - if s.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { + if s.elapsed().as_secs_f64() > config().deadlock_timeout { println!("[LAMELLAR WARNING][{:?}] Potential deadlock detected.\n\ Barrier is a collective operation requiring all PEs associated with the distributed object to enter the barrier call.\n\ Please refer to https://docs.rs/lamellar/latest/lamellar/index.html?search=barrier for more information\n\ @@ -138,7 +136,7 @@ impl Barrier { To view backtrace set RUST_LIB_BACKTRACE=1\n\ {}", std::thread::current().id() - ,*crate::DEADLOCK_TIMEOUT,std::backtrace::Backtrace::capture()); + ,config().deadlock_timeout,std::backtrace::Backtrace::capture()); println!( "[{:?}][{:?}, {:?}] round: {:?} i: {:?} teamsend_pe: {:?} team_recv_pe: {:?} recv_pe: {:?} id: {:?} buf {:?}", @@ -274,8 +272,10 @@ impl Barrier { self.scheduler.exec_task(); }); } else { - if let Ok(val) = std::env::var("LAMELLAR_BARRIER_WARNING") { - if val != "0" && val != "false" && val != "no" && val != "off" { + if let Some(val) = config().barrier_warning { + // std::env::var("LAMELLAR_BARRIER_WARNING") { + // if val != "0" && val != "false" && val != "no" && val != "off" { + if val { println!("[LAMELLAR WARNING] You are calling barrier from within an async context, this is experimental and may result in deadlock! Using 'async_barrier().await;' is likely a better choice. 
Set LAMELLAR_BARRIER_WARNING=0 to disable this warning"); } } else { diff --git a/src/darc.rs b/src/darc.rs index d5ad86c4..2e681b0a 100644 --- a/src/darc.rs +++ b/src/darc.rs @@ -61,6 +61,7 @@ use std::sync::Arc; use crate::active_messaging::{AMCounters, AmHandle, RemotePtr}; use crate::barrier::Barrier; +use crate::env_var::config; use crate::lamellae::{AllocationType, Backend, LamellaeComm, LamellaeRDMA}; use crate::lamellar_team::{IntoLamellarTeam, LamellarTeamRT}; use crate::lamellar_world::LAMELLAES; @@ -625,7 +626,7 @@ impl DarcInner { if inner.local_cnt.load(Ordering::SeqCst) == 1 + extra_cnt { join_all(inner.send_finished()).await; } - if timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { + if timer.elapsed().as_secs_f64() > config().deadlock_timeout { let ref_cnts_slice = unsafe { std::slice::from_raw_parts_mut( inner.ref_cnt_addr as *mut usize, @@ -646,7 +647,7 @@ impl DarcInner { }, inner.local_cnt.load(Ordering::SeqCst), inner.dist_cnt.load(Ordering::SeqCst), - *crate::DEADLOCK_TIMEOUT, + config().deadlock_timeout, std::backtrace::Backtrace::capture() ); timer = std::time::Instant::now(); @@ -673,7 +674,7 @@ impl DarcInner { // while am_counters.outstanding_reqs.load(Ordering::SeqCst) > 0 { // // std::thread::yield_now(); // team.scheduler.exec_task(); //mmight as well do useful work while we wait - // if temp_now.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { + // if temp_now.elapsed().as_secs_f64() > config().deadlock_timeout { // //|| first{ // // println!( // // "[{:?}] in darc wait_all mype: {:?} cnt: {:?} {:?}", @@ -1407,7 +1408,7 @@ impl LamellarAM for DroppedWaitAM { join_all(wrapped.send_finished()).await; } - if timeout.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { + if timeout.elapsed().as_secs_f64() > config().deadlock_timeout { let ref_cnts_slice = unsafe { std::slice::from_raw_parts_mut( wrapped.ref_cnt_addr as *mut usize, @@ -1425,7 +1426,7 @@ impl LamellarAM for DroppedWaitAM { mode_refs, wrapped.local_cnt.load(Ordering::SeqCst), wrapped.dist_cnt.load(Ordering::SeqCst), - *crate::DEADLOCK_TIMEOUT, + config().deadlock_timeout, std::backtrace::Backtrace::capture() ); timeout = std::time::Instant::now(); @@ -1448,7 +1449,7 @@ impl LamellarAM for DroppedWaitAM { // wrapped.send_finished() join_all(wrapped.send_finished()).await; } - if timeout.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { + if timeout.elapsed().as_secs_f64() > config().deadlock_timeout { let ref_cnts_slice = std::slice::from_raw_parts_mut( wrapped.ref_cnt_addr as *mut usize, wrapped.num_pes, @@ -1464,7 +1465,7 @@ impl LamellarAM for DroppedWaitAM { mode_refs, wrapped.local_cnt.load(Ordering::SeqCst), wrapped.dist_cnt.load(Ordering::SeqCst), - *crate::DEADLOCK_TIMEOUT, + config().deadlock_timeout, std::backtrace::Backtrace::capture() ); timeout = std::time::Instant::now(); diff --git a/src/lamellae.rs b/src/lamellae.rs index 549af3ee..69ce6241 100755 --- a/src/lamellae.rs +++ b/src/lamellae.rs @@ -1,4 +1,5 @@ use crate::active_messaging::Msg; +use crate::config; use crate::lamellar_arch::LamellarArchRT; use crate::scheduler::Scheduler; use std::sync::Arc; @@ -56,12 +57,7 @@ pub(crate) enum AllocationType { impl Default for Backend { fn default() -> Self { - default_backend() - } -} -fn default_backend() -> Backend { - match std::env::var("LAMELLAE_BACKEND") { - Ok(p) => match p.as_str() { + match config().backend.as_str() { "rofi" => { #[cfg(feature = "enable-rofi")] return Backend::Rofi; @@ -74,15 +70,33 @@ fn default_backend() -> Backend { _ => { return Backend::Local; 
} - }, - Err(_) => { - #[cfg(feature = "enable-rofi")] - return Backend::Rofi; - #[cfg(not(feature = "enable-rofi"))] - return Backend::Local; } - }; + } } +// fn default_backend() -> Backend { +// match std::env::var("LAMELLAE_BACKEND") { +// Ok(p) => match p.as_str() { +// "rofi" => { +// #[cfg(feature = "enable-rofi")] +// return Backend::Rofi; +// #[cfg(not(feature = "enable-rofi"))] +// panic!("unable to set rofi backend, recompile with 'enable-rofi' feature") +// } +// "shmem" => { +// return Backend::Shmem; +// } +// _ => { +// return Backend::Local; +// } +// }, +// Err(_) => { +// #[cfg(feature = "enable-rofi")] +// return Backend::Rofi; +// #[cfg(not(feature = "enable-rofi"))] +// return Backend::Local; +// } +// }; +// } #[derive(serde::Serialize, serde::Deserialize, Clone, Debug, Default)] pub(crate) struct SerializeHeader { diff --git a/src/lamellae/command_queues.rs b/src/lamellae/command_queues.rs index acd1f3c6..52a294e8 100644 --- a/src/lamellae/command_queues.rs +++ b/src/lamellae/command_queues.rs @@ -1,3 +1,4 @@ +use crate::env_var::config; use crate::lamellae::comm::*; use crate::lamellae::{ Des, Lamellae, LamellaeComm, LamellaeRDMA, SerializedData, SerializedDataOps, @@ -5,7 +6,7 @@ use crate::lamellae::{ use crate::scheduler::Scheduler; use parking_lot::Mutex; -use thread_local::ThreadLocal; +// use thread_local::ThreadLocal; use std::collections::HashMap; use std::num::Wrapping; @@ -727,7 +728,7 @@ impl InnerCQ { //while we are waiting to push our data might as well try to advance the buffers self.progress_transfers(dst, &mut cmd_buffer); self.try_sending_buffer(dst, &mut cmd_buffer); - if timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { + if timer.elapsed().as_secs_f64() > config().deadlock_timeout { let send_buf = self.send_buffer.lock(); println!("waiting to add cmd to cmd buffer {:?}", cmd_buffer); println!("send_buf: {:?}", send_buf); @@ -772,7 +773,7 @@ impl InnerCQ { break; } } - if timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { + if timer.elapsed().as_secs_f64() > config().deadlock_timeout { println!("waiting to send cmd buffer {:?}", cmd_buffer); let send_buf = self.send_buffer.lock(); println!("send_buf addr {:?}", send_buf.as_ptr()); @@ -838,7 +839,7 @@ impl InnerCQ { while !alloc_buf[pe].check_hash() || alloc_buf[pe].cmd != Cmd::Alloc { self.comm.flush(); std::thread::yield_now(); - if start.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { + if start.elapsed().as_secs_f64() > config().deadlock_timeout { println!("waiting to alloc: {:?} {:?}", alloc_buf, alloc_id); start = std::time::Instant::now(); } @@ -942,7 +943,7 @@ impl InnerCQ { // self.put_amt.fetch_add(send_buf[dst].as_bytes().len(),Ordering::Relaxed); break; } - if timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { + if timer.elapsed().as_secs_f64() > config().deadlock_timeout { let send_buf = self.send_buffer.lock(); // println!("waiting to add cmd to cmd buffer {:?}",cmd_buffer); println!("send_buf: {:?}", send_buf); @@ -982,7 +983,7 @@ impl InnerCQ { && self.active.load(Ordering::SeqCst) != CmdQStatus::Panic as u8 { async_std::task::yield_now().await; - if timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { + if timer.elapsed().as_secs_f64() > config().deadlock_timeout { println!( "stuck waiting for data from {:?}!!! 
{:?} {:?} {:?} {:?} -- calced hash {:?}", src, @@ -1010,7 +1011,7 @@ impl InnerCQ { && self.active.load(Ordering::SeqCst) != CmdQStatus::Panic as u8 { async_std::task::yield_now().await; - if timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { + if timer.elapsed().as_secs_f64() > config().deadlock_timeout { println!( "stuck waiting for serialized data from {:?} !!! {:?} {:?} {:?} {:?}", src, @@ -1036,7 +1037,7 @@ impl InnerCQ { self.send_alloc(cmd.dsize); ser_data = self.comm.new_serialized_data(cmd.dsize as usize); // println!("cq 851 data {:?}",ser_data.is_ok()); - if timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT && ser_data.is_err() { + if timer.elapsed().as_secs_f64() > config().deadlock_timeout && ser_data.is_err() { println!( "get cmd stuck waiting for alloc {:?} {:?}", cmd.dsize, @@ -1082,7 +1083,7 @@ impl InnerCQ { .comm .rt_alloc(cmd.dsize as usize, std::mem::align_of::()); // println!("cq 874 data {:?}",data.is_ok()); - if timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { + if timer.elapsed().as_secs_f64() > config().deadlock_timeout { println!("get cmd buf stuck waiting for alloc"); timer = std::time::Instant::now(); } diff --git a/src/lamellae/rofi/rofi_comm.rs b/src/lamellae/rofi/rofi_comm.rs index 241531e6..ab2fc34c 100644 --- a/src/lamellae/rofi/rofi_comm.rs +++ b/src/lamellae/rofi/rofi_comm.rs @@ -1,3 +1,4 @@ +use crate::config; use crate::lamellae::comm::*; use crate::lamellae::command_queues::CommandQueue; use crate::lamellae::rofi::rofi_api::*; @@ -29,7 +30,7 @@ static ROFI_MAGIC_4: u32 = 0b10001111100100110010011111001100; static ROFI_MAGIC_2: u16 = 0b1100100110010011; static ROFI_MAGIC_1: u8 = 0b10011001; -static ROFI_MEM: AtomicUsize = AtomicUsize::new(4 * 1024 * 1024 * 1024); +pub(crate) static ROFI_MEM: AtomicUsize = AtomicUsize::new(4 * 1024 * 1024 * 1024); const RT_MEM: usize = 100 * 1024 * 1024; // we add this space for things like team barrier buffers, but will work towards having teams get memory from rofi allocs #[derive(Debug)] pub(crate) struct RofiComm { @@ -51,10 +52,11 @@ pub(crate) struct RofiComm { impl RofiComm { //#[tracing::instrument(skip_all)] pub(crate) fn new(provider: &str) -> RofiComm { - if let Ok(size) = std::env::var("LAMELLAR_MEM_SIZE") { - let size = size - .parse::() - .expect("invalid memory size, please supply size in bytes"); + if let Some(size) = config().heap_size { + // if let Ok(size) = std::env::var("LAMELLAR_MEM_SIZE") { + // let size = size + // .parse::() + // .expect("invalid memory size, please supply size in bytes"); ROFI_MEM.store(size, Ordering::SeqCst); } rofi_init(provider).expect("error in rofi init"); @@ -192,6 +194,9 @@ impl RofiComm { Ok(_) => {} } } + pub(crate) fn heap_size() -> usize { + ROFI_MEM.load(Ordering::SeqCst) + } } impl CommOps for RofiComm { diff --git a/src/lamellae/rofi_lamellae.rs b/src/lamellae/rofi_lamellae.rs index fdd33ec5..5f0bd012 100644 --- a/src/lamellae/rofi_lamellae.rs +++ b/src/lamellae/rofi_lamellae.rs @@ -1,3 +1,4 @@ +use crate::env_var::{config, HeapMode}; use crate::lamellae::comm::{AllocResult, CmdQStatus, CommOps}; use crate::lamellae::command_queues::CommandQueue; use crate::lamellae::rofi::rofi_comm::{RofiComm, RofiData}; @@ -273,6 +274,11 @@ impl LamellaeRDMA for Rofi { } fn alloc_pool(&self, min_size: usize) { // println!("trying to alloc pool {:?}",min_size); - self.cq.send_alloc(min_size); + match config().heap_mode { + HeapMode::Static => { + panic!("[LAMELLAR ERROR] Heap out of memory, current heap size is {} bytes,set LAMELLAR_HEAP_SIZE envrionment 
variable to increase size, or set LAMELLAR_HEAP_MODE=dynamic to enable exprimental growable heaps",RofiComm::heap_size()) + } + HeapMode::Dynamic => self.cq.send_alloc(min_size), + } } } diff --git a/src/lamellae/shmem/shmem_comm.rs b/src/lamellae/shmem/shmem_comm.rs index badd596a..0831fe72 100644 --- a/src/lamellae/shmem/shmem_comm.rs +++ b/src/lamellae/shmem/shmem_comm.rs @@ -1,3 +1,4 @@ +use crate::config; use crate::lamellae::comm::*; use crate::lamellae::command_queues::CommandQueue; use crate::lamellae::{ @@ -311,10 +312,11 @@ impl ShmemComm { Ok(val) => val.parse::().unwrap(), Err(_e) => 0, }; - if let Ok(size) = std::env::var("LAMELLAR_MEM_SIZE") { - let size = size - .parse::() - .expect("invalid memory size, please supply size in bytes"); + if let Some(size) = config().heap_size { + //std::env::var("LAMELLAR_MEM_SIZE") { + // let size = size + // .parse::() + // .expect("invalid memory size, please supply size in bytes"); SHMEM_SIZE.store(size, Ordering::SeqCst); } @@ -352,6 +354,10 @@ impl ShmemComm { shmem.alloc.write()[0].init(addr, mem_per_pe); shmem } + + pub(crate) fn heap_size() -> usize { + SHMEM_SIZE.load(Ordering::SeqCst) + } } impl CommOps for ShmemComm { diff --git a/src/lamellae/shmem_lamellae.rs b/src/lamellae/shmem_lamellae.rs index fdfb62ff..c5ae396e 100644 --- a/src/lamellae/shmem_lamellae.rs +++ b/src/lamellae/shmem_lamellae.rs @@ -1,3 +1,5 @@ +use crate::config; +use crate::env_var::HeapMode; use crate::lamellae::comm::{AllocResult, CmdQStatus, CommOps}; use crate::lamellae::command_queues::CommandQueue; use crate::lamellae::shmem::shmem_comm::*; @@ -261,6 +263,11 @@ impl LamellaeRDMA for Shmem { self.shmem_comm.num_pool_allocs() } fn alloc_pool(&self, min_size: usize) { - self.cq.send_alloc(min_size); + match config().heap_mode { + HeapMode::Static => { + panic!("[LAMELLAR ERROR] Heap out of memory, current heap size is {} bytes, set LAMELLAR_HEAP_SIZE envrionment variable to increase size, or set LAMELLAR_HEAP_MODE=dynamic to enable exprimental growable heaps",ShmemComm::heap_size()) + } + HeapMode::Dynamic => self.cq.send_alloc(min_size), + } } } diff --git a/src/lamellar_alloc.rs b/src/lamellar_alloc.rs index b093ec3c..1a6471ab 100644 --- a/src/lamellar_alloc.rs +++ b/src/lamellar_alloc.rs @@ -1,3 +1,5 @@ +use crate::env_var::config; + use core::marker::PhantomData; use indexmap::IndexSet; // use log::trace; @@ -274,11 +276,11 @@ impl LamellarAlloc for BTreeAlloc { let mut timer = std::time::Instant::now(); while let None = val { val = self.try_malloc(size, align); - if timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { + if timer.elapsed().as_secs_f64() > config().deadlock_timeout { println!("[WARNING] Potential deadlock detected when trying to allocate more memory.\n\ The deadlock timeout can be set via the LAMELLAR_DEADLOCK_TIMEOUT environment variable, the current timeout is {} seconds\n\ To view backtrace set RUST_LIB_BACKTRACE=1\n\ - {}",*crate::DEADLOCK_TIMEOUT,std::backtrace::Backtrace::capture()); + {}",config().deadlock_timeout,std::backtrace::Backtrace::capture()); timer = std::time::Instant::now(); } } diff --git a/src/lamellar_task_group.rs b/src/lamellar_task_group.rs index b718cabb..0218d914 100644 --- a/src/lamellar_task_group.rs +++ b/src/lamellar_task_group.rs @@ -1,5 +1,6 @@ use crate::active_messaging::registered_active_message::{AmId, AMS_EXECS, AMS_IDS, AM_ID_START}; use crate::active_messaging::*; +use crate::env_var::config; use crate::lamellae::Des; use crate::lamellar_arch::LamellarArchRT; use 
crate::lamellar_request::LamellarRequest; @@ -645,7 +646,7 @@ impl LamellarTaskGroup { while self.counters.outstanding_reqs.load(Ordering::SeqCst) > 0 { // self.team.flush(); self.team.scheduler.exec_task(); - if temp_now.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { + if temp_now.elapsed().as_secs_f64() > config().deadlock_timeout { println!( "in task group wait_all mype: {:?} cnt: {:?} {:?}", self.team.world_pe, @@ -666,7 +667,7 @@ impl LamellarTaskGroup { // self.team.flush(); // self.team.scheduler.exec_task(); async_std::task::yield_now().await; - if temp_now.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { + if temp_now.elapsed().as_secs_f64() > config().deadlock_timeout { println!( "in task group wait_all mype: {:?} cnt: {:?} {:?}", self.team.world_pe, diff --git a/src/lamellar_team.rs b/src/lamellar_team.rs index 145ba091..4195f134 100644 --- a/src/lamellar_team.rs +++ b/src/lamellar_team.rs @@ -1,6 +1,7 @@ use crate::active_messaging::handle::AmHandleInner; use crate::active_messaging::*; use crate::barrier::Barrier; +use crate::env_var::config; use crate::lamellae::{AllocationType, Lamellae, LamellaeComm, LamellaeRDMA}; use crate::lamellar_arch::{GlobalArch, IdError, LamellarArch, LamellarArchEnum, LamellarArchRT}; use crate::lamellar_env::LamellarEnv; @@ -1150,7 +1151,7 @@ impl LamellarTeamRT { while *hash_val == 0 { self.flush(); std::thread::yield_now(); - if s.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { + if s.elapsed().as_secs_f64() > config().deadlock_timeout { let status = hash_buf .as_slice() .expect("data should exist on pe") @@ -1163,7 +1164,7 @@ impl LamellarTeamRT { The following indicates which PEs have not entered the call: {:?}\n\ The deadlock timeout can be set via the LAMELLAR_DEADLOCK_TIMEOUT environment variable, the current timeout is {} seconds\n\ To view backtrace set RUST_LIB_BACKTRACE=1\n\ - {}",status,*crate::DEADLOCK_TIMEOUT,std::backtrace::Backtrace::capture() + {}",status,config().deadlock_timeout,std::backtrace::Backtrace::capture() ); // println!( // "[{:?}] ({:?}) hash: {:?}", @@ -1246,14 +1247,14 @@ impl LamellarTeamRT { for pe in self.dropped.as_slice().expect("data should exist on pe") { while *pe != 1 { // std::thread::yield_now(); - if s.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { + if s.elapsed().as_secs_f64() > config().deadlock_timeout { println!("[WARNING] Potential deadlock detected when trying to drop a LamellarTeam.\n\ The following indicates the dropped status on each PE: {:?}\n\ The deadlock timeout can be set via the LAMELLAR_DEADLOCK_TIMEOUT environment variable, the current timeout is {} seconds\n\ To view backtrace set RUST_LIB_BACKTRACE=1\n\ {}", self.dropped.as_slice(), - *crate::DEADLOCK_TIMEOUT, + config().deadlock_timeout, std::backtrace::Backtrace::capture()); s = Instant::now(); } @@ -1321,7 +1322,7 @@ impl LamellarTeamRT { // std::thread::yield_now(); // self.flush(); self.scheduler.exec_task(); //mmight as well do useful work while we wait - if temp_now.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { + if temp_now.elapsed().as_secs_f64() > config().deadlock_timeout { println!( "in team wait_all mype: {:?} cnt: {:?} {:?}", self.world_pe, diff --git a/src/lamellar_world.rs b/src/lamellar_world.rs index e552578c..b7097551 100644 --- a/src/lamellar_world.rs +++ b/src/lamellar_world.rs @@ -1,4 +1,3 @@ -use crate::active_messaging::*; use crate::lamellae::{create_lamellae, Backend, Lamellae, LamellaeComm, LamellaeInit}; use crate::lamellar_arch::LamellarArch; use 
crate::lamellar_env::LamellarEnv; @@ -7,6 +6,7 @@ use crate::memregion::{ one_sided::OneSidedMemoryRegion, shared::SharedMemoryRegion, Dist, RemoteMemoryRegion, }; use crate::scheduler::{create_scheduler, ExecutorType}; +use crate::{active_messaging::*, config}; // use log::trace; //use tracing::*; @@ -381,61 +381,76 @@ impl LamellarWorldBuilder { pub fn new() -> LamellarWorldBuilder { // simple_logger::init().unwrap(); // trace!("New world builder"); - let executor = match std::env::var("LAMELLAR_EXECUTOR") { - Ok(val) => { - let executor = val.parse::().unwrap(); - if executor == 0 { - ExecutorType::LamellarWorkStealing - } else if executor == 1 { - #[cfg(feature = "tokio-executor")] - { - ExecutorType::Tokio - } - #[cfg(not(feature = "tokio-executor"))] - { - println!("[LAMELLAR WARNING]: tokio-executor selected but it is not enabled, defaulting to lamellar work stealing executor"); - ExecutorType::LamellarWorkStealing - } - } else if executor == 2 { - ExecutorType::LamellarWorkStealing2 - } else if executor == 3 { - ExecutorType::LamellarWorkStealing3 - } else if executor == 4 { - ExecutorType::AsyncStd - } else { - println!("[LAMELLAR WARNING]: invalid executor selected defaulting to lamellar work stealing executor"); - ExecutorType::LamellarWorkStealing - } - } - Err(_) => { - #[cfg(feature = "tokio-executor")] - { - ExecutorType::Tokio - } + let executor = match config().executor.as_str(){ + "tokio" => { #[cfg(not(feature = "tokio-executor"))] { - ExecutorType::LamellarWorkStealing + panic!("[LAMELLAR WARNING]: tokio-executor selected but it is not enabled, either recompile lamellar with --features tokio-executor, or set LAMELLAR_EXECUTOR to one of 'lamellar' or 'async_std'"); } + ExecutorType::Tokio } + "async_std" => ExecutorType::AsyncStd, + "lamellar" => ExecutorType::LamellarWorkStealing, + "lamellar2" => ExecutorType::LamellarWorkStealing2, + "lamellar3" => ExecutorType::LamellarWorkStealing3, + _ => panic!("[LAMELLAR WARNING]: unexpected executor type, please set LAMELLAR_EXECUTOR to one of the following 'lamellar', 'async_std', or (if tokio-executor feature is enabled, 'tokio'.") }; + // let executor = match std::env::var("LAMELLAR_EXECUTOR") { + // Ok(val) => { + // let executor = val.parse::().unwrap(); + // if executor == 0 { + // ExecutorType::LamellarWorkStealing + // } else if executor == 1 { + // #[cfg(feature = "tokio-executor")] + // { + // ExecutorType::Tokio + // } + // #[cfg(not(feature = "tokio-executor"))] + // { + // println!("[LAMELLAR WARNING]: tokio-executor selected but it is not enabled, defaulting to lamellar work stealing executor"); + // ExecutorType::LamellarWorkStealing + // } + // } else if executor == 2 { + // ExecutorType::LamellarWorkStealing2 + // } else if executor == 3 { + // ExecutorType::LamellarWorkStealing3 + // } else if executor == 4 { + // ExecutorType::AsyncStd + // } else { + // println!("[LAMELLAR WARNING]: invalid executor selected defaulting to lamellar work stealing executor"); + // ExecutorType::LamellarWorkStealing + // } + // } + // Err(_) => { + // #[cfg(feature = "tokio-executor")] + // { + // ExecutorType::Tokio + // } + // #[cfg(not(feature = "tokio-executor"))] + // { + // ExecutorType::LamellarWorkStealing + // } + // } + // }; println!("executor: {:?}", executor); - let num_threads = match std::env::var("LAMELLAR_THREADS") { - Ok(n) => { - if let Ok(num_threads) = n.parse::() { - if num_threads == 0 { - panic!("LAMELLAR_THREADS must be greater than 0"); - } else if num_threads == 1 { - num_threads - } else { - 
num_threads - 1 - } - } else { - panic!("LAMELLAR_THREADS must be an integer greater than 0"); - } - } - Err(_) => 4, - }; + let num_threads = config().threads; + // let num_threads = match std::env::var("LAMELLAR_THREADS") { + // Ok(n) => { + // if let Ok(num_threads) = n.parse::() { + // if num_threads == 0 { + // panic!("LAMELLAR_THREADS must be greater than 0"); + // } else if num_threads == 1 { + // num_threads + // } else { + // num_threads - 1 + // } + // } else { + // panic!("LAMELLAR_THREADS must be an integer greater than 0"); + // } + // } + // Err(_) => 4, + // }; LamellarWorldBuilder { primary_lamellae: Default::default(), // secondary_lamellae: HashSet::new(), diff --git a/src/lib.rs b/src/lib.rs index 088c1758..bd840db1 100755 --- a/src/lib.rs +++ b/src/lib.rs @@ -282,6 +282,9 @@ mod utils; #[doc(hidden)] pub use utils::*; +mod env_var; +pub use env_var::config; + pub use crate::lamellae::Backend; pub use crate::lamellar_arch::{BlockedArch, IdError, LamellarArch, StridedArch}; #[doc(hidden)] @@ -317,13 +320,6 @@ pub use custom_derive; #[doc(hidden)] pub use newtype_derive; -lazy_static! { - pub(crate) static ref DEADLOCK_TIMEOUT: f64 = std::env::var("LAMELLAR_DEADLOCK_TIMEOUT") - .unwrap_or("600".to_string()) - .parse::() - .unwrap_or(600) as f64; -} - lazy_static! { pub(crate) static ref BINCODE: bincode::config::WithOtherTrailing = bincode::DefaultOptions::new().allow_trailing_bytes(); diff --git a/src/scheduler.rs b/src/scheduler.rs index d74c2bdb..299e2797 100755 --- a/src/scheduler.rs +++ b/src/scheduler.rs @@ -3,6 +3,7 @@ use crate::active_messaging::batching::team_am_batcher::TeamAmBatcher; use crate::active_messaging::batching::BatcherType; use crate::active_messaging::registered_active_message::RegisteredActiveMessages; use crate::active_messaging::*; +use crate::env_var::config; use crate::lamellae::{Des, Lamellae, SerializedData}; use enum_dispatch::enum_dispatch; @@ -406,7 +407,7 @@ impl Scheduler { //TODO maybe this should be > 2 { //the Lamellae Comm Task, Lamellae Alloc Task, Lamellar Error Task - if timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT { + if timer.elapsed().as_secs_f64() > config().deadlock_timeout { println!( "shurtdown timeout, tasks remaining: {:?} panic: {:?}", self.num_tasks.load(Ordering::Relaxed), @@ -464,29 +465,43 @@ pub(crate) fn create_scheduler( ExecutorType::Tokio => TokioRt::new(num_workers).into(), }); - let batcher = match std::env::var("LAMELLAR_BATCHER") { - Ok(n) => { - let n = n.parse::().unwrap(); - if n == 1 { - BatcherType::Simple(SimpleBatcher::new( - num_pes, - am_stall_mark.clone(), - executor.clone(), - )) - } else { - BatcherType::TeamAm(TeamAmBatcher::new( - num_pes, - am_stall_mark.clone(), - executor.clone(), - )) - } - } - Err(_) => BatcherType::TeamAm(TeamAmBatcher::new( + let batcher = match config().batcher.as_str() { + "simple" => BatcherType::Simple(SimpleBatcher::new( + num_pes, + am_stall_mark.clone(), + executor.clone(), + )), + "team_am" => BatcherType::TeamAm(TeamAmBatcher::new( num_pes, am_stall_mark.clone(), executor.clone(), )), + _ => panic!("[LAMELLAR ERROR] unexpected batcher type please set LAMELLAR_BATCHER to one of 'simple' or 'team_am'") }; + + // let batcher = match std::env::var("LAMELLAR_BATCHER") { + // Ok(n) => { + // let n = n.parse::().unwrap(); + // if n == 1 { + // BatcherType::Simple(SimpleBatcher::new( + // num_pes, + // am_stall_mark.clone(), + // executor.clone(), + // )) + // } else { + // BatcherType::TeamAm(TeamAmBatcher::new( + // num_pes, + // am_stall_mark.clone(), + 
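A minimal usage sketch of the string-based configuration introduced above, assuming only the LAMELLAR_* variable names that appear in this patch (LAMELLAR_EXECUTOR, LAMELLAR_BATCHER, LAMELLAR_DEADLOCK_TIMEOUT); the surrounding main() is illustrative:

// config() is populated once (via envy, using the "LAMELLAR_" prefix), so the
// variables must be set before the first call that touches the runtime configuration.
fn main() {
    std::env::set_var("LAMELLAR_EXECUTOR", "tokio"); // "lamellar", "async_std", or "tokio" (needs the tokio-executor feature)
    std::env::set_var("LAMELLAR_BATCHER", "team_am"); // "simple" or "team_am"
    std::env::set_var("LAMELLAR_DEADLOCK_TIMEOUT", "300"); // seconds; the default is 600
    let world = lamellar::LamellarWorldBuilder::new().build();
    println!("PE {} of {}", world.my_pe(), world.num_pes());
}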
// executor.clone(), + // )) + // } + // } + // Err(_) => BatcherType::TeamAm(TeamAmBatcher::new( + // num_pes, + // am_stall_mark.clone(), + // executor.clone(), + // )), + // }; Scheduler::new( executor.clone(), RegisteredActiveMessages::new(batcher, executor), diff --git a/src/scheduler/async_std_executor.rs b/src/scheduler/async_std_executor.rs index 1de78415..e6104393 100644 --- a/src/scheduler/async_std_executor.rs +++ b/src/scheduler/async_std_executor.rs @@ -79,7 +79,7 @@ impl AsyncStdRt { .with_thread_name_fn(Box::new(|| "lamellar_worker".to_string())), ); Self { - max_num_threads: num_workers + 1, + max_num_threads: num_workers, } } } diff --git a/src/scheduler/tokio_executor.rs b/src/scheduler/tokio_executor.rs index de5f3b86..a6edad09 100644 --- a/src/scheduler/tokio_executor.rs +++ b/src/scheduler/tokio_executor.rs @@ -74,9 +74,9 @@ impl TokioRt { pub(crate) fn new(num_workers: usize) -> TokioRt { // println!("New TokioRT with {} workers", num_workers); TokioRt { - max_num_threads: num_workers + 1, //LAMELLAR_THREADS = num_workers + 1, so for tokio runtime, we actually want num_workers + 1 worker threads as block_on will not do anywork on the main thread (i think)... + max_num_threads: num_workers, //LAMELLAR_THREADS = num_workers + 1, so for tokio runtime, we actually want num_workers + 1 worker threads as block_on will not do anywork on the main thread (i think)... rt: tokio::runtime::Builder::new_multi_thread() - .worker_threads(num_workers + 1) + .worker_threads(num_workers) .enable_all() .build() .unwrap(), diff --git a/src/scheduler/work_stealing.rs b/src/scheduler/work_stealing.rs index 8a078558..7bc1216a 100644 --- a/src/scheduler/work_stealing.rs +++ b/src/scheduler/work_stealing.rs @@ -1,3 +1,4 @@ +use crate::env_var::config; use crate::scheduler::{LamellarExecutor, SchedulerStatus}; //use tracing::*; @@ -81,7 +82,7 @@ impl WorkStealingThread { if let Some(runnable) = omsg { if worker.status.load(Ordering::SeqCst) == SchedulerStatus::Finished as u8 - && timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT + && timer.elapsed().as_secs_f64() > config().deadlock_timeout { println!("runnable {:?}", runnable); println!( @@ -95,7 +96,7 @@ impl WorkStealingThread { runnable.run(); } if worker.status.load(Ordering::SeqCst) == SchedulerStatus::Finished as u8 - && timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT + && timer.elapsed().as_secs_f64() > config().deadlock_timeout && (worker.work_q.len() > 0 || worker.work_inj.len() > 0) { println!( @@ -282,7 +283,7 @@ impl WorkStealing { ) -> WorkStealing { // println!("new work stealing queue"); let mut ws = WorkStealing { - max_num_threads: num_workers, + max_num_threads: std::cmp::max(1,num_workers-1),// the main thread does work during blocking_ons and wait_alls threads: Vec::new(), imm_inj: Arc::new(crossbeam::deque::Injector::new()), work_inj: Arc::new(crossbeam::deque::Injector::new()), diff --git a/src/scheduler/work_stealing2.rs b/src/scheduler/work_stealing2.rs index 33115003..64e958d0 100644 --- a/src/scheduler/work_stealing2.rs +++ b/src/scheduler/work_stealing2.rs @@ -1,3 +1,4 @@ +use crate::env_var::config; use crate::scheduler::{LamellarExecutor, SchedulerStatus}; use crate::MAIN_THREAD; @@ -109,7 +110,7 @@ impl WorkStealingThread { if let Some(runnable) = omsg { if worker.status.load(Ordering::SeqCst) == SchedulerStatus::Finished as u8 - && timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT + && timer.elapsed().as_secs_f64() > config().deadlock_timeout { println!("runnable {:?}", runnable); 
println!( @@ -123,7 +124,7 @@ impl WorkStealingThread { runnable.run(); } if worker.status.load(Ordering::SeqCst) == SchedulerStatus::Finished as u8 - && timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT + && timer.elapsed().as_secs_f64() > config().deadlock_timeout && !worker.group_queue.is_empty() { println!( @@ -170,7 +171,7 @@ impl IoThread { .or_else(|| worker.io_inj.steal_batch_and_pop(&worker.io_q).success()); if let Some(runnable) = io_task { if worker.status.load(Ordering::SeqCst) == SchedulerStatus::Finished as u8 - && timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT + && timer.elapsed().as_secs_f64() > config().deadlock_timeout { println!( "io_q size {:?} io inj size {:?} ", // num_tasks {:?}", @@ -184,7 +185,7 @@ impl IoThread { } if worker.status.load(Ordering::SeqCst) == SchedulerStatus::Finished as u8 - && timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT + && timer.elapsed().as_secs_f64() > config().deadlock_timeout && (worker.io_q.len() > 0 || worker.io_inj.len() > 0) { println!( @@ -366,6 +367,7 @@ impl WorkStealing2 { panic: Arc, ) -> WorkStealing2 { // println!("new work stealing queue"); + let num_workers = std::cmp::max(1,num_workers-1); let mut num_threads_per_group = match std::env::var("LAMELLAR_WS2_THREADS") { Ok(s) => { if let Ok(num) = s.parse::() { @@ -376,8 +378,8 @@ impl WorkStealing2 { } _ => 4, }; - if num_threads_per_group > num_workers { - num_threads_per_group = num_workers + if num_threads_per_group > num_workers { + num_threads_per_group = num_workers } let mut ws = WorkStealing2 { diff --git a/src/scheduler/work_stealing3.rs b/src/scheduler/work_stealing3.rs index 8de0f992..68e56a8f 100644 --- a/src/scheduler/work_stealing3.rs +++ b/src/scheduler/work_stealing3.rs @@ -1,3 +1,4 @@ +use crate::env_var::config; use crate::scheduler::{LamellarExecutor, SchedulerStatus}; use crate::MAIN_THREAD; @@ -93,7 +94,7 @@ impl WorkStealingThread { if let Some(runnable) = omsg { if worker.status.load(Ordering::SeqCst) == SchedulerStatus::Finished as u8 - && timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT + && timer.elapsed().as_secs_f64() > config().deadlock_timeout { println!("runnable {:?}", runnable); println!( @@ -107,7 +108,7 @@ impl WorkStealingThread { runnable.run(); } if worker.status.load(Ordering::SeqCst) == SchedulerStatus::Finished as u8 - && timer.elapsed().as_secs_f64() > *crate::DEADLOCK_TIMEOUT + && timer.elapsed().as_secs_f64() > config().deadlock_timeout && (work_q.len() > 0 || worker.work_inj.len() > 0) { println!( @@ -307,7 +308,7 @@ impl WorkStealing3 { ) -> WorkStealing3 { // println!("new work stealing queue"); let mut ws = WorkStealing3 { - max_num_threads: num_workers, + max_num_threads: std::cmp::max(1,num_workers-1), threads: Vec::new(), imm_inj: Arc::new(Injector::new()), work_inj: Arc::new(Injector::new()), From d3d2e21b610fe02dffd3525d67eea570fc9108aa Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Thu, 11 Apr 2024 11:32:31 -0700 Subject: [PATCH 019/116] convert unwraps to expects for better error messages --- impl/src/array_reduce.rs | 7 +------ impl/src/gen_am.rs | 16 ++++++++-------- impl/src/gen_am_group.rs | 6 +++--- src/barrier.rs | 2 ++ src/darc.rs | 23 +++++++++++++++-------- src/lamellae/command_queues.rs | 10 ++++++++-- src/lamellae/rofi/rofi_api.rs | 4 ++-- src/lamellar_task_group.rs | 12 +++++++++--- src/lamellar_world.rs | 1 + 9 files changed, 49 insertions(+), 32 deletions(-) diff --git a/impl/src/array_reduce.rs b/impl/src/array_reduce.rs index aecb6aa7..0bae8a5a 100644 --- a/impl/src/array_reduce.rs +++ b/impl/src/array_reduce.rs @@ -85,12 +85,9 @@ fn create_reduction( let timer = std::time::Instant::now(); #[allow(unused_unsafe)] let data_slice = unsafe { #data_slice}; - // let first = data_slice.first().unwrap().clone(); - // let res = data_slice[1..].iter().fold(first, #op ); - let res = data_slice.iter()#iter_chain.reduce(#op).unwrap(); + let res = data_slice.iter()#iter_chain.reduce(#op).expect("length of slice should be greater than 0"); // println!("[{:?}] {:?} {:?}",__lamellar_current_pe,res,timer.elapsed().as_secs_f64()); res - // data_slice.first().unwrap().clone() } else{ // println!("[{:?}] recurse {:?} {:?}",__lamellar_current_pe,self.start_pe, self.end_pe); @@ -103,8 +100,6 @@ fn create_reduction( // println!("[{:?}] {:?} {:?}",__lamellar_current_pe,res,timer.elapsed().as_secs_f64()); res - // let data_slice = unsafe {self.data.local_data()}; - // data_slice.first().unwrap().clone() } } } diff --git a/impl/src/gen_am.rs b/impl/src/gen_am.rs index ee8edfdc..088aef1a 100644 --- a/impl/src/gen_am.rs +++ b/impl/src/gen_am.rs @@ -72,10 +72,10 @@ pub(crate) fn impl_lamellar_serde_trait( #lamellar::serialized_size(self,true) } fn serialize_into(&self,buf: &mut [u8]){ - #lamellar::serialize_into(buf,self,true).unwrap(); + #lamellar::serialize_into(buf,self,true).expect("can serialize and enough space in buf"); } fn serialize(&self)->Vec{ - #lamellar::serialize(self,true).unwrap() + #lamellar::serialize(self,true).expect("can serialize") } } } @@ -93,10 +93,10 @@ fn impl_return_lamellar_serde_trait( #lamellar::serialized_size(&self.val,true) } fn serialize_into(&self,buf: &mut [u8]){ - #lamellar::serialize_into(buf,&self.val,true).unwrap(); + #lamellar::serialize_into(buf,&self.val,true).expect("can serialize and enough space in buf"); } fn serialize(&self)->Vec{ - #lamellar::serialize(self,true).unwrap() + #lamellar::serialize(self,true).expect("can serialize") } } } @@ -112,12 +112,12 @@ pub(crate) fn impl_lamellar_result_serde_trait( quote! { impl #impl_generics #lamellar::active_messaging::LamellarResultSerde for #am_name #ty_generics #where_clause { fn serialized_result_size(&self,result: & Box)->usize{ - let result = result.downcast_ref::<#ret_type>().unwrap(); + let result = result.downcast_ref::<#ret_type>().expect("can downcast result box"); #lamellar::serialized_size(result,true) } fn serialize_result_into(&self,buf: &mut [u8],result: & Box){ - let result = result.downcast_ref::<#ret_type>().unwrap(); - #lamellar::serialize_into(buf,result,true).unwrap(); + let result = result.downcast_ref::<#ret_type>().expect("can downcast result box"); + #lamellar::serialize_into(buf,result,true).expect("can serialize and enough size in buf"); } } } @@ -157,7 +157,7 @@ fn impl_unpack_and_register_function( let am_name_unpack = quote::format_ident!("{}_unpack", am_name.clone()); quote! 
{ fn #am_name_unpack #impl_generics (bytes: &[u8], cur_pe: Result) -> std::sync::Arc { - let __lamellar_data: std::sync::Arc<#am_name #ty_generics> = std::sync::Arc::new(#lamellar::deserialize(&bytes,true).unwrap()); + let __lamellar_data: std::sync::Arc<#am_name #ty_generics> = std::sync::Arc::new(#lamellar::deserialize(&bytes,true).expect("can deserialize into remote active message")); <#am_name #ty_generics as #lamellar::active_messaging::DarcSerde>::des(&__lamellar_data,cur_pe); __lamellar_data } diff --git a/impl/src/gen_am_group.rs b/impl/src/gen_am_group.rs index d2bf9bf4..c8f6f89f 100644 --- a/impl/src/gen_am_group.rs +++ b/impl/src/gen_am_group.rs @@ -156,8 +156,8 @@ fn gen_am_group_return_stmt( }, quote! { match __local{ - true => #lamellar::active_messaging::LamellarReturn::LocalAm(std::sync::Arc::new (__am_group.unwrap())), - false => #lamellar::active_messaging::LamellarReturn::RemoteAm(std::sync::Arc::new (__am_group.unwrap())), + true => #lamellar::active_messaging::LamellarReturn::LocalAm(std::sync::Arc::new (__am_group.expect("am group should exsit"))), + false => #lamellar::active_messaging::LamellarReturn::RemoteAm(std::sync::Arc::new (__am_group.expect("am group should exsit"))), } }, ) @@ -176,7 +176,7 @@ fn gen_am_group_return_stmt( } }, quote! { - #lamellar::active_messaging::LamellarReturn::LocalAm(std::sync::Arc::new (__am_group.unwrap())) + #lamellar::active_messaging::LamellarReturn::LocalAm(std::sync::Arc::new (__am_group.expect("am group should exsit"))) }, ) } diff --git a/src/barrier.rs b/src/barrier.rs index cd11aef5..3827f562 100644 --- a/src/barrier.rs +++ b/src/barrier.rs @@ -204,6 +204,7 @@ impl Barrier { // .expect("Data should exist on PE") // } // ); + println!("barrier put_slice 1"); unsafe { self.barrier_buf[i - 1].put_slice( send_pe, @@ -335,6 +336,7 @@ impl Barrier { // .expect("Data should exist on PE") // } // ); + println!("barrier put_slice 2"); unsafe { self.barrier_buf[i - 1].put_slice( send_pe, diff --git a/src/darc.rs b/src/darc.rs index 2e681b0a..119c6029 100644 --- a/src/darc.rs +++ b/src/darc.rs @@ -447,16 +447,20 @@ impl DarcInner { // let rel_addr = inner.inner.as_ptr() as *const _ as usize - team.lamellae.base_addr(); - // println!( - // "[{:?}] entering initial block_on barrier()", - // std::thread::current().id() - // ); + while inner.local_cnt.load(Ordering::SeqCst) > 1 + extra_cnt { + async_std::task::yield_now().await; + } + + println!( + "[{:?}] entering initial block_on barrier()", + std::thread::current().id() + ); let barrier_fut = unsafe { inner.barrier.as_ref().unwrap().async_barrier() }; barrier_fut.await; - // println!( - // "[{:?}] leaving initial block_on barrier()", - // std::thread::current().id() - // ); + println!( + "[{:?}] leaving initial block_on barrier()", + std::thread::current().id() + ); while outstanding_refs { outstanding_refs = false; @@ -496,6 +500,7 @@ impl DarcInner { // inner.my_pe * std::mem::size_of::(), // inner.mode_ref_cnt_addr + inner.my_pe * std::mem::size_of::() // ); + println!("darc block_on_outstanding put 1"); rdma.put( send_pe, ref_cnt_u8, @@ -565,6 +570,7 @@ impl DarcInner { std::mem::size_of::(), ) }; + println!("darc block_on_outstanding put 2"); rdma.put( send_pe, barrier_id_slice, @@ -614,6 +620,7 @@ impl DarcInner { }; let rdma = &team.lamellae; for pe in team.arch.team_iter() { + println!("darc block_on_outstanding put 3"); rdma.put( pe, &mode_refs[inner.my_pe..=inner.my_pe], diff --git a/src/lamellae/command_queues.rs b/src/lamellae/command_queues.rs index 52a294e8..044334ee 
100644 --- a/src/lamellae/command_queues.rs +++ b/src/lamellae/command_queues.rs @@ -675,6 +675,7 @@ impl InnerCQ { let recv_buffer = self.recv_buffer.lock(); // println! {"sending data to dst {:?} {:?} {:?} {:?}",recv_buffer[self.my_pe].as_addr()-self.comm.base_addr(),send_buf[dst],send_buf[dst].as_bytes(),send_buf}; // println!("sending cmd {:?}", send_buf); + println!("Command Queue sending buffer"); self.comm.put( dst, send_buf[dst].as_bytes(), @@ -829,7 +830,7 @@ impl InnerCQ { cmd.calc_hash(); for pe in 0..self.num_pes { if pe != self.my_pe { - // println!("putting alloc cmd to pe {:?}", pe); + println!("putting alloc cmd to pe {:?}", pe); self.comm.put(pe, cmd.as_bytes(), cmd.as_addr()); } } @@ -857,7 +858,7 @@ impl InnerCQ { cmd.calc_hash(); for pe in 0..self.num_pes { if pe != self.my_pe { - // println!("putting clear cmd to pe {:?}", pe); + println!("putting clear cmd to pe {:?}", pe); self.comm.put(pe, cmd.as_bytes(), cmd.as_addr()); } } @@ -900,6 +901,7 @@ impl InnerCQ { // let cmd_buffer = self.cmd_buffers[dst].lock(); // println!("sending release: {:?} cmd: {:?} {:?} {:?} 0x{:x} 0x{:x}",self.release_cmd,cmd,self.release_cmd.cmd_as_bytes(), cmd.cmd_as_bytes(),self.release_cmd.cmd_as_addr(),cmd.daddr + offset_of!(CmdMsg,cmd)); let local_daddr = self.comm.local_addr(dst, cmd.daddr); + println!("sending release to {dst}"); self.comm.put( dst, self.release_cmd.cmd_as_bytes(), @@ -912,6 +914,7 @@ impl InnerCQ { // let cmd_buffer = self.cmd_buffers[dst].lock(); // println!("sending release: {:?} cmd: {:?} {:?} {:?} 0x{:x} 0x{:x}",self.release_cmd,cmd,self.release_cmd.cmd_as_bytes(), cmd.cmd_as_bytes(),self.release_cmd.cmd_as_addr(),cmd.daddr + offset_of!(CmdMsg,cmd)); let local_daddr = self.comm.local_addr(dst, cmd.daddr); + println!("sending free to {dst}"); self.comm.put( dst, self.free_cmd.cmd_as_bytes(), @@ -935,6 +938,7 @@ impl InnerCQ { // println!("sending print {:?} (s: {:?} r: {:?})",addr,self.sent_cnt.load(Ordering::SeqCst),self.recv_cnt.load(Ordering::SeqCst)); let recv_buffer = self.recv_buffer.lock(); // println!("sending cmd {:?}",send_buf); + println!("sending print to {dst}"); self.comm.put( dst, send_buf[dst].as_bytes(), @@ -976,6 +980,7 @@ impl InnerCQ { //#[tracing::instrument(skip_all)] async fn get_data(&self, src: usize, cmd: CmdMsg, data_slice: &mut [u8]) { let local_daddr = self.comm.local_addr(src, cmd.daddr); + println!("command queue getting data from {src}"); self.comm.iget(src, local_daddr as usize, data_slice); // self.get_amt.fetch_add(data_slice.len(),Ordering::Relaxed); let mut timer = std::time::Instant::now(); @@ -1004,6 +1009,7 @@ impl InnerCQ { async fn get_serialized_data(&self, src: usize, cmd: CmdMsg, ser_data: &SerializedData) { let data_slice = ser_data.header_and_data_as_bytes(); let local_daddr = self.comm.local_addr(src, cmd.daddr); + println!("command queue getting serialized data from {src}"); self.comm.iget(src, local_daddr as usize, data_slice); // self.get_amt.fetch_add(data_slice.len(),Ordering::Relaxed); let mut timer = std::time::Instant::now(); diff --git a/src/lamellae/rofi/rofi_api.rs b/src/lamellae/rofi/rofi_api.rs index 3e19e632..4fb245cd 100644 --- a/src/lamellae/rofi/rofi_api.rs +++ b/src/lamellae/rofi/rofi_api.rs @@ -5,8 +5,8 @@ use std::ffi::CString; use std::os::raw::c_ulong; pub(crate) fn rofi_init(provider: &str) -> Result<(), &'static str> { - let c_str = CString::new(provider).unwrap(); - let retval = unsafe { rofisys::rofi_init(c_str.as_ptr() as *mut _) as i32 }; + let prov_str = CString::new(provider).unwrap(); + 
let retval = unsafe { rofisys::rofi_init(prov_str.as_ptr() as *mut _, 0 as *mut _) as i32 }; if retval == 0 { Ok(()) } else { diff --git a/src/lamellar_task_group.rs b/src/lamellar_task_group.rs index 0218d914..18bec1d8 100644 --- a/src/lamellar_task_group.rs +++ b/src/lamellar_task_group.rs @@ -300,7 +300,9 @@ impl LamellarRequest for TaskGroupMultiAmHandle { .expect("req sub id should exist"); let mut res = Vec::new(); for pe in 0..sub_id_map.len() { - res.push(self.process_result(sub_id_map.remove(&pe).unwrap())); + res.push( + self.process_result(sub_id_map.remove(&pe).expect("pe id should exist still")), + ); } res } @@ -336,7 +338,9 @@ impl LamellarRequest for TaskGroupMultiAmHandle { .expect("req sub id should exist"); let mut res = Vec::new(); for pe in 0..sub_id_map.len() { - res.push(self.process_result(sub_id_map.remove(&pe).unwrap())); + res.push( + self.process_result(sub_id_map.remove(&pe).expect("pe id should exist still")), + ); } res } @@ -355,7 +359,9 @@ impl Future for TaskGroupMultiAmHandle { .expect("req sub id should exist"); let mut res = Vec::new(); for pe in 0..sub_id_map.len() { - res.push(this.process_result(sub_id_map.remove(&pe).unwrap())); + res.push( + this.process_result(sub_id_map.remove(&pe).expect("pe id should exist still")), + ); } Poll::Ready(res) } else { diff --git a/src/lamellar_world.rs b/src/lamellar_world.rs index b7097551..6a1e9f0c 100644 --- a/src/lamellar_world.rs +++ b/src/lamellar_world.rs @@ -387,6 +387,7 @@ impl LamellarWorldBuilder { { panic!("[LAMELLAR WARNING]: tokio-executor selected but it is not enabled, either recompile lamellar with --features tokio-executor, or set LAMELLAR_EXECUTOR to one of 'lamellar' or 'async_std'"); } + #[cfg(feature = "tokio-executor")] ExecutorType::Tokio } "async_std" => ExecutorType::AsyncStd, From 587023cd569ca01721451c854cdb38497e7f723b Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Thu, 11 Apr 2024 13:32:59 -0700 Subject: [PATCH 020/116] add missing file --- src/env_var.rs | 108 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 src/env_var.rs diff --git a/src/env_var.rs b/src/env_var.rs new file mode 100644 index 00000000..86eaf908 --- /dev/null +++ b/src/env_var.rs @@ -0,0 +1,108 @@ +use std::sync::OnceLock; + +use serde::Deserialize; + +fn default_deadlock_timeout() -> f64 { + 600.0 +} + +fn default_op_batch() -> usize { + 10000 +} + +fn default_dissemination_factor() -> usize { + 2 +} + +fn default_backend() -> String { + #[cfg(feature = "enable-rofi")] + return "rofi".to_owned(); + #[cfg(not(feature = "enable-rofi"))] + return "local".to_owned(); +} + +fn default_executor() -> String { + #[cfg(feature = "tokio-executor")] + return "tokio".to_owned(); + #[cfg(not(feature = "tokio-executor"))] + return "lamellar".to_owned(); +} + +fn default_batcher() -> String { + "simple".to_owned() +} + +fn default_threads() -> usize { + match std::thread::available_parallelism() { + Ok(n) => n.into(), + Err(_) => 4, + } +} + +#[derive(Deserialize, Debug, PartialEq)] +#[serde(rename_all = "lowercase")] +pub enum HeapMode { + Static, + Dynamic, +} + +fn default_heap_mode() -> HeapMode { + HeapMode::Static +} + +#[derive(Deserialize, Debug, PartialEq)] +#[serde(rename_all = "lowercase")] +pub enum Alloc { + Heap, + Lamellae, +} + +fn default_alloc() -> Alloc { + Alloc::Heap +} + +#[derive(Deserialize, Debug, PartialEq)] +#[serde(rename_all = "lowercase")] +pub enum IndexType { + Static, + Dynamic, +} +fn default_array_dynamic_index() -> IndexType { + IndexType::Dynamic +} + +#[derive(Deserialize, Debug)] +pub struct Config { + #[serde(default = "default_deadlock_timeout")] + pub deadlock_timeout: f64, + #[serde(default = "default_op_batch")] + pub batch_op_size: usize, + #[serde(default = "default_dissemination_factor")] + pub barrier_dissemination_factor: usize, + // #[serde(default=true)] + pub barrier_warning: Option, + #[serde(default = "default_backend")] + pub backend: String, //rofi,shmem,local + #[serde(default = "default_executor")] + pub executor: String, //lamellar,tokio,async_std + #[serde(default = "default_batcher")] + pub batcher: String, + #[serde(default = "default_threads")] + pub threads: usize, + pub batch_op_threads: Option, + pub heap_size: Option, + #[serde(default = "default_heap_mode")] + pub heap_mode: HeapMode, + #[serde(default = "default_alloc")] + pub alloc: Alloc, + #[serde(default = "default_array_dynamic_index")] + pub index_size: IndexType, +} + +pub fn config() -> &'static Config { + static CONFIG: OnceLock = OnceLock::new(); + CONFIG.get_or_init(|| match envy::prefixed("LAMELLAR_").from_env::() { + Ok(config) => config, + Err(error) => panic!("{}", error), + }) +} From e7cdc54e7705c972e737803b8053e8fde715ba15 Mon Sep 17 00:00:00 2001 From: "ryan.friese@pnnl.gov" Date: Mon, 15 Apr 2024 12:25:14 -0700 Subject: [PATCH 021/116] return option for array reduce ops --- impl/src/array_reduce.rs | 15 +++++++++++--- src/array/atomic.rs | 10 +++++----- src/array/generic_atomic.rs | 10 +++++----- src/array/global_lock_atomic.rs | 4 ++-- .../distributed_iterator/consumer/sum.rs | 14 +++++++++++-- src/array/local_lock_atomic.rs | 4 ++-- src/array/native_atomic.rs | 10 +++++----- src/array/read_only.rs | 10 +++++----- src/array/unsafe.rs | 20 +++++++++++-------- 9 files changed, 60 insertions(+), 37 deletions(-) diff --git a/impl/src/array_reduce.rs b/impl/src/array_reduce.rs index 
0bae8a5a..cc76bfe6 100644 --- a/impl/src/array_reduce.rs +++ b/impl/src/array_reduce.rs @@ -78,14 +78,14 @@ fn create_reduction( #[#am] impl LamellarAM for #reduction_name{ - async fn exec(&self) -> #typeident{ + async fn exec(&self) -> Option<#typeident>{ // println!("{}",stringify!(#array_type)); if self.start_pe == self.end_pe{ // println!("[{:?}] root {:?} {:?}",__lamellar_current_pe,self.start_pe, self.end_pe); let timer = std::time::Instant::now(); #[allow(unused_unsafe)] let data_slice = unsafe { #data_slice}; - let res = data_slice.iter()#iter_chain.reduce(#op).expect("length of slice should be greater than 0"); + let res = data_slice.iter()#iter_chain.reduce(#op);//s.expect("length of slice should be greater than 0"); // println!("[{:?}] {:?} {:?}",__lamellar_current_pe,res,timer.elapsed().as_secs_f64()); res } @@ -96,7 +96,16 @@ fn create_reduction( let timer = std::time::Instant::now(); let left = __lamellar_team.exec_am_pe( self.start_pe, #reduction_name { data: self.data.clone(), start_pe: self.start_pe, end_pe: mid_pe});//; let right = __lamellar_team.exec_am_pe( mid_pe+1, #reduction_name { data: self.data.clone(), start_pe: mid_pe+1, end_pe: self.end_pe});//; - let res = op(left.await,right.await); + let left = left.await; + let right = right.await; + + let res = match (left,right){ + (None,None) => None, + (Some(v),None) => Some(v), + (None,Some(v)) => Some(v), + (Some(v1),Some(v2)) => Some(op(v1,v2)) + }; + // println!("[{:?}] {:?} {:?}",__lamellar_current_pe,res,timer.elapsed().as_secs_f64()); res diff --git a/src/array/atomic.rs b/src/array/atomic.rs index f7734768..2890dce1 100644 --- a/src/array/atomic.rs +++ b/src/array/atomic.rs @@ -1168,7 +1168,7 @@ impl From for AtomicArray { } impl AtomicArray { - pub fn reduce(&self, reduction: &str) -> AmHandle { + pub fn reduce(&self, reduction: &str) -> AmHandle> { match self { AtomicArray::NativeAtomicArray(array) => array.reduce(reduction), AtomicArray::GenericAtomicArray(array) => array.reduce(reduction), @@ -1177,13 +1177,13 @@ impl AtomicArray { } impl AtomicArray { - pub fn sum(&self) -> AmHandle { + pub fn sum(&self) -> AmHandle> { match self { AtomicArray::NativeAtomicArray(array) => array.sum(), AtomicArray::GenericAtomicArray(array) => array.sum(), } } - pub fn prod(&self) -> AmHandle { + pub fn prod(&self) -> AmHandle> { match self { AtomicArray::NativeAtomicArray(array) => array.prod(), AtomicArray::GenericAtomicArray(array) => array.prod(), @@ -1191,13 +1191,13 @@ impl AtomicArray { } } impl AtomicArray { - pub fn max(&self) -> AmHandle { + pub fn max(&self) -> AmHandle> { match self { AtomicArray::NativeAtomicArray(array) => array.max(), AtomicArray::GenericAtomicArray(array) => array.max(), } } - pub fn min(&self) -> AmHandle { + pub fn min(&self) -> AmHandle> { match self { AtomicArray::NativeAtomicArray(array) => array.min(), AtomicArray::GenericAtomicArray(array) => array.min(), diff --git a/src/array/generic_atomic.rs b/src/array/generic_atomic.rs index d2ffbef4..ed31f5f4 100644 --- a/src/array/generic_atomic.rs +++ b/src/array/generic_atomic.rs @@ -825,23 +825,23 @@ impl ArrayPrint for GenericAtomicArray { } impl GenericAtomicArray { - pub fn reduce(&self, op: &str) -> AmHandle { + pub fn reduce(&self, op: &str) -> AmHandle> { self.array.reduce_data(op, self.clone().into()) } } impl GenericAtomicArray { - pub fn sum(&self) -> AmHandle { + pub fn sum(&self) -> AmHandle> { self.reduce("sum") } - pub fn prod(&self) -> AmHandle { + pub fn prod(&self) -> AmHandle> { self.reduce("prod") } } impl 
GenericAtomicArray { - pub fn max(&self) -> AmHandle { + pub fn max(&self) -> AmHandle> { self.reduce("max") } - pub fn min(&self) -> AmHandle { + pub fn min(&self) -> AmHandle> { self.reduce("min") } } diff --git a/src/array/global_lock_atomic.rs b/src/array/global_lock_atomic.rs index 8aa7b58a..325ee9ad 100644 --- a/src/array/global_lock_atomic.rs +++ b/src/array/global_lock_atomic.rs @@ -1029,7 +1029,7 @@ impl ArrayPrint for GlobalLockArray { #[doc(hidden)] #[pin_project] pub struct GlobalLockArrayReduceHandle { - req: AmHandle, + req: AmHandle>, lock_guard: GlobalRwDarcReadGuard<()>, } @@ -1046,7 +1046,7 @@ impl LamellarRequest for GlobalLockArrayReduceHandle { } impl Future for GlobalLockArrayReduceHandle { - type Output = T; + type Output = Option; fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { let this = self.project(); match this.req.ready_or_set_waker(cx.waker()) { diff --git a/src/array/iterator/distributed_iterator/consumer/sum.rs b/src/array/iterator/distributed_iterator/consumer/sum.rs index c9cd6b31..f6822835 100644 --- a/src/array/iterator/distributed_iterator/consumer/sum.rs +++ b/src/array/iterator/distributed_iterator/consumer/sum.rs @@ -92,7 +92,12 @@ where local_sums.async_barrier().await; // let buffered_iter = unsafe { local_sums.buffered_onesided_iter(self.team.num_pes) }; // buffered_iter.into_iter().map(|&e| e).sum() - unsafe { local_sums.sum().await } + unsafe { + local_sums + .sum() + .await + .expect("array size is greater than zero") + } } fn reduce_remote_vals(&self, local_sum: T, local_sums: UnsafeArray) -> T { @@ -102,7 +107,12 @@ where local_sums.tasking_barrier(); // let buffered_iter = unsafe { local_sums.buffered_onesided_iter(self.team.num_pes) }; // buffered_iter.into_iter().map(|&e| e).sum() - unsafe { local_sums.sum().blocking_wait() } + unsafe { + local_sums + .sum() + .blocking_wait() + .expect("array size is greater than zero") + } } } diff --git a/src/array/local_lock_atomic.rs b/src/array/local_lock_atomic.rs index a7e20d33..89210db3 100644 --- a/src/array/local_lock_atomic.rs +++ b/src/array/local_lock_atomic.rs @@ -927,7 +927,7 @@ impl ArrayPrint for LocalLockArray { #[doc(hidden)] #[pin_project] pub struct LocalLockArrayReduceHandle { - req: AmHandle, + req: AmHandle>, lock_guard: Arc>, } @@ -944,7 +944,7 @@ impl LamellarRequest for LocalLockArrayReduceHandle { } impl Future for LocalLockArrayReduceHandle { - type Output = T; + type Output = Option; fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { let this = self.project(); match this.req.ready_or_set_waker(cx.waker()) { diff --git a/src/array/native_atomic.rs b/src/array/native_atomic.rs index 5dde394a..1b971f85 100644 --- a/src/array/native_atomic.rs +++ b/src/array/native_atomic.rs @@ -1232,23 +1232,23 @@ impl ArrayPrint for NativeAtomicArray { } impl NativeAtomicArray { - pub fn reduce(&self, op: &str) -> AmHandle { + pub fn reduce(&self, op: &str) -> AmHandle> { self.array.reduce_data(op, self.clone().into()) } } impl NativeAtomicArray { - pub fn sum(&self) -> AmHandle { + pub fn sum(&self) -> AmHandle> { self.reduce("sum") } - pub fn prod(&self) -> AmHandle { + pub fn prod(&self) -> AmHandle> { self.reduce("prod") } } impl NativeAtomicArray { - pub fn max(&self) -> AmHandle { + pub fn max(&self) -> AmHandle> { self.reduce("max") } - pub fn min(&self) -> AmHandle { + pub fn min(&self) -> AmHandle> { self.reduce("min") } } diff --git a/src/array/read_only.rs b/src/array/read_only.rs index 7cae7245..81659d0d 100644 --- a/src/array/read_only.rs +++ 
b/src/array/read_only.rs @@ -489,23 +489,23 @@ impl From for ReadOnlyArray { } impl ReadOnlyArray { - pub fn reduce(&self, op: &str) -> AmHandle { + pub fn reduce(&self, op: &str) -> AmHandle> { self.array.reduce_data(op, self.clone().into()) } } impl ReadOnlyArray { - pub fn sum(&self) -> AmHandle { + pub fn sum(&self) -> AmHandle> { self.reduce("sum") } - pub fn prod(&self) -> AmHandle { + pub fn prod(&self) -> AmHandle> { self.reduce("prod") } } impl ReadOnlyArray { - pub fn max(&self) -> AmHandle { + pub fn max(&self) -> AmHandle> { self.reduce("max") } - pub fn min(&self) -> AmHandle { + pub fn min(&self) -> AmHandle> { self.reduce("min") } } diff --git a/src/array/unsafe.rs b/src/array/unsafe.rs index 30ef0d8d..3f04148d 100644 --- a/src/array/unsafe.rs +++ b/src/array/unsafe.rs @@ -1100,16 +1100,20 @@ impl UnsafeArray { .get(&(std::any::TypeId::of::(), op)) .expect("unexpected reduction type")(byte_array, self.inner.data.team.num_pes()) } - pub(crate) fn reduce_data(&self, op: &str, byte_array: LamellarByteArray) -> AmHandle { + pub(crate) fn reduce_data( + &self, + op: &str, + byte_array: LamellarByteArray, + ) -> AmHandle> { let func = self.get_reduction_op(op, byte_array); if let Ok(my_pe) = self.inner.data.team.team_pe_id() { - self.inner.data.team.exec_arc_am_pe::( + self.inner.data.team.exec_arc_am_pe::>( my_pe, func, Some(self.inner.data.array_counters.clone()), ) } else { - self.inner.data.team.exec_arc_am_pe::( + self.inner.data.team.exec_arc_am_pe::>( 0, func, Some(self.inner.data.array_counters.clone()), @@ -1153,7 +1157,7 @@ impl UnsafeArray { /// let sum = array.block_on(array.reduce("sum")); // equivalent to calling array.sum() /// //assert_eq!(array.len()*num_pes,sum); // may or may not fail ///``` - pub unsafe fn reduce(&self, op: &str) -> AmHandle { + pub unsafe fn reduce(&self, op: &str) -> AmHandle> { self.reduce_data(op, self.clone().into()) } @@ -1189,7 +1193,7 @@ impl UnsafeArray { /// let sum = array.block_on(unsafe{array.sum()}); //Safe in this instance as we have ensured no updates are currently happening /// // assert_eq!(array.len()*num_pes,sum);//this may or may not fail ///``` - pub unsafe fn sum(&self) -> AmHandle { + pub unsafe fn sum(&self) -> AmHandle> { self.reduce("sum") } @@ -1226,7 +1230,7 @@ impl UnsafeArray { /// let prod = array.block_on(array.prod()); /// assert_eq!((1..=array.len()).product::(),prod); ///``` - pub unsafe fn prod(&self) -> AmHandle { + pub unsafe fn prod(&self) -> AmHandle> { self.reduce("prod") } @@ -1257,7 +1261,7 @@ impl UnsafeArray { /// let max = array.block_on(max_req); /// assert_eq!((array.len()-1)*2,max); ///``` - pub unsafe fn max(&self) -> AmHandle { + pub unsafe fn max(&self) -> AmHandle> { self.reduce("max") } @@ -1288,7 +1292,7 @@ impl UnsafeArray { /// let min = array.block_on(min_req); /// assert_eq!(0,min); ///``` - pub unsafe fn min(&self) -> AmHandle { + pub unsafe fn min(&self) -> AmHandle> { self.reduce("min") } } From a806a22215ceb6a26990177dedcdd5348291e4a5 Mon Sep 17 00:00:00 2001 From: "Ryan D. 
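A short sketch of how calling code might adapt to reductions now returning an Option (None when there are no elements to reduce); the prelude import and array setup are assumed for illustration:

// Reductions such as sum/min/max now yield Option<T>: an empty local slice
// produces None rather than panicking inside the reduction active message.
use lamellar::array::prelude::*;

fn main() {
    let world = lamellar::LamellarWorldBuilder::new().build();
    let array = AtomicArray::<usize>::new(&world, 100, Distribution::Block);
    array.block_on(array.add(0, 1)); // bump one element so the reduction has something to see
    array.barrier();
    match array.block_on(array.sum()) {
        Some(sum) => println!("sum: {}", sum),
        None => println!("nothing to reduce"), // only possible for a zero-length array
    }
}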
Friese" Date: Wed, 17 Apr 2024 14:53:57 -0700 Subject: [PATCH 022/116] add test for 1 element reduce --- examples/array_examples/dist_array_reduce.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/examples/array_examples/dist_array_reduce.rs b/examples/array_examples/dist_array_reduce.rs index 15312ad9..470208b9 100644 --- a/examples/array_examples/dist_array_reduce.rs +++ b/examples/array_examples/dist_array_reduce.rs @@ -158,4 +158,9 @@ fn main() { // block_array.dist_iter().for_each(|x| println!("x: {:?}", x)); // block_array.for_each(|x| println!("x: {:?}", x)); // cyclic_array.for_each_mut(|x| *x += *x); + + let one_elem_array = UnsafeArray::::new(world.team(), 1, Distribution::Block); + let min = unsafe { one_elem_array.min() }; + let min = one_elem_array.block_on(min); + println!("one elem array min: {min:?}"); } From b7988fc29dfbcdaf95975f4062a1106a2fdb819b Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Wed, 17 Apr 2024 14:55:36 -0700 Subject: [PATCH 023/116] add serde_bytes to Vec for better ser/de --- impl/src/array_ops.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/impl/src/array_ops.rs b/impl/src/array_ops.rs index c701473e..bb5b6fe9 100644 --- a/impl/src/array_ops.rs +++ b/impl/src/array_ops.rs @@ -854,6 +854,7 @@ fn create_buf_ops( struct #multi_val_multi_idx_am_buf_name{ data: #lamellar::array::#array_type<#typeident>, op: #lamellar::array::ArrayOpCmd<#typeident>, + #[serde(with = "serde_bytes")] idx_vals: Vec, index_size: u8, } @@ -918,6 +919,7 @@ fn create_buf_ops( data: #lamellar::array::#array_type<#typeident>, op: #lamellar::array::ArrayOpCmd<#typeident>, val: #typeident, + #[serde(with = "serde_bytes")] indices: Vec, index_size: u8, } @@ -986,6 +988,7 @@ fn create_buf_ops( struct #multi_val_single_idx_am_buf_name{ data: #lamellar::array::#array_type<#typeident>, op: #lamellar::array::ArrayOpCmd<#typeident>, + #[serde(with = "serde_bytes")] vals: Vec, index: usize, } @@ -1026,6 +1029,7 @@ fn create_buf_ops( struct #multi_val_multi_idx_am_buf_result_name{ data: #lamellar::array::#array_type<#typeident>, op: #lamellar::array::ArrayOpCmd<#typeident>, + #[serde(with = "serde_bytes")] idx_vals: Vec, index_size: u8, } @@ -1092,6 +1096,7 @@ fn create_buf_ops( data: #lamellar::array::#array_type<#typeident>, op: #lamellar::array::ArrayOpCmd<#typeident>, val: #typeident, + #[serde(with = "serde_bytes")] indices: Vec, index_size: u8, } @@ -1161,6 +1166,7 @@ fn create_buf_ops( struct #multi_val_single_idx_am_buf_result_name{ data: #lamellar::array::#array_type<#typeident>, op: #lamellar::array::ArrayOpCmd<#typeident>, + #[serde(with = "serde_bytes")] vals: Vec, index: usize, } @@ -1204,6 +1210,7 @@ fn create_buf_ops( struct #multi_val_multi_idx_am_buf_fetch_name{ data: #lamellar::array::#array_type<#typeident>, op: #lamellar::array::ArrayOpCmd<#typeident>, + #[serde(with = "serde_bytes")] idx_vals: Vec, index_size: u8, } @@ -1273,6 +1280,7 @@ fn create_buf_ops( data: #lamellar::array::#array_type<#typeident>, op: #lamellar::array::ArrayOpCmd<#typeident>, val: #typeident, + #[serde(with = "serde_bytes")] indices: Vec, index_size: u8, } @@ -1347,6 +1355,7 @@ fn create_buf_ops( struct #multi_val_single_idx_am_buf_fetch_name{ data: #lamellar::array::#array_type<#typeident>, op: #lamellar::array::ArrayOpCmd<#typeident>, + #[serde(with = "serde_bytes")] vals: Vec, index: usize, } From 6a7ffbaeb72afe8883150818c35b0fd27bebf7de Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Wed, 17 Apr 2024 14:56:31 -0700 Subject: [PATCH 024/116] update setting env var --- tests/array/arithmetic_ops/fetch_add_test.rs | 2 +- tests/array/atomic_ops/compare_exchange_test.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/array/arithmetic_ops/fetch_add_test.rs b/tests/array/arithmetic_ops/fetch_add_test.rs index 5c3edf99..c1a27f03 100644 --- a/tests/array/arithmetic_ops/fetch_add_test.rs +++ b/tests/array/arithmetic_ops/fetch_add_test.rs @@ -392,7 +392,7 @@ macro_rules! check_results { macro_rules! input_test{ ($array:ident, $len:expr, $dist:ident) =>{ { - std::env::set_var("LAMELLAR_OP_BATCH","10"); + std::env::set_var("LAMELLAR_BATCH_OP_SIZE","10"); let world = lamellar::LamellarWorldBuilder::new().build(); let num_pes = world.num_pes(); let _my_pe = world.my_pe(); diff --git a/tests/array/atomic_ops/compare_exchange_test.rs b/tests/array/atomic_ops/compare_exchange_test.rs index 8c5ad445..d3047250 100644 --- a/tests/array/atomic_ops/compare_exchange_test.rs +++ b/tests/array/atomic_ops/compare_exchange_test.rs @@ -405,7 +405,7 @@ macro_rules! check_input { macro_rules! input_test{ ($array:ident, $len:expr, $dist:ident) =>{ { - std::env::set_var("LAMELLAR_OP_BATCH","10"); + std::env::set_var("LAMELLAR_BATCH_OP_SIZE","10"); let world = lamellar::LamellarWorldBuilder::new().build(); let num_pes = world.num_pes(); let my_pe = world.my_pe(); From d8b261b445d376d82ca9d65b92abd308cf6049fa Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Wed, 17 Apr 2024 14:58:54 -0700 Subject: [PATCH 025/116] more serde_byte optimizations --- src/array/global_lock_atomic/rdma.rs | 1 + src/array/local_lock_atomic/rdma.rs | 1 + src/array/unsafe/rdma.rs | 1 + 3 files changed, 3 insertions(+) diff --git a/src/array/global_lock_atomic/rdma.rs b/src/array/global_lock_atomic/rdma.rs index 2d257770..924abe7b 100644 --- a/src/array/global_lock_atomic/rdma.rs +++ b/src/array/global_lock_atomic/rdma.rs @@ -338,6 +338,7 @@ struct GlobalLockRemoteSmallPutAm { array: GlobalLockByteArray, //inner of the indices we need to place data into start_index: usize, len: usize, + #[serde(with = "serde_bytes")] data: Vec, } diff --git a/src/array/local_lock_atomic/rdma.rs b/src/array/local_lock_atomic/rdma.rs index 9643ddab..cc5d7bc8 100644 --- a/src/array/local_lock_atomic/rdma.rs +++ b/src/array/local_lock_atomic/rdma.rs @@ -295,6 +295,7 @@ struct LocalLockRemotePutAm { array: LocalLockByteArray, //inner of the indices we need to place data into start_index: usize, len: usize, + #[serde(with = "serde_bytes")] data: Vec, } diff --git a/src/array/unsafe/rdma.rs b/src/array/unsafe/rdma.rs index e43d9956..a3fa7677 100644 --- a/src/array/unsafe/rdma.rs +++ b/src/array/unsafe/rdma.rs @@ -1158,6 +1158,7 @@ struct UnsafeSmallPutAm { array: UnsafeByteArray, //byte representation of the array start_index: usize, //index with respect to inner (of type T) len: usize, //len of buf (with respect to original type T) + #[serde(with = "serde_bytes")] data: Vec, //change this to an enum which is a vector or OneSidedMemoryRegion depending on data size // pe: usize, } From 95cb2b2060f852450798a5897bd6295af98b1bc9 Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Wed, 17 Apr 2024 15:02:03 -0700 Subject: [PATCH 026/116] replace Pin> from array ops with concrete types --- src/array/operations.rs | 478 +---------------------- src/array/operations/access.rs | 31 +- src/array/operations/arithmetic.rs | 118 +++--- src/array/operations/bitwise.rs | 73 ++-- src/array/operations/compare_exchange.rs | 45 ++- src/array/operations/handle.rs | 284 ++++++++++++++ src/array/operations/read_only.rs | 24 +- src/array/operations/shift.rs | 50 +-- 8 files changed, 466 insertions(+), 637 deletions(-) create mode 100644 src/array/operations/handle.rs diff --git a/src/array/operations.rs b/src/array/operations.rs index a9fd71ca..d595c763 100644 --- a/src/array/operations.rs +++ b/src/array/operations.rs @@ -6,10 +6,13 @@ use crate::array::local_lock_atomic::*; use crate::array::native_atomic::*; use crate::array::{AmDist, Dist, LamellarEnv, LamellarWriteArray}; use crate::config; + // use crate::lamellar_request::LamellarRequest; // use crate::scheduler::Scheduler; // use crate::LamellarTeamRT; +pub(crate) mod handle; +pub use handle::{ArrayBatchOpHandle, ArrayFetchBatchOpHandle, ArrayResultBatchOpHandle}; pub(crate) mod access; pub use access::{AccessOps, LocalAtomicOps}; pub(crate) mod arithmetic; @@ -937,483 +940,8 @@ impl<'a, T: Dist + ElementOps> OpInput<'a, T> for NativeAtomicLocalData { } } -// #[doc(hidden)] -// pub trait BufferOp: Sync + Send { -// fn add_ops( -// &self, -// op: *const u8, -// op_data: *const u8, -// team: Pin>, -// ) -> (bool, Arc); -// fn add_fetch_ops( -// &self, -// pe: usize, -// op: *const u8, -// op_data: *const u8, -// req_ids: &Vec, -// res_map: OpResults, -// team: Pin>, -// ) -> (bool, Arc, Option); - -// fn into_arc_am( -// &self, -// pe: usize, -// sub_array: std::ops::Range, -// ) -> ( -// Vec, -// usize, -// Arc, -// Arc>>, -// ); -// } - -// #[doc(hidden)] -// pub type OpResultOffsets = Vec<(usize, usize, usize)>; //reqid,offset,len - -// #[doc(hidden)] -// pub struct OpReqOffsets(Arc>>); //pe -// impl OpReqOffsets { -// //#[tracing::instrument(skip_all)] -// // pub(crate) fn new() -> Self { -// // OpReqOffsets(Arc::new(Mutex::new(HashMap::new()))) -// // } -// //#[tracing::instrument(skip_all)] -// pub fn insert(&self, index: usize, indices: OpResultOffsets) { -// let mut map = self.0.lock(); -// map.insert(index, indices); -// } -// //#[tracing::instrument(skip_all)] -// pub(crate) fn lock(&self) -> parking_lot::MutexGuard> { -// self.0.lock() -// } -// } - -// impl Clone for OpReqOffsets { -// fn clone(&self) -> Self { -// OpReqOffsets(self.0.clone()) -// } -// } - -// impl std::fmt::Debug for OpReqOffsets { -// fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { -// let map = self.0.lock(); -// write!(f, "{:?} {:?}", map.len(), map) -// } -// } - -// #[doc(hidden)] -// pub type PeOpResults = Arc>>; - -// #[doc(hidden)] -// pub struct OpResults(Arc>>); -// impl OpResults { -// //#[tracing::instrument(skip_all)] -// // pub(crate) fn new() -> Self { -// // OpResults(Arc::new(Mutex::new(HashMap::new()))) -// // } -// //#[tracing::instrument(skip_all)] -// pub fn insert(&self, index: usize, val: PeOpResults) { -// let mut map = self.0.lock(); -// map.insert(index, val); -// } -// //#[tracing::instrument(skip_all)] -// pub(crate) fn lock(&self) -> parking_lot::MutexGuard> { -// self.0.lock() -// } -// } - -// impl Clone for OpResults { -// fn clone(&self) -> Self { -// OpResults(self.0.clone()) -// } -// } - -// impl std::fmt::Debug for OpResults { -// fn fmt(&self, f: &mut std::fmt::Formatter<'_>) 
-> std::fmt::Result { -// let map = self.0.lock(); -// write!(f, "{:?} {:?}", map.len(), map) -// } -// } - -// pub(crate) struct ArrayOpHandle { -// pub(crate) reqs: Vec>, -// } - -// #[derive(Debug)] -// pub(crate) struct ArrayOpHandleInner { -// pub(crate) complete: Vec>, -// pub(crate) scheduler: Arc, -// } - -// pub(crate) struct ArrayOpFetchHandle { -// pub(crate) req: Box>, -// } - -// pub(crate) struct ArrayOpBatchFetchHandle { -// pub(crate) reqs: Vec>>, -// } - -// #[derive(Debug)] -// pub(crate) struct ArrayOpFetchHandleInner { -// pub(crate) indices: OpReqOffsets, -// pub(crate) complete: Vec>, -// pub(crate) results: OpResults, -// pub(crate) req_cnt: usize, -// pub(crate) scheduler: Arc, -// pub(crate) _phantom: PhantomData, -// } - -// pub(crate) struct ArrayOpResultHandle { -// pub(crate) req: Box>, -// } -// pub(crate) struct ArrayOpBatchResultHandle { -// pub(crate) reqs: Vec>>, -// } - -// #[derive(Debug)] -// pub(crate) struct ArrayOpResultHandleInner { -// pub(crate) indices: OpReqOffsets, -// pub(crate) complete: Vec>, -// pub(crate) results: OpResults, -// pub(crate) req_cnt: usize, -// pub(crate) scheduler: Arc, -// pub(crate) _phantom: PhantomData, -// } - -// #[async_trait] -// impl LamellarRequest for ArrayOpHandle { -// type Output = (); -// //#[tracing::instrument(skip_all)] -// async fn into_future(mut self: Box) -> Self::Output { -// for req in self.reqs.drain(..) { -// req.await; -// } -// () -// } -// //#[tracing::instrument(skip_all)] -// fn blocking_wait(&self) -> Self::Output { -// for req in &self.reqs { -// req.blocking_wait(); -// } -// () -// } -// fn ready(&self) -> bool { -// self.reqs.iter().all(|req| req.ready()) -// } -// fn set_waker(&mut self, waker: futures_util::task::Waker) { -// for req in &mut self.reqs { -// req.set_waker(waker); -// } -// } -// } - -// #[async_trait] -// impl LamellarRequest for ArrayOpHandleInner { -// type Output = (); -// //#[tracing::instrument(skip_all)] -// async fn into_future(mut self: Box) -> Self::Output { -// for comp in self.complete { -// while comp.load(Ordering::Relaxed) == false { -// async_std::task::yield_now().await; -// } -// } -// () -// } -// //#[tracing::instrument(skip_all)] -// fn blocking_wait(&self) -> Self::Output { -// for comp in &self.complete { -// while comp.load(Ordering::Relaxed) == false { -// // std::thread::yield_now(); -// self.scheduler.exec_task(); -// } -// } -// () -// } - -// fn ready(&self) -> bool { -// self.complete -// .iter() -// .all(|comp| comp.load(Ordering::Relaxed)) -// } - -// fn set_waker(&mut self, waker: futures_util::task::Waker) { -// self.complete.iter() -// } - -// #[async_trait] -// impl LamellarRequest for ArrayOpFetchHandle { -// type Output = T; -// //#[tracing::instrument(skip_all)] -// async fn into_future(mut self: Box) -> Self::Output { -// self.req -// -// .await -// .pop() -// .expect("should have a single request") -// } -// //#[tracing::instrument(skip_all)] -// fn blocking_wait(&self) -> Self::Output { -// self.req.blocking_wait().pop().expect("should have a single request") -// } - -// fn ready(&self) -> bool { -// self.req.ready() -// } -// } - -// #[async_trait] -// impl LamellarRequest for ArrayOpBatchFetchHandle { -// type Output = Vec; -// //#[tracing::instrument(skip_all)] -// async fn into_future(mut self: Box) -> Self::Output { -// let mut res = vec![]; -// for req in self.reqs.drain(..) 
{ -// res.extend(req.await); -// } -// res -// } -// //#[tracing::instrument(skip_all)] -// fn blocking_wait(&self) -> Self::Output { -// let mut res = vec![]; -// for req in &self.reqs { -// res.extend(req.blocking_wait()); -// } -// // println!("res: {:?}",res); -// res -// } - -// fn ready(&self) -> bool { -// self.reqs.iter().all(|req| req.ready()) -// } -// } - -// impl ArrayOpFetchHandleInner { -// //#[tracing::instrument(skip_all)] -// fn get_result(&self) -> Vec { -// if self.req_cnt > 0 { -// let mut res_vec = Vec::with_capacity(self.req_cnt); -// unsafe { -// res_vec.set_len(self.req_cnt); -// } -// // println!("req_cnt: {:?}", self.req_cnt); - -// for (pe, res) in self.results.lock().iter() { -// let res = res.lock(); -// for (rid, offset, len) in self.indices.lock().get(pe).unwrap().iter() { -// let len = *len; -// if len == std::mem::size_of::() + 1 { -// panic!( -// "unexpected results len {:?} {:?}", -// len, -// std::mem::size_of::() + 1 -// ); -// } -// let res_t = unsafe { -// std::slice::from_raw_parts( -// res.as_ptr().offset(*offset as isize) as *const T, -// len / std::mem::size_of::(), -// ) -// }; -// // println!("rid {:?} offset {:?} len {:?} {:?}",rid,offset,len,res.len()); -// // println!("res {:?} {:?}",res.len(),&res[offset..offset+len]); -// // println!("res {:?} {:?}",res_t,res_t.len()); -// res_vec[*rid] = res_t[0]; -// } -// } -// res_vec -// } else { -// vec![] -// } -// } -// } - -// #[async_trait] -// impl LamellarRequest for ArrayOpFetchHandleInner { -// type Output = Vec; -// //#[tracing::instrument(skip_all)] -// async fn into_future(mut self: Box) -> Self::Output { -// for comp in &self.complete { -// while comp.load(Ordering::Relaxed) == false { -// async_std::task::yield_now().await; -// } -// } -// self.get_result() -// } -// //#[tracing::instrument(skip_all)] -// fn blocking_wait(&self) -> Self::Output { -// for comp in &self.complete { -// while comp.load(Ordering::Relaxed) == false { -// // std::thread::yield_now(); -// self.scheduler.exec_task(); -// } -// } -// self.get_result() -// } -// fn ready(&self) -> bool { -// self.complete -// .iter() -// .all(|comp| comp.load(Ordering::Relaxed)) -// } -// } - -// #[async_trait] -// impl LamellarRequest for ArrayOpResultHandle { -// type Output = Result; -// //#[tracing::instrument(skip_all)] -// async fn into_future(mut self: Box) -> Self::Output { -// self.req -// -// .await -// .pop() -// .expect("should have a single request") -// } -// //#[tracing::instrument(skip_all)] -// fn blocking_wait(&self) -> Self::Output { -// self.req.blocking_wait().pop().expect("should have a single request") -// } - -// fn ready(&self) -> bool { -// self.req.ready() -// } -// } - -// #[async_trait] -// impl LamellarRequest for ArrayOpBatchResultHandle { -// type Output = Vec>; -// //#[tracing::instrument(skip_all)] -// async fn into_future(mut self: Box) -> Self::Output { -// // println!("num_reqs: {}",self.reqs.len()); -// let mut res = vec![]; -// for req in self.reqs.drain(..) 
{ -// res.extend(req.await); -// } -// res -// } -// //#[tracing::instrument(skip_all)] -// fn blocking_wait(&self) -> Self::Output { -// let mut res = vec![]; -// for req in &self.reqs { -// res.extend(req.blocking_wait()); -// } -// res -// } - -// fn ready(&self) -> bool { -// self.reqs.iter().all(|req| req.ready()) -// } -// } - -// impl ArrayOpResultHandleInner { -// //#[tracing::instrument(skip_all)] -// fn get_result(&self) -> Vec> { -// // println!("req_cnt: {:?}", self.req_cnt); -// if self.req_cnt > 0 { -// let mut res_vec = Vec::with_capacity(self.req_cnt); -// unsafe { -// res_vec.set_len(self.req_cnt); -// } - -// for (pe, res) in self.results.lock().iter() { -// let res = res.lock(); -// // println!("{pe} {:?}",res.len()); -// // let mut rids = std::collections::HashSet::new(); -// let res_offsets_lock = self.indices.lock(); -// let res_offsets = res_offsets_lock.get(pe).unwrap(); -// // println!("{pe} {:?} {:?}",res_offsets[0],res_offsets.last()); -// for (rid, offset, len) in res_offsets.iter() { -// // if rids.contains(rid){ -// // println!("uhhh ohhhhh not sure this should be possible {:?}",rid); -// // } -// // else{ -// // rids.insert(rid); -// // } -// let ok: bool; -// let mut offset = *offset; -// let mut len = *len; -// if len == std::mem::size_of::() + 1 { -// ok = res[offset] == 0; -// offset += 1; -// len -= 1; -// } else { -// panic!( -// "unexpected results len {:?} {:?}", -// len, -// std::mem::size_of::() + 1 -// ); -// }; -// let res_t = unsafe { -// std::slice::from_raw_parts( -// res.as_ptr().offset(offset as isize) as *const T, -// len / std::mem::size_of::(), -// ) -// }; - -// if ok { -// res_vec[*rid] = Ok(res_t[0]); -// } else { -// res_vec[*rid] = Err(res_t[0]); -// } -// } -// } -// res_vec -// } else { -// vec![] -// } -// } -// } - -// #[async_trait] -// impl LamellarRequest for ArrayOpResultHandleInner { -// type Output = Vec>; -// //#[tracing::instrument(skip_all)] -// async fn into_future(mut self: Box) -> Self::Output { -// // println!("comp size: {}",self.complete.len()); -// for comp in &self.complete { -// while comp.load(Ordering::Relaxed) == false { -// async_std::task::yield_now().await; -// } -// } -// self.get_result() -// } -// //#[tracing::instrument(skip_all)] -// fn blocking_wait(&self) -> Self::Output { -// for comp in &self.complete { -// while comp.load(Ordering::Relaxed) == false { -// // std::thread::yield_now(); -// self.scheduler.exec_task(); -// } -// } -// self.get_result() -// } - -// fn ready(&self) -> bool { -// self.complete -// .iter() -// .all(|comp| comp.load(Ordering::Relaxed)) -// } -// } - /// Supertrait specifying that array elements must be [Sized] and must be able to be used in remote operations [Dist]. 
pub trait ElementOps: Dist + Sized {} impl ElementOps for T where T: Dist {} -// #[doc(hidden)] -// pub struct LocalOpResult { -// val: T, -// } - -// #[async_trait] -// impl LamellarArrayRequest for LocalOpResult { -// type Output = T; -// async fn into_future(mut self: Box) -> Self::Output { -// self.val -// } -// fn wait(self: Box) -> Self::Output { -// self.val -// } -// fn ready(&self) -> bool { -// true -// } -// } - impl ArithmeticOps for LamellarWriteArray {} diff --git a/src/array/operations/access.rs b/src/array/operations/access.rs index 99e79556..1aac1b00 100644 --- a/src/array/operations/access.rs +++ b/src/array/operations/access.rs @@ -1,5 +1,9 @@ use crate::array::*; +use super::handle::{ + ArrayBatchOpHandle, ArrayFetchBatchOpHandle, ArrayFetchOpHandle, ArrayOpHandle, +}; + #[doc(alias("One-sided", "onesided"))] /// The interface for remotely writing elements /// @@ -89,13 +93,10 @@ pub trait AccessOps: private::LamellarArrayPrivate { /// array.block_on(req); ///``` //#[tracing::instrument(skip_all)] - fn store<'a>(&self, index: usize, val: T) -> Pin + Send>> { - self.inner_array().initiate_batch_op( - val, - index, - ArrayOpCmd::Store, - self.as_lamellar_byte_array(), - ) + fn store<'a>(&self, index: usize, val: T) -> ArrayOpHandle { + self.inner_array() + .initiate_batch_op(val, index, ArrayOpCmd::Store, self.as_lamellar_byte_array()) + .into() } /// This call performs a batched vesion of the [store][AccessOps::store] function, @@ -128,7 +129,7 @@ pub trait AccessOps: private::LamellarArrayPrivate { &self, index: impl OpInput<'a, usize>, val: impl OpInput<'a, T>, - ) -> Pin + Send>> { + ) -> ArrayBatchOpHandle { self.inner_array().initiate_batch_op( val, index, @@ -162,14 +163,10 @@ pub trait AccessOps: private::LamellarArrayPrivate { /// let old = array.block_on(req); ///``` //#[tracing::instrument(skip_all)] - fn swap<'a>(&self, index: usize, val: T) -> Pin + Send>> { - let result = self.inner_array().initiate_batch_fetch_op_2( - val, - index, - ArrayOpCmd::Swap, - self.as_lamellar_byte_array(), - ); - Box::pin(async move { result.await[0] }) + fn swap<'a>(&self, index: usize, val: T) -> ArrayFetchOpHandle { + self.inner_array() + .initiate_batch_fetch_op_2(val, index, ArrayOpCmd::Swap, self.as_lamellar_byte_array()) + .into() } /// This call performs a batched vesion of the [swap][AccessOps::swap] function, @@ -203,7 +200,7 @@ pub trait AccessOps: private::LamellarArrayPrivate { &self, index: impl OpInput<'a, usize>, val: impl OpInput<'a, T>, - ) -> Pin> + Send>> { + ) -> ArrayFetchBatchOpHandle { self.inner_array().initiate_batch_fetch_op_2( val, index, diff --git a/src/array/operations/arithmetic.rs b/src/array/operations/arithmetic.rs index 2f0e8c39..720725d7 100644 --- a/src/array/operations/arithmetic.rs +++ b/src/array/operations/arithmetic.rs @@ -1,5 +1,8 @@ use crate::array::*; +use super::handle::{ + ArrayBatchOpHandle, ArrayFetchBatchOpHandle, ArrayFetchOpHandle, ArrayOpHandle, +}; /// Supertrait specifying elements of the array support remote arithmetic assign operations /// - Addition ```+=``` /// - Subtraction ```-=``` @@ -118,7 +121,7 @@ pub trait ArithmeticOps: private::LamellarArrayP /// array.block_on(req); ///``` //#[tracing::instrument(skip_all)] - fn add(&self, index: usize, val: T) -> Pin + Send>> { + fn add(&self, index: usize, val: T) -> ArrayOpHandle { self.inner_array().initiate_batch_op( val, index, @@ -157,7 +160,7 @@ pub trait ArithmeticOps: private::LamellarArrayP &self, index: impl OpInput<'a, usize>, val: impl OpInput<'a, T>, - ) -> Pin 
+ Send>> { + ) -> ArrayBatchOpHandle { // self.inner_array().initiate_op(val, index, ArrayOpCmd::Add) self.inner_array().initiate_batch_op( val, @@ -192,14 +195,15 @@ pub trait ArithmeticOps: private::LamellarArrayP /// let old = array.block_on(req); ///``` //#[tracing::instrument(skip_all)] - fn fetch_add(&self, index: usize, val: T) -> Pin + Send>> { - let result = self.inner_array().initiate_batch_fetch_op_2( - val, - index, - ArrayOpCmd::FetchAdd, - self.as_lamellar_byte_array(), - ); - Box::pin(async move { result.await[0] }) + fn fetch_add(&self, index: usize, val: T) -> ArrayFetchOpHandle { + self.inner_array() + .initiate_batch_fetch_op_2( + val, + index, + ArrayOpCmd::FetchAdd, + self.as_lamellar_byte_array(), + ) + .into() } /// This call performs a batched vesion of the [fetch_add][ArithmeticOps::fetch_add] function, @@ -233,7 +237,7 @@ pub trait ArithmeticOps: private::LamellarArrayP &self, index: impl OpInput<'a, usize>, val: impl OpInput<'a, T>, - ) -> Pin> + Send>> { + ) -> ArrayFetchBatchOpHandle { // println!("here in batch_fetch_add"); self.inner_array().initiate_batch_fetch_op_2( val, @@ -267,7 +271,7 @@ pub trait ArithmeticOps: private::LamellarArrayP /// array.block_on(req); ///``` //#[tracing::instrument(skip_all)] - fn sub<'a>(&self, index: usize, val: T) -> Pin + Send>> { + fn sub<'a>(&self, index: usize, val: T) -> ArrayOpHandle { self.inner_array().initiate_batch_op( val, index, @@ -306,7 +310,7 @@ pub trait ArithmeticOps: private::LamellarArrayP &self, index: impl OpInput<'a, usize>, val: impl OpInput<'a, T>, - ) -> Pin + Send>> { + ) -> ArrayBatchOpHandle { // self.inner_array().initiate_op(val, index, ArrayOpCmd::Sub) self.inner_array().initiate_batch_op( val, @@ -341,14 +345,15 @@ pub trait ArithmeticOps: private::LamellarArrayP /// let old = array.block_on(req); ///``` //#[tracing::instrument(skip_all)] - fn fetch_sub<'a>(&self, index: usize, val: T) -> Pin + Send>> { - let result = self.inner_array().initiate_batch_fetch_op_2( - val, - index, - ArrayOpCmd::FetchSub, - self.as_lamellar_byte_array(), - ); - Box::pin(async move { result.await[0] }) + fn fetch_sub<'a>(&self, index: usize, val: T) -> ArrayFetchOpHandle { + self.inner_array() + .initiate_batch_fetch_op_2( + val, + index, + ArrayOpCmd::FetchSub, + self.as_lamellar_byte_array(), + ) + .into() } /// This call performs a batched vesion of the [fetch_sub][ArithmeticOps::fetch_sub] function, @@ -382,7 +387,7 @@ pub trait ArithmeticOps: private::LamellarArrayP &self, index: impl OpInput<'a, usize>, val: impl OpInput<'a, T>, - ) -> Pin> + Send>> { + ) -> ArrayFetchBatchOpHandle { self.inner_array().initiate_batch_fetch_op_2( val, index, @@ -415,7 +420,7 @@ pub trait ArithmeticOps: private::LamellarArrayP /// array.block_on(req); ///``` //#[tracing::instrument(skip_all)] - fn mul<'a>(&self, index: usize, val: T) -> Pin + Send>> { + fn mul<'a>(&self, index: usize, val: T) -> ArrayOpHandle { self.inner_array().initiate_batch_op( val, index, @@ -454,7 +459,7 @@ pub trait ArithmeticOps: private::LamellarArrayP &self, index: impl OpInput<'a, usize>, val: impl OpInput<'a, T>, - ) -> Pin + Send>> { + ) -> ArrayBatchOpHandle { // self.inner_array().initiate_op(val, index, ArrayOpCmd::Mul) self.inner_array().initiate_batch_op( val, @@ -489,14 +494,15 @@ pub trait ArithmeticOps: private::LamellarArrayP /// let old = array.block_on(req); ///``` //#[tracing::instrument(skip_all)] - fn fetch_mul<'a>(&self, index: usize, val: T) -> Pin + Send>> { - let result = self.inner_array().initiate_batch_fetch_op_2( - val, - 
index, - ArrayOpCmd::FetchMul, - self.as_lamellar_byte_array(), - ); - Box::pin(async move { result.await[0] }) + fn fetch_mul<'a>(&self, index: usize, val: T) -> ArrayFetchOpHandle { + self.inner_array() + .initiate_batch_fetch_op_2( + val, + index, + ArrayOpCmd::FetchMul, + self.as_lamellar_byte_array(), + ) + .into() } /// This call performs a batched vesion of the [fetch_mul][ArithmeticOps::fetch_mul] function, @@ -530,7 +536,7 @@ pub trait ArithmeticOps: private::LamellarArrayP &self, index: impl OpInput<'a, usize>, val: impl OpInput<'a, T>, - ) -> Pin> + Send>> { + ) -> ArrayFetchBatchOpHandle { self.inner_array().initiate_batch_fetch_op_2( val, index, @@ -563,7 +569,7 @@ pub trait ArithmeticOps: private::LamellarArrayP /// array.block_on(req); ///``` //#[tracing::instrument(skip_all)] - fn div<'a>(&self, index: usize, val: T) -> Pin + Send>> { + fn div<'a>(&self, index: usize, val: T) -> ArrayOpHandle { self.inner_array().initiate_batch_op( val, index, @@ -602,7 +608,7 @@ pub trait ArithmeticOps: private::LamellarArrayP &self, index: impl OpInput<'a, usize>, val: impl OpInput<'a, T>, - ) -> Pin + Send>> { + ) -> ArrayBatchOpHandle { // self.inner_array().initiate_op(val, index, ArrayOpCmd::Div) self.inner_array().initiate_batch_op( val, @@ -637,14 +643,15 @@ pub trait ArithmeticOps: private::LamellarArrayP /// let old = array.block_on(req); ///``` //#[tracing::instrument(skip_all)] - fn fetch_div<'a>(&self, index: usize, val: T) -> Pin + Send>> { - let result = self.inner_array().initiate_batch_fetch_op_2( - val, - index, - ArrayOpCmd::FetchDiv, - self.as_lamellar_byte_array(), - ); - Box::pin(async move { result.await[0] }) + fn fetch_div<'a>(&self, index: usize, val: T) -> ArrayFetchOpHandle { + self.inner_array() + .initiate_batch_fetch_op_2( + val, + index, + ArrayOpCmd::FetchDiv, + self.as_lamellar_byte_array(), + ) + .into() } /// This call performs a batched vesion of the [fetch_div][ArithmeticOps::fetch_div] function, @@ -678,7 +685,7 @@ pub trait ArithmeticOps: private::LamellarArrayP &self, index: impl OpInput<'a, usize>, val: impl OpInput<'a, T>, - ) -> Pin> + Send>> { + ) -> ArrayFetchBatchOpHandle { self.inner_array().initiate_batch_fetch_op_2( val, index, @@ -711,7 +718,7 @@ pub trait ArithmeticOps: private::LamellarArrayP /// array.block_on(req); ///``` //#[tracing::instrument(skip_all)] - fn rem<'a>(&self, index: usize, val: T) -> Pin + Send>> { + fn rem<'a>(&self, index: usize, val: T) -> ArrayOpHandle { self.inner_array().initiate_batch_op( val, index, @@ -750,7 +757,7 @@ pub trait ArithmeticOps: private::LamellarArrayP &self, index: impl OpInput<'a, usize>, val: impl OpInput<'a, T>, - ) -> Pin + Send>> { + ) -> ArrayBatchOpHandle { // self.inner_array().initiate_op(val, index, ArrayOpCmd::Rem) self.inner_array().initiate_batch_op( val, @@ -785,14 +792,15 @@ pub trait ArithmeticOps: private::LamellarArrayP /// let old = array.block_on(req); ///``` //#[tracing::instrument(skip_all)] - fn fetch_rem<'a>(&self, index: usize, val: T) -> Pin + Send>> { - let result = self.inner_array().initiate_batch_fetch_op_2( - val, - index, - ArrayOpCmd::FetchRem, - self.as_lamellar_byte_array(), - ); - Box::pin(async move { result.await[0] }) + fn fetch_rem<'a>(&self, index: usize, val: T) -> ArrayFetchOpHandle { + self.inner_array() + .initiate_batch_fetch_op_2( + val, + index, + ArrayOpCmd::FetchRem, + self.as_lamellar_byte_array(), + ) + .into() } /// This call performs a batched vesion of the [fetch_rem][ArithmeticOps::fetch_rem] function, @@ -826,7 +834,7 @@ pub trait 
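A hedged sketch of the batched fetch variants above, mirroring the style of the surrounding doc examples; `array`, the indices, and the value are placeholders:

// hypothetical AtomicArray<usize> named `array`
let indices = vec![3usize, 54, 12, 88];
let req = array.batch_fetch_add(indices.clone(), 10usize); // ArrayFetchBatchOpHandle<usize>
let old_vals = array.block_on(req);                        // one previous value per input index
assert_eq!(old_vals.len(), indices.len());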
ArithmeticOps: private::LamellarArrayP &self, index: impl OpInput<'a, usize>, val: impl OpInput<'a, T>, - ) -> Pin> + Send>> { + ) -> ArrayFetchBatchOpHandle { self.inner_array().initiate_batch_fetch_op_2( val, index, diff --git a/src/array/operations/bitwise.rs b/src/array/operations/bitwise.rs index 08130c41..7e78d679 100644 --- a/src/array/operations/bitwise.rs +++ b/src/array/operations/bitwise.rs @@ -1,4 +1,8 @@ use crate::array::*; + +use super::handle::{ + ArrayBatchOpHandle, ArrayFetchBatchOpHandle, ArrayFetchOpHandle, ArrayOpHandle, +}; /// Supertrait specifying elements of the array support remote bitwise operations /// - And ```&``` /// - Or ```|``` @@ -104,7 +108,7 @@ pub trait BitWiseOps: private::LamellarArrayPrivate { /// array.block_on(req); ///``` //#[tracing::instrument(skip_all)] - fn bit_and<'a>(&self, index: usize, val: T) -> Pin + Send>> { + fn bit_and<'a>(&self, index: usize, val: T) -> ArrayOpHandle { self.inner_array().initiate_batch_op( val, index, @@ -143,7 +147,7 @@ pub trait BitWiseOps: private::LamellarArrayPrivate { &self, index: impl OpInput<'a, usize>, val: impl OpInput<'a, T>, - ) -> Pin + Send>> { + ) -> ArrayBatchOpHandle { // self.inner_array().initiate_op(val, index, ArrayOpCmd::And) self.inner_array().initiate_batch_op( val, @@ -178,14 +182,15 @@ pub trait BitWiseOps: private::LamellarArrayPrivate { /// let old = array.block_on(req); ///``` //#[tracing::instrument(skip_all)] - fn fetch_bit_and<'a>(&self, index: usize, val: T) -> Pin + Send>> { - let result = self.inner_array().initiate_batch_fetch_op_2( - val, - index, - ArrayOpCmd::FetchAnd, - self.as_lamellar_byte_array(), - ); - Box::pin(async move { result.await[0] }) + fn fetch_bit_and<'a>(&self, index: usize, val: T) -> ArrayFetchOpHandle { + self.inner_array() + .initiate_batch_fetch_op_2( + val, + index, + ArrayOpCmd::FetchAnd, + self.as_lamellar_byte_array(), + ) + .into() } /// This call performs a batched vesion of the [fetch_bit_and][BitWiseOps::fetch_bit_and] function, @@ -219,7 +224,7 @@ pub trait BitWiseOps: private::LamellarArrayPrivate { &self, index: impl OpInput<'a, usize>, val: impl OpInput<'a, T>, - ) -> Pin> + Send>> { + ) -> ArrayFetchBatchOpHandle { self.inner_array().initiate_batch_fetch_op_2( val, index, @@ -252,7 +257,7 @@ pub trait BitWiseOps: private::LamellarArrayPrivate { /// array.block_on(req); ///``` //#[tracing::instrument(skip_all)] - fn bit_or<'a>(&self, index: usize, val: T) -> Pin + Send>> { + fn bit_or<'a>(&self, index: usize, val: T) -> ArrayOpHandle { self.inner_array().initiate_batch_op( val, index, @@ -291,7 +296,7 @@ pub trait BitWiseOps: private::LamellarArrayPrivate { &self, index: impl OpInput<'a, usize>, val: impl OpInput<'a, T>, - ) -> Pin + Send>> { + ) -> ArrayBatchOpHandle { // self.inner_array().initiate_op(val, index, ArrayOpCmd::Or) self.inner_array().initiate_batch_op( val, @@ -326,14 +331,15 @@ pub trait BitWiseOps: private::LamellarArrayPrivate { /// let old = array.block_on(req); ///``` //#[tracing::instrument(skip_all)] - fn fetch_bit_or<'a>(&self, index: usize, val: T) -> Pin + Send>> { - let result = self.inner_array().initiate_batch_fetch_op_2( - val, - index, - ArrayOpCmd::FetchOr, - self.as_lamellar_byte_array(), - ); - Box::pin(async move { result.await[0] }) + fn fetch_bit_or<'a>(&self, index: usize, val: T) -> ArrayFetchOpHandle { + self.inner_array() + .initiate_batch_fetch_op_2( + val, + index, + ArrayOpCmd::FetchOr, + self.as_lamellar_byte_array(), + ) + .into() } /// This call performs a batched vesion of the 
[fetch_bit_or][BitWiseOps::fetch_bit_or] function, @@ -367,7 +373,7 @@ pub trait BitWiseOps: private::LamellarArrayPrivate { &self, index: impl OpInput<'a, usize>, val: impl OpInput<'a, T>, - ) -> Pin> + Send>> { + ) -> ArrayFetchBatchOpHandle { self.inner_array().initiate_batch_fetch_op_2( val, index, @@ -400,7 +406,7 @@ pub trait BitWiseOps: private::LamellarArrayPrivate { /// array.block_on(req); ///``` //#[tracing::instrument(skip_all)] - fn bit_xor<'a>(&self, index: usize, val: T) -> Pin + Send>> { + fn bit_xor<'a>(&self, index: usize, val: T) -> ArrayOpHandle { self.inner_array().initiate_batch_op( val, index, @@ -439,7 +445,7 @@ pub trait BitWiseOps: private::LamellarArrayPrivate { &self, index: impl OpInput<'a, usize>, val: impl OpInput<'a, T>, - ) -> Pin + Send>> { + ) -> ArrayBatchOpHandle { // self.inner_array().initiate_op(val, index, ArrayOpCmd::Xor) self.inner_array().initiate_batch_op( val, @@ -474,14 +480,15 @@ pub trait BitWiseOps: private::LamellarArrayPrivate { /// let old = array.block_on(req); ///``` //#[tracing::instrument(skip_all)] - fn fetch_bit_xor<'a>(&self, index: usize, val: T) -> Pin + Send>> { - let result = self.inner_array().initiate_batch_fetch_op_2( - val, - index, - ArrayOpCmd::FetchXor, - self.as_lamellar_byte_array(), - ); - Box::pin(async move { result.await[0] }) + fn fetch_bit_xor<'a>(&self, index: usize, val: T) -> ArrayFetchOpHandle { + self.inner_array() + .initiate_batch_fetch_op_2( + val, + index, + ArrayOpCmd::FetchXor, + self.as_lamellar_byte_array(), + ) + .into() } /// This call performs a batched vesion of the [fetch_bit_xor][BitWiseOps::fetch_bit_xor] function, @@ -515,7 +522,7 @@ pub trait BitWiseOps: private::LamellarArrayPrivate { &self, index: impl OpInput<'a, usize>, val: impl OpInput<'a, T>, - ) -> Pin> + Send>> { + ) -> ArrayFetchBatchOpHandle { self.inner_array().initiate_batch_fetch_op_2( val, index, diff --git a/src/array/operations/compare_exchange.rs b/src/array/operations/compare_exchange.rs index fbc0c82b..ce3a5edb 100644 --- a/src/array/operations/compare_exchange.rs +++ b/src/array/operations/compare_exchange.rs @@ -1,5 +1,7 @@ use crate::array::*; +use super::handle::{ArrayResultBatchOpHandle, ArrayResultOpHandle}; + /// Supertrait specifying elements of the array support remote Equality operations /// - ```==``` /// - ```!=``` @@ -119,19 +121,15 @@ pub trait CompareExchangeOps: private::LamellarArrayPriv /// let result = array.block_on(req); ///``` //#[tracing::instrument(skip_all)] - fn compare_exchange<'a>( - &self, - index: usize, - current: T, - new: T, - ) -> Pin> + Send>> { - let result = self.inner_array().initiate_batch_result_op_2( - new, - index, - ArrayOpCmd::CompareExchange(current), - self.as_lamellar_byte_array(), - ); - Box::pin(async move { result.await[0] }) + fn compare_exchange<'a>(&self, index: usize, current: T, new: T) -> ArrayResultOpHandle { + self.inner_array() + .initiate_batch_result_op_2( + new, + index, + ArrayOpCmd::CompareExchange(current), + self.as_lamellar_byte_array(), + ) + .into() } /// This call performs a batched vesion of the [compare_exchange][CompareExchangeOps::compare_exchange] function, @@ -168,7 +166,7 @@ pub trait CompareExchangeOps: private::LamellarArrayPriv index: impl OpInput<'a, usize>, current: T, new: impl OpInput<'a, T>, - ) -> Pin>> + Send>> { + ) -> ArrayResultBatchOpHandle { self.inner_array().initiate_batch_result_op_2( new, index, @@ -294,14 +292,15 @@ pub trait CompareExchangeEpsilonOps: current: T, new: T, eps: T, - ) -> Pin> + Send>> { - let result = 
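A small sketch of the result-style handle returned by compare_exchange, assuming the conventional compare-and-swap convention (Ok when the exchange happened, Err carrying the value actually found); the array and values are illustrative:

// hypothetical AtomicArray<usize> named `array`
let req = array.compare_exchange(17, 0usize, 42usize); // ArrayResultOpHandle<usize>
match array.block_on(req) {
    Ok(_) => println!("element 17 was 0 and is now 42"),
    Err(cur) => println!("no exchange, element 17 currently holds {cur}"),
}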
self.inner_array().initiate_batch_result_op_2( - new, - index, - ArrayOpCmd::CompareExchangeEps(current, eps), - self.as_lamellar_byte_array(), - ); - Box::pin(async move { result.await[0] }) + ) -> ArrayResultOpHandle { + self.inner_array() + .initiate_batch_result_op_2( + new, + index, + ArrayOpCmd::CompareExchangeEps(current, eps), + self.as_lamellar_byte_array(), + ) + .into() } /// This call performs a batched vesion of the [compare_exchange_epsilon][CompareExchangeEpsilonOps::compare_exchange_epsilon] function, @@ -340,7 +339,7 @@ pub trait CompareExchangeEpsilonOps: current: T, new: impl OpInput<'a, T>, eps: T, - ) -> Pin>> + Send>> { + ) -> ArrayResultBatchOpHandle { self.inner_array().initiate_batch_result_op_2( new, index, diff --git a/src/array/operations/handle.rs b/src/array/operations/handle.rs new file mode 100644 index 00000000..855cbe89 --- /dev/null +++ b/src/array/operations/handle.rs @@ -0,0 +1,284 @@ +use crate::{ + array::{AmDist, LamellarByteArray}, + lamellar_request::LamellarRequest, + AmHandle, +}; + +use std::{ + collections::VecDeque, + future::Future, + pin::Pin, + task::{Context, Poll, Waker}, +}; + +use pin_project::pin_project; + +pub struct ArrayBatchOpHandle { + pub(crate) _array: LamellarByteArray, //prevents prematurely performing a local drop + pub(crate) reqs: VecDeque<(AmHandle<()>, Vec)>, +} + +pub type ArrayOpHandle = ArrayBatchOpHandle; + +impl LamellarRequest for ArrayBatchOpHandle { + fn blocking_wait(mut self) -> Self::Output { + for req in self.reqs.drain(0..) { + req.0.blocking_wait(); + } + } + fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { + let mut ready = true; + for req in self.reqs.iter_mut() { + ready &= req.0.ready_or_set_waker(waker); + } + ready + } + fn val(&self) -> Self::Output { + for req in self.reqs.iter() { + req.0.val(); + } + } +} + +impl Future for ArrayBatchOpHandle { + type Output = (); + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + while let Some(mut req) = self.reqs.pop_front() { + if !req.0.ready_or_set_waker(cx.waker()) { + self.reqs.push_front(req); + return Poll::Pending; + } + } + Poll::Ready(()) + } +} + +pub struct ArrayFetchOpHandle { + pub(crate) _array: LamellarByteArray, //prevents prematurely performing a local drop + pub(crate) req: AmHandle>, +} + +impl LamellarRequest for ArrayFetchOpHandle { + fn blocking_wait(mut self) -> Self::Output { + self.req + .blocking_wait() + .pop() + .expect("should have a single request") + } + fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { + self.req.ready_or_set_waker(waker) + } + fn val(&self) -> Self::Output { + self.req.val().pop().expect("should have a single request") + } +} + +impl Future for ArrayFetchOpHandle { + type Output = R; + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + if self.req.ready_or_set_waker(cx.waker()) { + return Poll::Ready(self.req.val().pop().expect("should have a single request")); + } + Poll::Pending + } +} + +#[pin_project] +pub struct ArrayFetchBatchOpHandle { + pub(crate) _array: LamellarByteArray, //prevents prematurely performing a local drop + pub(crate) reqs: VecDeque<(AmHandle>, Vec)>, + results: Vec, +} + +impl From> for ArrayFetchOpHandle { + fn from(mut req: ArrayFetchBatchOpHandle) -> Self { + Self { + _array: req._array, + req: req.reqs.pop_front().unwrap().0, + } + } +} + +impl ArrayFetchBatchOpHandle { + pub(crate) fn new( + array: LamellarByteArray, + reqs: VecDeque<(AmHandle>, Vec)>, + max_index: usize, + ) -> Self { + let mut results = 
Vec::with_capacity(max_index); + unsafe { + results.set_len(max_index); + } + Self { + _array: array, + reqs, + results, + } + } +} + +impl LamellarRequest for ArrayFetchBatchOpHandle { + fn blocking_wait(mut self) -> Self::Output { + for req in self.reqs.drain(0..) { + let mut res = req.0.blocking_wait(); + for (val, idx) in res.drain(..).zip(req.1.iter()) { + self.results[*idx] = val; + } + } + std::mem::take(&mut self.results) + } + fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { + let mut ready = true; + for req in self.reqs.iter_mut() { + ready &= req.0.ready_or_set_waker(waker); + } + ready + } + fn val(&self) -> Self::Output { + let mut results = Vec::with_capacity(self.results.len()); + unsafe { + results.set_len(self.results.len()); + } + for req in &self.reqs { + let mut res = req.0.val(); + for (val, idx) in res.drain(..).zip(req.1.iter()) { + results[*idx] = val; + } + } + results + } +} + +impl Future for ArrayFetchBatchOpHandle { + type Output = Vec; + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let mut this = self.project(); + while let Some(mut req) = this.reqs.pop_front() { + if !req.0.ready_or_set_waker(cx.waker()) { + this.reqs.push_front(req); + return Poll::Pending; + } else { + let mut res = req.0.val(); + for (val, idx) in res.drain(..).zip(req.1.iter()) { + this.results[*idx] = val; + } + } + } + Poll::Ready(std::mem::take(&mut this.results)) + } +} + +pub struct ArrayResultOpHandle { + pub(crate) _array: LamellarByteArray, //prevents prematurely performing a local drop + pub(crate) req: AmHandle>>, +} + +impl LamellarRequest for ArrayResultOpHandle { + fn blocking_wait(self) -> Self::Output { + self.req + .blocking_wait() + .pop() + .expect("should have a single request") + } + fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { + self.req.ready_or_set_waker(waker) + } + fn val(&self) -> Self::Output { + self.req.val().pop().expect("should have a single request") + } +} + +impl Future for ArrayResultOpHandle { + type Output = Result; + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + if self.req.ready_or_set_waker(cx.waker()) { + return Poll::Ready(self.req.val().pop().expect("should have a single request")); + } + Poll::Pending + } +} + +#[pin_project] +pub struct ArrayResultBatchOpHandle { + pub(crate) _array: LamellarByteArray, //prevents prematurely performing a local drop + pub(crate) reqs: VecDeque<(AmHandle>>, Vec)>, + results: Vec>, +} + +impl From> for ArrayResultOpHandle { + fn from(mut req: ArrayResultBatchOpHandle) -> Self { + Self { + _array: req._array, + req: req.reqs.pop_front().unwrap().0, + } + } +} + +impl ArrayResultBatchOpHandle { + pub(crate) fn new( + array: LamellarByteArray, + reqs: VecDeque<(AmHandle>>, Vec)>, + max_index: usize, + ) -> Self { + let mut results = Vec::with_capacity(max_index); + unsafe { + results.set_len(max_index); + } + Self { + _array: array, + reqs, + results, + } + } +} + +impl LamellarRequest for ArrayResultBatchOpHandle { + fn blocking_wait(mut self) -> Self::Output { + for req in self.reqs.drain(0..) 
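The batch handles above stitch the per-sub-request results back into a single Vec using the recorded index lists; a standalone sketch of that scatter step (illustrative code, not part of the library):

// each sub-request reports its values in submission order together with the
// positions those values occupy in the caller's original input
fn scatter_sub_results<T: Copy>(results: &mut [T], sub_vals: Vec<T>, positions: &[usize]) {
    for (val, idx) in sub_vals.into_iter().zip(positions.iter()) {
        results[*idx] = val;
    }
}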
{ + let mut res = req.0.blocking_wait(); + for (val, idx) in res.drain(..).zip(req.1.iter()) { + self.results[*idx] = val; + } + } + std::mem::take(&mut self.results) + } + fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { + let mut ready = true; + for req in self.reqs.iter_mut() { + ready &= req.0.ready_or_set_waker(waker); + } + ready + } + fn val(&self) -> Self::Output { + let mut results = Vec::with_capacity(self.results.len()); + unsafe { + results.set_len(self.results.len()); + } + for req in &self.reqs { + let mut res = req.0.val(); + for (val, idx) in res.drain(..).zip(req.1.iter()) { + results[*idx] = val; + } + } + results + } +} + +impl Future for ArrayResultBatchOpHandle { + type Output = Vec>; + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let mut this = self.project(); + while let Some(mut req) = this.reqs.pop_front() { + if !req.0.ready_or_set_waker(cx.waker()) { + this.reqs.push_front(req); + return Poll::Pending; + } else { + let mut res = req.0.val(); + for (val, idx) in res.drain(..).zip(req.1.iter()) { + this.results[*idx] = val; + } + } + } + Poll::Ready(std::mem::take(&mut this.results)) + } +} diff --git a/src/array/operations/read_only.rs b/src/array/operations/read_only.rs index bb4b3e68..ac2f7dc1 100644 --- a/src/array/operations/read_only.rs +++ b/src/array/operations/read_only.rs @@ -1,5 +1,7 @@ use crate::array::*; +use super::handle::{ArrayFetchBatchOpHandle, ArrayFetchOpHandle}; + #[doc(alias("One-sided", "onesided"))] /// The interface for remotely reading elements /// @@ -68,16 +70,17 @@ pub trait ReadOnlyOps: private::LamellarArrayPrivate { /// let val = array.block_on(req); ///``` //#[tracing::instrument(skip_all)] - fn load<'a>(&self, index: usize) -> Pin + Send>> { + fn load<'a>(&self, index: usize) -> ArrayFetchOpHandle { let dummy_val = self.inner_array().dummy_val(); //we dont actually do anything with this except satisfy apis; // let array = self.inner_array(); - let result = self.inner_array().initiate_batch_fetch_op_2( - dummy_val, - index, - ArrayOpCmd::Load, - self.as_lamellar_byte_array(), - ); - Box::pin(async move { result.await[0] }) + self.inner_array() + .initiate_batch_fetch_op_2( + dummy_val, + index, + ArrayOpCmd::Load, + self.as_lamellar_byte_array(), + ) + .into() } /// This call performs a batched vesion of the [load][ReadOnlyOps::load] function, @@ -109,10 +112,7 @@ pub trait ReadOnlyOps: private::LamellarArrayPrivate { /// assert_eq!(vals.len(),indices.len()); ///``` //#[tracing::instrument(skip_all)] - fn batch_load<'a>( - &self, - index: impl OpInput<'a, usize>, - ) -> Pin> + Send>> { + fn batch_load<'a>(&self, index: impl OpInput<'a, usize>) -> ArrayFetchBatchOpHandle { let dummy_val = self.inner_array().dummy_val(); //we dont actually do anything with this except satisfy apis; self.inner_array().initiate_batch_fetch_op_2( dummy_val, diff --git a/src/array/operations/shift.rs b/src/array/operations/shift.rs index e724e030..be61a1a0 100644 --- a/src/array/operations/shift.rs +++ b/src/array/operations/shift.rs @@ -1,4 +1,8 @@ use crate::array::*; + +use super::handle::{ + ArrayBatchOpHandle, ArrayFetchBatchOpHandle, ArrayFetchOpHandle, ArrayOpHandle, +}; /// Supertrait specifying elements of the array support remote Shift operations /// - Left ```<<``` /// - Right ```>>``` @@ -71,7 +75,7 @@ pub trait ShiftOps: private::LamellarArrayPrivate { /// array.block_on(req); ///``` //#[tracing::instrument(skip_all)] - fn shl(&self, index: usize, val: T) -> Pin + Send>> { + fn shl(&self, index: usize, val: T) -> 
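A hedged usage sketch of the read-only load above; `array` and the index are placeholders:

// hypothetical ReadOnlyArray<usize> (or any array type implementing ReadOnlyOps) named `array`
let req = array.load(53);          // ArrayFetchOpHandle<usize>
let val = array.block_on(req);     // or `req.await` from async code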
ArrayOpHandle { self.inner_array().initiate_batch_op( val, index, @@ -110,7 +114,7 @@ pub trait ShiftOps: private::LamellarArrayPrivate { &self, index: impl OpInput<'a, usize>, val: impl OpInput<'a, T>, - ) -> Pin + Send>> { + ) -> ArrayBatchOpHandle { // self.inner_array().initiate_batch_op(val, index, ArrayOpCmd::Shl) self.inner_array().initiate_batch_op( val, @@ -145,14 +149,15 @@ pub trait ShiftOps: private::LamellarArrayPrivate { /// let old = array.block_on(req); ///``` //#[tracing::instrument(skip_all)] - fn fetch_shl(&self, index: usize, val: T) -> Pin + Send>> { - let result = self.inner_array().initiate_batch_fetch_op_2( - val, - index, - ArrayOpCmd::FetchShl, - self.as_lamellar_byte_array(), - ); - Box::pin(async move { result.await[0] }) + fn fetch_shl(&self, index: usize, val: T) -> ArrayFetchOpHandle { + self.inner_array() + .initiate_batch_fetch_op_2( + val, + index, + ArrayOpCmd::FetchShl, + self.as_lamellar_byte_array(), + ) + .into() } /// This call performs a batched vesion of the [fetch_shl][ShiftOps::fetch_shl] function, @@ -186,7 +191,7 @@ pub trait ShiftOps: private::LamellarArrayPrivate { &self, index: impl OpInput<'a, usize>, val: impl OpInput<'a, T>, - ) -> Pin> + Send>> { + ) -> ArrayFetchBatchOpHandle { self.inner_array().initiate_batch_fetch_op_2( val, index, @@ -219,7 +224,7 @@ pub trait ShiftOps: private::LamellarArrayPrivate { /// array.block_on(req); ///``` //#[tracing::instrument(skip_all)] - fn shr<'a>(&self, index: usize, val: T) -> Pin + Send>> { + fn shr<'a>(&self, index: usize, val: T) -> ArrayOpHandle { self.inner_array().initiate_batch_op( val, index, @@ -258,7 +263,7 @@ pub trait ShiftOps: private::LamellarArrayPrivate { &self, index: impl OpInput<'a, usize>, val: impl OpInput<'a, T>, - ) -> Pin + Send>> { + ) -> ArrayBatchOpHandle { // self.inner_array().initiate_batch_op(val, index, ArrayOpCmd::Shr) self.inner_array().initiate_batch_op( val, @@ -293,14 +298,15 @@ pub trait ShiftOps: private::LamellarArrayPrivate { /// let old = array.block_on(req); ///``` //#[tracing::instrument(skip_all)] - fn fetch_shr<'a>(&self, index: usize, val: T) -> Pin + Send>> { - let result = self.inner_array().initiate_batch_fetch_op_2( - val, - index, - ArrayOpCmd::FetchShr, - self.as_lamellar_byte_array(), - ); - Box::pin(async move { result.await[0] }) + fn fetch_shr<'a>(&self, index: usize, val: T) -> ArrayFetchOpHandle { + self.inner_array() + .initiate_batch_fetch_op_2( + val, + index, + ArrayOpCmd::FetchShr, + self.as_lamellar_byte_array(), + ) + .into() } /// This call performs a batched vesion of the [fetch_shr][ShiftOps::fetch_shr] function, @@ -334,7 +340,7 @@ pub trait ShiftOps: private::LamellarArrayPrivate { &self, index: impl OpInput<'a, usize>, val: impl OpInput<'a, T>, - ) -> Pin> + Send>> { + ) -> ArrayFetchBatchOpHandle { self.inner_array().initiate_batch_fetch_op_2( val, index, From f1816760b11227e12e1ae8a925b17b6c8ca56c40 Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Wed, 17 Apr 2024 15:03:31 -0700 Subject: [PATCH 027/116] clean up warnings and debug stmts --- src/array/global_lock_atomic.rs | 6 +++--- src/barrier.rs | 4 ++-- src/darc.rs | 6 +++--- src/lamellae/command_queues.rs | 10 +++++----- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/array/global_lock_atomic.rs b/src/array/global_lock_atomic.rs index 325ee9ad..9486625e 100644 --- a/src/array/global_lock_atomic.rs +++ b/src/array/global_lock_atomic.rs @@ -113,7 +113,7 @@ pub struct GlobalLockCollectiveMutLocalData { pub(crate) array: GlobalLockArray, start_index: usize, end_index: usize, - lock_guard: GlobalRwDarcCollectiveWriteGuard<()>, + _lock_guard: GlobalRwDarcCollectiveWriteGuard<()>, } // impl Drop for GlobalLockCollectiveMutLocalData{ @@ -542,7 +542,7 @@ impl GlobalLockArray { array: self_clone.clone(), start_index: 0, end_index: self_clone.array.num_elems_local(), - lock_guard: lock, + _lock_guard: lock, }; // println!("got lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); data @@ -577,7 +577,7 @@ impl GlobalLockArray { array: self.clone(), start_index: 0, end_index: self.array.num_elems_local(), - lock_guard: lock, + _lock_guard: lock, }; // println!("got lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); data diff --git a/src/barrier.rs b/src/barrier.rs index 3827f562..f637bd56 100644 --- a/src/barrier.rs +++ b/src/barrier.rs @@ -204,7 +204,7 @@ impl Barrier { // .expect("Data should exist on PE") // } // ); - println!("barrier put_slice 1"); + // println!("barrier put_slice 1"); unsafe { self.barrier_buf[i - 1].put_slice( send_pe, @@ -336,7 +336,7 @@ impl Barrier { // .expect("Data should exist on PE") // } // ); - println!("barrier put_slice 2"); + // println!("barrier put_slice 2"); unsafe { self.barrier_buf[i - 1].put_slice( send_pe, diff --git a/src/darc.rs b/src/darc.rs index 119c6029..176b5d92 100644 --- a/src/darc.rs +++ b/src/darc.rs @@ -500,7 +500,7 @@ impl DarcInner { // inner.my_pe * std::mem::size_of::(), // inner.mode_ref_cnt_addr + inner.my_pe * std::mem::size_of::() // ); - println!("darc block_on_outstanding put 1"); + // println!("darc block_on_outstanding put 1"); rdma.put( send_pe, ref_cnt_u8, @@ -570,7 +570,7 @@ impl DarcInner { std::mem::size_of::(), ) }; - println!("darc block_on_outstanding put 2"); + // println!("darc block_on_outstanding put 2"); rdma.put( send_pe, barrier_id_slice, @@ -620,7 +620,7 @@ impl DarcInner { }; let rdma = &team.lamellae; for pe in team.arch.team_iter() { - println!("darc block_on_outstanding put 3"); + // println!("darc block_on_outstanding put 3"); rdma.put( pe, &mode_refs[inner.my_pe..=inner.my_pe], diff --git a/src/lamellae/command_queues.rs b/src/lamellae/command_queues.rs index 044334ee..989f830d 100644 --- a/src/lamellae/command_queues.rs +++ b/src/lamellae/command_queues.rs @@ -675,7 +675,7 @@ impl InnerCQ { let recv_buffer = self.recv_buffer.lock(); // println! 
{"sending data to dst {:?} {:?} {:?} {:?}",recv_buffer[self.my_pe].as_addr()-self.comm.base_addr(),send_buf[dst],send_buf[dst].as_bytes(),send_buf}; // println!("sending cmd {:?}", send_buf); - println!("Command Queue sending buffer"); + // println!("Command Queue sending buffer"); self.comm.put( dst, send_buf[dst].as_bytes(), @@ -901,7 +901,7 @@ impl InnerCQ { // let cmd_buffer = self.cmd_buffers[dst].lock(); // println!("sending release: {:?} cmd: {:?} {:?} {:?} 0x{:x} 0x{:x}",self.release_cmd,cmd,self.release_cmd.cmd_as_bytes(), cmd.cmd_as_bytes(),self.release_cmd.cmd_as_addr(),cmd.daddr + offset_of!(CmdMsg,cmd)); let local_daddr = self.comm.local_addr(dst, cmd.daddr); - println!("sending release to {dst}"); + // println!("sending release to {dst}"); self.comm.put( dst, self.release_cmd.cmd_as_bytes(), @@ -914,7 +914,7 @@ impl InnerCQ { // let cmd_buffer = self.cmd_buffers[dst].lock(); // println!("sending release: {:?} cmd: {:?} {:?} {:?} 0x{:x} 0x{:x}",self.release_cmd,cmd,self.release_cmd.cmd_as_bytes(), cmd.cmd_as_bytes(),self.release_cmd.cmd_as_addr(),cmd.daddr + offset_of!(CmdMsg,cmd)); let local_daddr = self.comm.local_addr(dst, cmd.daddr); - println!("sending free to {dst}"); + // println!("sending free to {dst}"); self.comm.put( dst, self.free_cmd.cmd_as_bytes(), @@ -980,7 +980,7 @@ impl InnerCQ { //#[tracing::instrument(skip_all)] async fn get_data(&self, src: usize, cmd: CmdMsg, data_slice: &mut [u8]) { let local_daddr = self.comm.local_addr(src, cmd.daddr); - println!("command queue getting data from {src}"); + // println!("command queue getting data from {src}"); self.comm.iget(src, local_daddr as usize, data_slice); // self.get_amt.fetch_add(data_slice.len(),Ordering::Relaxed); let mut timer = std::time::Instant::now(); @@ -1009,7 +1009,7 @@ impl InnerCQ { async fn get_serialized_data(&self, src: usize, cmd: CmdMsg, ser_data: &SerializedData) { let data_slice = ser_data.header_and_data_as_bytes(); let local_daddr = self.comm.local_addr(src, cmd.daddr); - println!("command queue getting serialized data from {src}"); + // println!("command queue getting serialized data from {src}"); self.comm.iget(src, local_daddr as usize, data_slice); // self.get_amt.fetch_add(data_slice.len(),Ordering::Relaxed); let mut timer = std::time::Instant::now(); From 5e942145ea74bc04eb644b7b8114d827825e8b60 Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Mon, 22 Apr 2024 13:43:08 -0700 Subject: [PATCH 028/116] place more enviornments variables into module --- src/active_messaging.rs | 2 +- .../registered_active_message.rs | 12 +++++----- src/array/global_lock_atomic/rdma.rs | 3 ++- src/env_var.rs | 23 ++++++++++++++++++- src/lamellae/command_queues.rs | 12 +++++----- 5 files changed, 37 insertions(+), 15 deletions(-) diff --git a/src/active_messaging.rs b/src/active_messaging.rs index fcf1d7a9..ca4dbc2f 100644 --- a/src/active_messaging.rs +++ b/src/active_messaging.rs @@ -661,7 +661,7 @@ pub(crate) mod batching; pub(crate) mod handle; pub use handle::*; -pub(crate) const BATCH_AM_SIZE: usize = 100_000; +// pub(crate) const BATCH_AM_SIZE: usize = 100_000; /// This macro is used to setup the attributed type so that it can be used within remote active messages. 
/// diff --git a/src/active_messaging/registered_active_message.rs b/src/active_messaging/registered_active_message.rs index 15a23040..10a20c79 100644 --- a/src/active_messaging/registered_active_message.rs +++ b/src/active_messaging/registered_active_message.rs @@ -1,10 +1,10 @@ use crate::active_messaging::batching::{Batcher, BatcherType}; -use crate::active_messaging::*; use crate::lamellae::comm::AllocError; use crate::lamellae::{ Backend, Des, Lamellae, LamellaeAM, LamellaeComm, LamellaeRDMA, Ser, SerializeHeader, SerializedData, SubData, }; +use crate::{active_messaging::*, config}; use async_recursion::async_recursion; // use log::trace; @@ -117,7 +117,7 @@ impl ActiveMessageEngine for RegisteredActiveMessages { self.executor.submit_io_task(async move { //spawn a task so that we can the execute the local am immediately // println!(" {} {} {}, {}, {}",req_data.team.lamellae.backend() != Backend::Local,req_data.team.num_pes() > 1, req_data.team.team_pe_id().is_err(),(req_data.team.num_pes() > 1 || req_data.team.team_pe_id().is_err()),req_data.team.lamellae.backend() != Backend::Local && (req_data.team.num_pes() > 1 || req_data.team.team_pe_id().is_err()) ); - if am_size < crate::active_messaging::BATCH_AM_SIZE && !immediate { + if am_size < config().batch_am_size && !immediate { ame.batcher .add_remote_am_to_batch( req_data_clone.clone(), @@ -155,7 +155,7 @@ impl ActiveMessageEngine for RegisteredActiveMessages { } else { let am_id = *(AMS_IDS.get(&am.get_id()).unwrap()); let am_size = am.serialized_size(); - if am_size < crate::active_messaging::BATCH_AM_SIZE && !immediate { + if am_size < config().batch_am_size && !immediate { self.batcher .add_remote_am_to_batch(req_data, am, am_id, am_size, stall_mark) .await; @@ -179,7 +179,7 @@ impl ActiveMessageEngine for RegisteredActiveMessages { // println!("Am::Return"); let am_id = *(AMS_IDS.get(&am.get_id()).unwrap()); let am_size = am.serialized_size(); - if am_size < crate::active_messaging::BATCH_AM_SIZE && !immediate { + if am_size < config().batch_am_size && !immediate { self.batcher .add_return_am_to_batch(req_data, am, am_id, am_size, stall_mark) .await; @@ -197,7 +197,7 @@ impl ActiveMessageEngine for RegisteredActiveMessages { Am::Data(req_data, data) => { // println!("Am::Data"); let data_size = data.serialized_size(); - if data_size < crate::active_messaging::BATCH_AM_SIZE && !immediate { + if data_size < config().batch_am_size && !immediate { self.batcher .add_data_am_to_batch(req_data, data, data_size, stall_mark) .await; @@ -207,7 +207,7 @@ impl ActiveMessageEngine for RegisteredActiveMessages { } } Am::Unit(req_data) => { - if *UNIT_HEADER_LEN < crate::active_messaging::BATCH_AM_SIZE && !immediate { + if *UNIT_HEADER_LEN < config().batch_am_size && !immediate { self.batcher .add_unit_am_to_batch(req_data, stall_mark) .await; diff --git a/src/array/global_lock_atomic/rdma.rs b/src/array/global_lock_atomic/rdma.rs index 924abe7b..d7621f89 100644 --- a/src/array/global_lock_atomic/rdma.rs +++ b/src/array/global_lock_atomic/rdma.rs @@ -5,6 +5,7 @@ use crate::array::{ LamellarArrayInternalPut, LamellarArrayPut, LamellarArrayRdmaInput, LamellarArrayRdmaOutput, LamellarEnv, LamellarRead, LamellarWrite, TeamTryInto, }; +use crate::config; use crate::memregion::{ AsBase, Dist, LamellarMemoryRegion, OneSidedMemoryRegion, RTMemoryRegionRDMA, RegisteredMemoryRegion, SubRegion, @@ -215,7 +216,7 @@ impl LamellarAm for InitPutAm { self.buf.len(), ) { let u8_buf_len = len * std::mem::size_of::(); - if u8_buf_len > 
crate::active_messaging::BATCH_AM_SIZE { + if u8_buf_len > config().batch_am_size { // println!("pe {:?} index: {:?} len {:?} buflen {:?} putting {:?}",pe,self.index,len, self.buf.len(),&u8_buf.as_slice().unwrap()[cur_index..(cur_index+u8_buf_len)]); let remote_am = GlobalLockRemotePutAm { array: self.array.clone().into(), //inner of the indices we need to place data into diff --git a/src/env_var.rs b/src/env_var.rs index 86eaf908..a67828e7 100644 --- a/src/env_var.rs +++ b/src/env_var.rs @@ -71,6 +71,18 @@ fn default_array_dynamic_index() -> IndexType { IndexType::Dynamic } +fn default_cmd_buf_len() -> usize { + 50000 +} + +fn default_cmd_buf_cnt() -> usize { + 2 +} + +fn default_batch_am_size() -> usize { + 100000 +} + #[derive(Deserialize, Debug)] pub struct Config { #[serde(default = "default_deadlock_timeout")] @@ -97,12 +109,21 @@ pub struct Config { pub alloc: Alloc, #[serde(default = "default_array_dynamic_index")] pub index_size: IndexType, + #[serde(default = "default_cmd_buf_len")] + pub cmd_buf_len: usize, + #[serde(default = "default_cmd_buf_cnt")] + pub cmd_buf_cnt: usize, + #[serde(default = "default_batch_am_size")] + pub batch_am_size: usize, } pub fn config() -> &'static Config { static CONFIG: OnceLock = OnceLock::new(); CONFIG.get_or_init(|| match envy::prefixed("LAMELLAR_").from_env::() { - Ok(config) => config, + Ok(config) => { + println!("[LAMELLAR CONFIG]{config:?}"); + config + } Err(error) => panic!("{}", error), }) } diff --git a/src/lamellae/command_queues.rs b/src/lamellae/command_queues.rs index 989f830d..f86640d9 100644 --- a/src/lamellae/command_queues.rs +++ b/src/lamellae/command_queues.rs @@ -15,7 +15,7 @@ use std::sync::Arc; //use tracing::*; -const CMD_BUF_LEN: usize = 50000; // this is the number of slots for each PE +// const CMD_BUF_LEN: usize = 50000; // this is the number of slots for each PE // const NUM_REQ_SLOTS: usize = CMD_Q_LEN; // max requests at any given time -- probably have this be a multiple of num PES const CMD_BUFS_PER_PE: usize = 2; @@ -236,14 +236,14 @@ impl CmdMsgBuffer { Box::from_raw(std::ptr::slice_from_raw_parts_mut( // (*addr + base_addr) as *mut CmdMsg, *addr as *mut CmdMsg, - CMD_BUF_LEN, + config().cmd_buf_len, )) }, addr: *addr, // base_addr: base_addr, index: 0, allocated_cnt: 0, - max_size: CMD_BUF_LEN, + max_size: config().cmd_buf_len, }); } CmdMsgBuffer { @@ -1256,10 +1256,10 @@ impl CommandQueue { let mut cmd_buffers_addrs = vec![]; for _pe in 0..num_pes { let mut addrs = vec![]; - for _i in 0..CMD_BUFS_PER_PE { + for _i in 0..config().cmd_buf_cnt { let addr = comm .rt_alloc( - CMD_BUF_LEN * std::mem::size_of::() + 1, + config().cmd_buf_len * std::mem::size_of::() + 1, std::mem::align_of::(), ) .unwrap(); //+ comm.base_addr(); @@ -1516,7 +1516,7 @@ impl CommandQueue { //#[tracing::instrument(skip_all)] pub fn mem_per_pe() -> usize { - (CMD_BUF_LEN * CMD_BUFS_PER_PE + 4) * std::mem::size_of::() + (config().cmd_buf_len * config().cmd_buf_cnt + 4) * std::mem::size_of::() } } From ea18cc48532be84a34c81a5c5fd72a92a4e17f23 Mon Sep 17 00:00:00 2001 From: "Ryan D. 
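The new Config fields are read through envy with the LAMELLAR_ prefix, so they can be tuned per run; a hedged sketch of overriding them (the values are illustrative, and the variables must be set before the runtime is initialized):

// from the launching shell:
//   LAMELLAR_BATCH_AM_SIZE=200000 LAMELLAR_CMD_BUF_LEN=100000 LAMELLAR_CMD_BUF_CNT=4 ./my_app
// or programmatically, before building the world:
std::env::set_var("LAMELLAR_BATCH_AM_SIZE", "200000"); // AMs smaller than this are batched
std::env::set_var("LAMELLAR_CMD_BUF_LEN", "100000");   // command-queue slots per PE
std::env::set_var("LAMELLAR_CMD_BUF_CNT", "4");        // command buffers per PE
let world = lamellar::LamellarWorldBuilder::new().build();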
Friese" Date: Mon, 22 Apr 2024 13:43:59 -0700 Subject: [PATCH 029/116] cleanup --- examples/array_examples/array_am.rs | 1 + examples/bandwidths/get_bw.rs | 10 +++--- src/array/handle.rs | 6 ++-- src/darc.rs | 16 ++++----- src/lamellar_request.rs | 2 -- src/lib.rs | 50 +++++++++++++++++++++++++---- src/scheduler/work_stealing3.rs | 2 +- 7 files changed, 61 insertions(+), 26 deletions(-) diff --git a/examples/array_examples/array_am.rs b/examples/array_examples/array_am.rs index 7367d140..145824d2 100644 --- a/examples/array_examples/array_am.rs +++ b/examples/array_examples/array_am.rs @@ -89,6 +89,7 @@ fn main() { println!("------------------------------------------------------------"); } world.barrier(); + println!("about to free mem region"); drop(local_mem_region); println!("freed mem region"); println!("[{:?}] Before {:?}", my_pe, unsafe { diff --git a/examples/bandwidths/get_bw.rs b/examples/bandwidths/get_bw.rs index 98f093e8..c14f2144 100644 --- a/examples/bandwidths/get_bw.rs +++ b/examples/bandwidths/get_bw.rs @@ -116,11 +116,11 @@ fn main() { } world.barrier(); - println!( - "cleanup: {:?}s {:?}us", - s.elapsed().as_secs_f64(), - s.elapsed().as_secs_f64() * 1_000_000 as f64 - ); + // println!( + // "cleanup: {:?}s {:?}us", + // s.elapsed().as_secs_f64(), + // s.elapsed().as_secs_f64() * 1_000_000 as f64 + // ); } if my_pe == 0 { println!( diff --git a/src/array/handle.rs b/src/array/handle.rs index 4b7f2d6b..1d912fc9 100644 --- a/src/array/handle.rs +++ b/src/array/handle.rs @@ -22,7 +22,6 @@ impl LamellarRequest for ArrayRdmaHandle { for req in self.reqs.drain(0..) { req.blocking_wait(); } - () } fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { let mut ready = true; @@ -61,7 +60,7 @@ impl LamellarRequest for ArrayRdmaAtHandle { fn blocking_wait(self) -> Self::Output { match self.req { Some(req) => req.blocking_wait(), - None => {}, //this means we did a blocking_get (With respect to RDMA) on either Unsafe or ReadOnlyArray so data is here + None => {} //this means we did a blocking_get (With respect to RDMA) on either Unsafe or ReadOnlyArray so data is here } unsafe { self.buf.as_slice().expect("Data should exist on PE")[0] } } @@ -86,9 +85,8 @@ impl Future for ArrayRdmaAtHandle { if !req.ready_or_set_waker(cx.waker()) { return Poll::Pending; } - } - None => {},//this means we did a blocking_get (With respect to RDMA) on either Unsafe or ReadOnlyArray so data is here + None => {} //this means we did a blocking_get (With respect to RDMA) on either Unsafe or ReadOnlyArray so data is here } Poll::Ready(unsafe { this.buf.as_slice().expect("Data should exist on PE")[0] }) } diff --git a/src/darc.rs b/src/darc.rs index 176b5d92..bedd32be 100644 --- a/src/darc.rs +++ b/src/darc.rs @@ -451,16 +451,16 @@ impl DarcInner { async_std::task::yield_now().await; } - println!( - "[{:?}] entering initial block_on barrier()", - std::thread::current().id() - ); + // println!( + // "[{:?}] entering initial block_on barrier()", + // std::thread::current().id() + // ); let barrier_fut = unsafe { inner.barrier.as_ref().unwrap().async_barrier() }; barrier_fut.await; - println!( - "[{:?}] leaving initial block_on barrier()", - std::thread::current().id() - ); + // println!( + // "[{:?}] leaving initial block_on barrier()", + // std::thread::current().id() + // ); while outstanding_refs { outstanding_refs = false; diff --git a/src/lamellar_request.rs b/src/lamellar_request.rs index 6f9e1bac..bb9cde70 100755 --- a/src/lamellar_request.rs +++ b/src/lamellar_request.rs @@ -62,8 +62,6 @@ pub(crate) 
enum InternalResult { // #[enum_dispatch] pub(crate) trait LamellarRequest: Future { fn blocking_wait(self) -> Self::Output; - // fn ready(&self) -> bool; - // fn set_waker(&mut self, waker: &Waker); fn ready_or_set_waker(&mut self, waker: &Waker) -> bool; fn val(&self) -> Self::Output; } diff --git a/src/lib.rs b/src/lib.rs index bd840db1..10c129d3 100755 --- a/src/lib.rs +++ b/src/lib.rs @@ -324,18 +324,36 @@ lazy_static! { pub(crate) static ref BINCODE: bincode::config::WithOtherTrailing = bincode::DefaultOptions::new().allow_trailing_bytes(); } +// use std::sync::atomic::AtomicUsize; +// use std::sync::atomic::Ordering::SeqCst; +// use std::sync::Arc; +// lazy_static! { +// pub(crate) static ref SERIALIZE_TIMER: thread_local::ThreadLocal> = +// thread_local::ThreadLocal::new(); +// pub(crate) static ref DESERIALIZE_TIMER: thread_local::ThreadLocal> = +// thread_local::ThreadLocal::new(); +// pub(crate) static ref SERIALIZE_SIZE_TIMER: thread_local::ThreadLocal> = +// thread_local::ThreadLocal::new(); +// } #[doc(hidden)] pub fn serialize(obj: &T, var: bool) -> Result, anyhow::Error> where T: serde::Serialize, { - if var { + // let start = std::time::Instant::now(); + let res = if var { // Ok(BINCODE.serialize(obj)?) Ok(bincode::serialize(obj)?) } else { Ok(bincode::serialize(obj)?) - } + }; + // unsafe { + // SERIALIZE_TIMER + // .get_or(|| Arc::new(AtomicUsize::new(0))) + // .fetch_add(start.elapsed().as_micros() as usize, SeqCst); + // } + res } #[doc(hidden)] @@ -343,24 +361,37 @@ pub fn serialized_size(obj: &T, var: bool) -> usize where T: serde::Serialize, { - if var { + // let start = std::time::Instant::now(); + let res = if var { // BINCODE.serialized_size(obj).unwrap() as usize bincode::serialized_size(obj).unwrap() as usize } else { bincode::serialized_size(obj).unwrap() as usize - } + }; + // unsafe { + // SERIALIZE_SIZE_TIMER + // .get_or(|| Arc::new(AtomicUsize::new(0))) + // .fetch_add(start.elapsed().as_micros() as usize, SeqCst); + // } + res } #[doc(hidden)] pub fn serialize_into(buf: &mut [u8], obj: &T, var: bool) -> Result<(), anyhow::Error> where T: serde::Serialize, { + // let start = std::time::Instant::now(); if var { // BINCODE.serialize_into(buf, obj)?; bincode::serialize_into(buf, obj)?; } else { bincode::serialize_into(buf, obj)?; } + // unsafe { + // SERIALIZE_TIMER + // .get_or(|| Arc::new(AtomicUsize::new(0))) + // .fetch_add(start.elapsed().as_micros() as usize, SeqCst); + // } Ok(()) } @@ -369,12 +400,19 @@ pub fn deserialize<'a, T>(bytes: &'a [u8], var: bool) -> Result, { - if var { + // let start = std::time::Instant::now(); + let res = if var { // Ok(BINCODE.deserialize(bytes)?) Ok(bincode::deserialize(bytes)?) } else { Ok(bincode::deserialize(bytes)?) - } + }; + // unsafe { + // DESERIALIZE_TIMER + // .get_or(|| Arc::new(AtomicUsize::new(0))) + // .fetch_add(start.elapsed().as_micros() as usize, SeqCst); + // } + res } #[doc(hidden)] pub use async_std; diff --git a/src/scheduler/work_stealing3.rs b/src/scheduler/work_stealing3.rs index 68e56a8f..636c9216 100644 --- a/src/scheduler/work_stealing3.rs +++ b/src/scheduler/work_stealing3.rs @@ -308,7 +308,7 @@ impl WorkStealing3 { ) -> WorkStealing3 { // println!("new work stealing queue"); let mut ws = WorkStealing3 { - max_num_threads: std::cmp::max(1,num_workers-1), + max_num_threads: std::cmp::max(1, num_workers - 1), threads: Vec::new(), imm_inj: Arc::new(Injector::new()), work_inj: Arc::new(Injector::new()), From f751805bcf915a39ed9ba6d038d12c9805fdb040 Mon Sep 17 00:00:00 2001 From: "Ryan D. 
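The serialization helpers above are thin wrappers around bincode; a hedged round-trip sketch (the payload type is illustrative, and both `var` settings currently select the same codec):

#[derive(serde::Serialize, serde::Deserialize, PartialEq, Debug)]
struct Payload { idx: usize, val: f64 }

let p = Payload { idx: 3, val: 1.5 };
let bytes = lamellar::serialize(&p, false).expect("serialize");
let back: Payload = lamellar::deserialize(&bytes, false).expect("deserialize");
assert_eq!(p, back);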
Friese" Date: Mon, 22 Apr 2024 13:44:41 -0700 Subject: [PATCH 030/116] add exec_am_immediately, chnage how immediate tasks are handled --- src/lamellar_team.rs | 135 ++++++++++++++++++--------------- src/scheduler.rs | 36 ++++++++- src/scheduler/work_stealing.rs | 5 +- 3 files changed, 111 insertions(+), 65 deletions(-) diff --git a/src/lamellar_team.rs b/src/lamellar_team.rs index 4195f134..2f53ce08 100644 --- a/src/lamellar_team.rs +++ b/src/lamellar_team.rs @@ -1816,70 +1816,72 @@ impl LamellarTeamRT { .into() } - //#[tracing::instrument(skip_all)] - // pub(crate) fn exec_arc_am_pe_immediate( - // self: &Pin>, - // pe: usize, - // am: LamellarArcAm, - // task_group_cnts: Option>, - // ) -> Box> - // where - // F: AmDist, - // { - // // println!("team exec arc am pe"); - // let tg_outstanding_reqs = match task_group_cnts { - // Some(task_group_cnts) => { - // task_group_cnts.add_send_req(1); - // Some(task_group_cnts.outstanding_reqs.clone()) - // } - // None => None, - // }; - // assert!(pe < self.arch.num_pes()); - // let req = Arc::new(LamellarRequestHandleInner { - // ready: AtomicBool::new(false), - // data: Cell::new(None), - // team_outstanding_reqs: self.team_counters.outstanding_reqs.clone(), - // world_outstanding_reqs: self.world_counters.outstanding_reqs.clone(), - // tg_outstanding_reqs: tg_outstanding_reqs.clone(), - // user_handle: AtomicU8::new(1), - // scheduler: self.scheduler.clone(), - // }); - // let req_result = Arc::new(LamellarRequestResult { req: req.clone() }); - // let req_ptr = Arc::into_raw(req_result); - // let id = ReqId { - // id: req_ptr as usize, - // sub_id: 0, - // }; - // self.world_counters.add_send_req(1); - // self.team_counters.add_send_req(1); - // // println!("cnts: t: {} w: {} tg: {:?}",self.team_counters.outstanding_reqs.load(Ordering::Relaxed),self.world_counters.outstanding_reqs.load(Ordering::Relaxed), tg_outstanding_reqs.as_ref().map(|x| x.load(Ordering::Relaxed))); + pub(crate) async fn exec_arc_am_pe_immediately( + self: &Pin>, + pe: usize, + am: LamellarArcAm, + task_group_cnts: Option>, + ) -> AmHandle + where + F: AmDist, + { + // println!("team exec arc am pe"); + let tg_outstanding_reqs = match task_group_cnts { + Some(task_group_cnts) => { + task_group_cnts.add_send_req(1); + Some(task_group_cnts.outstanding_reqs.clone()) + } + None => None, + }; + assert!(pe < self.arch.num_pes()); + let req = Arc::new(AmHandleInner { + ready: AtomicBool::new(false), + data: Cell::new(None), + waker: Mutex::new(None), + team_outstanding_reqs: self.team_counters.outstanding_reqs.clone(), + world_outstanding_reqs: self.world_counters.outstanding_reqs.clone(), + tg_outstanding_reqs: tg_outstanding_reqs.clone(), + user_handle: AtomicU8::new(1), + scheduler: self.scheduler.clone(), + }); + let req_result = Arc::new(LamellarRequestResult::Am(req.clone())); + let req_ptr = Arc::into_raw(req_result); + let id = ReqId { + id: req_ptr as usize, + sub_id: 0, + }; + self.world_counters.add_send_req(1); + self.team_counters.add_send_req(1); + // println!("cnts: t: {} w: {} tg: {:?}",self.team_counters.outstanding_reqs.load(Ordering::Relaxed),self.world_counters.outstanding_reqs.load(Ordering::Relaxed), tg_outstanding_reqs.as_ref().map(|x| x.load(Ordering::Relaxed))); - // let world = if let Some(world) = &self.world { - // world.clone() - // } else { - // self.clone() - // }; - // let req_data = ReqMetaData { - // src: self.world_pe, - // dst: Some(self.arch.world_pe(pe).expect("pe not member of team")), - // id: id, - // lamellae: self.lamellae.clone(), - // 
world: world, - // team: self.clone(), - // team_addr: self.remote_ptr_addr, - // }; + let world = if let Some(world) = &self.world { + world.clone() + } else { + self.clone() + }; + let req_data = ReqMetaData { + src: self.world_pe, + dst: Some(self.arch.world_pe(pe).expect("pe not member of team")), + id: id, + lamellae: self.lamellae.clone(), + world: world, + team: self.clone(), + team_addr: self.remote_ptr_addr, + }; - // // println!( - // // "[{:?}] team arc exec am pe immediate", - // // std::thread::current().id() - // // ); - // self.scheduler.submit_am_immediate(Am::Remote(req_data, am)); + // println!("[{:?}] team arc exec am pe", std::thread::current().id()); + self.scheduler.exec_am(Am::Remote(req_data, am)).await; - // Box::new(LamellarRequestHandle { - // inner: req, - // _phantom: PhantomData, - // }) - // } + // Box::new(LamellarRequestHandle { + // inner: req, + // _phantom: PhantomData, + // }) + AmHandle { + inner: req, + _phantom: PhantomData, + } + .into() + } //#[tracing::instrument(skip_all)] pub fn exec_am_local(self: &Pin>, am: F) -> LocalAmHandle @@ -2024,6 +2026,17 @@ impl Drop for LamellarTeamRT { // println!("removing {:?} ", self.team_hash); self.lamellae.free(self.remote_ptr_addr); // println!("LamellarTeamRT dropped {:?}", self.team_hash); + // unsafe { + // for duration in crate::SERIALIZE_TIMER.iter() { + // println!("Serialize: {:?}", duration.load(Ordering::SeqCst)); + // } + // for duration in crate::SERIALIZE_SIZE_TIMER.iter() { + // println!("Serialize Time: {:?}", duration.load(Ordering::SeqCst)); + // } + // for duration in crate::DESERIALIZE_TIMER.iter() { + // println!("Deserialize: {:?}", duration.load(Ordering::SeqCst)); + // } + // } } } diff --git a/src/scheduler.rs b/src/scheduler.rs index 299e2797..d485d156 100755 --- a/src/scheduler.rs +++ b/src/scheduler.rs @@ -10,7 +10,6 @@ use enum_dispatch::enum_dispatch; use futures_util::Future; use std::sync::atomic::{AtomicU8, AtomicUsize, Ordering}; use std::sync::Arc; -use std::thread; pub(crate) mod work_stealing; use work_stealing::WorkStealing; @@ -225,6 +224,39 @@ impl Scheduler { self.executor.submit_immediate_task(am_future); } + #[allow(dead_code)] + pub(crate) async fn exec_am(&self, am: Am) { + let num_ams = self.num_ams.clone(); + let max_ams = self.max_ams.clone(); + let am_stall_mark = self.am_stall_mark.fetch_add(1, Ordering::Relaxed); + let ame = self.active_message_engine.clone(); + // let am_future = async move { + // let start_tid = thread::current().id(); + num_ams.fetch_add(1, Ordering::Relaxed); + let _am_id = max_ams.fetch_add(1, Ordering::Relaxed); + // println!("[{:?}] submit work exec req {:?} {:?} TaskId: {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task); + // println!( + // "[{:?}] submit_am_immediate {:?}", + // std::thread::current().id(), + // am_id + // ); + ame.process_msg(am, am_stall_mark, false).await; + num_ams.fetch_sub(1, Ordering::Relaxed); + // if thread::current().id() != start_tid { + // AM_DIFF_THREAD.fetch_add(1, Ordering::Relaxed); + // } else { + // AM_SAME_THREAD.fetch_add(1, Ordering::Relaxed); + // } + // println!( + // "[{:?}] submit_am_immediate done {:?}", + // std::thread::current().id(), + // am_id + // ); + // println!("[{:?}] submit work done req {:?} {:?} TaskId: {:?} {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task,reqs); + // }; + // self.executor.submit_immediate_task(am_future); + } + pub(crate) fn 
submit_remote_am(&self, data: SerializedData, lamellae: Arc) { let num_ams = self.num_ams.clone(); let max_ams = self.max_ams.clone(); @@ -306,7 +338,7 @@ impl Scheduler { // println!( // "[{:?}] execing new task immediate {:?}", // std::thread::current().id(), - // task_id + // _task_id // ); task.await; num_tasks.fetch_sub(1, Ordering::Relaxed); diff --git a/src/scheduler/work_stealing.rs b/src/scheduler/work_stealing.rs index 7bc1216a..fdf674c7 100644 --- a/src/scheduler/work_stealing.rs +++ b/src/scheduler/work_stealing.rs @@ -176,7 +176,8 @@ impl LamellarExecutor for WorkStealing { .metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)) .spawn(move |_task_id| async move { task.await }, schedule); - runnable.run(); //try to run immediately + runnable.schedule(); + // runnable.run(); //try to run immediately task.detach(); // }); } @@ -283,7 +284,7 @@ impl WorkStealing { ) -> WorkStealing { // println!("new work stealing queue"); let mut ws = WorkStealing { - max_num_threads: std::cmp::max(1,num_workers-1),// the main thread does work during blocking_ons and wait_alls + max_num_threads: std::cmp::max(1, num_workers - 1), // the main thread does work during blocking_ons and wait_alls threads: Vec::new(), imm_inj: Arc::new(crossbeam::deque::Injector::new()), work_inj: Arc::new(crossbeam::deque::Injector::new()), From 2a4435d72f4f0bb182e466a5d0e035c4c81e286b Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Mon, 22 Apr 2024 13:45:39 -0700 Subject: [PATCH 031/116] updates to how block arrays are handled --- src/array/unsafe.rs | 233 +++++++++---- src/array/unsafe/operations.rs | 599 ++++++++++++++++++--------------- src/array/unsafe/rdma.rs | 51 ++- 3 files changed, 521 insertions(+), 362 deletions(-) diff --git a/src/array/unsafe.rs b/src/array/unsafe.rs index 3f04148d..f9753806 100644 --- a/src/array/unsafe.rs +++ b/src/array/unsafe.rs @@ -92,20 +92,24 @@ impl UnsafeByteArrayWeak { pub(crate) struct UnsafeArrayInner { pub(crate) data: Darc, pub(crate) distribution: Distribution, - orig_elem_per_pe: f64, + orig_elem_per_pe: usize, + orig_remaining_elems: usize, elem_size: usize, //for bytes array will be size of T, for T array will be 1 offset: usize, //relative to size of T pub(crate) size: usize, //relative to size of T + sub: bool, } #[lamellar_impl::AmLocalDataRT(Clone, Debug)] pub(crate) struct UnsafeArrayInnerWeak { pub(crate) data: WeakDarc, pub(crate) distribution: Distribution, - orig_elem_per_pe: f64, + orig_elem_per_pe: usize, + orig_remaining_elems: usize, elem_size: usize, //for bytes array will be size of T, for T array will be 1 offset: usize, //relative to size of T size: usize, //relative to size of T + sub: bool, } // impl Drop for UnsafeArrayInner { @@ -147,9 +151,13 @@ impl UnsafeArray { let num_pes = team.num_pes(); let full_array_size = std::cmp::max(array_size, num_pes); - let elem_per_pe = full_array_size as f64 / num_pes as f64; - let per_pe_size = (full_array_size as f64 / num_pes as f64).ceil() as usize; //we do ceil to ensure enough space an each pe - // println!("new unsafe array {:?} {:?} {:?}", elem_per_pe, num_elems_local, per_pe_size); + let elem_per_pe = full_array_size / num_pes; + let remaining_elems = full_array_size % num_pes; + let mut per_pe_size = elem_per_pe; + if remaining_elems > 0 { + per_pe_size += 1 + } + // println!("new unsafe array {:?} {:?} {:?}", elem_per_pe, num_elems_local, per_pe_size); let rmr = MemoryRegion::new( per_pe_size * std::mem::size_of::(), team.lamellae.clone(), @@ -184,9 +192,11 @@ impl UnsafeArray { distribution: 
distribution.clone(), // wait: wait, orig_elem_per_pe: elem_per_pe, + orig_remaining_elems: remaining_elems, elem_size: std::mem::size_of::(), offset: 0, //relative to size of T size: full_array_size, //relative to size of T + sub: false, }, phantom: PhantomData, }; @@ -219,9 +229,12 @@ impl UnsafeArray { let num_pes = team.num_pes(); let full_array_size = std::cmp::max(array_size, num_pes); - let elem_per_pe = full_array_size as f64 / num_pes as f64; - let per_pe_size = (full_array_size as f64 / num_pes as f64).ceil() as usize; //we do ceil to ensure enough space an each pe - // println!("new unsafe array {:?} {:?} {:?}", elem_per_pe, num_elems_local, per_pe_size); + let elem_per_pe = full_array_size / num_pes; + let remaining_elems = full_array_size % num_pes; + let mut per_pe_size = elem_per_pe; + if remaining_elems > 0 { + per_pe_size += 1 + } let rmr = MemoryRegion::new( per_pe_size * std::mem::size_of::(), team.lamellae.clone(), @@ -254,9 +267,11 @@ impl UnsafeArray { distribution: distribution.clone(), // wait: wait, orig_elem_per_pe: elem_per_pe, + orig_remaining_elems: remaining_elems, elem_size: std::mem::size_of::(), offset: 0, //relative to size of T size: full_array_size, //relative to size of T + sub: false, }, phantom: PhantomData, }; @@ -896,7 +911,11 @@ impl private::LamellarArrayPrivate for UnsafeArray { self.inner.pe_for_dist_index(index) } fn pe_offset_for_dist_index(&self, pe: usize, index: usize) -> Option { - self.inner.pe_offset_for_dist_index(pe, index) + if self.inner.sub { + self.inner.pe_sub_offset_for_dist_index(pe, index) + } else { + self.inner.pe_full_offset_for_dist_index(pe, index) + } } unsafe fn into_inner(self) -> UnsafeArray { @@ -978,9 +997,13 @@ impl LamellarArray for UnsafeArray { //#[tracing::instrument(skip_all)] fn pe_and_offset_for_global_index(&self, index: usize) -> Option<(usize, usize)> { - let pe = self.inner.pe_for_dist_index(index)?; - let offset = self.inner.pe_offset_for_dist_index(pe, index)?; - Some((pe, offset)) + if self.inner.sub { + let pe = self.inner.pe_for_dist_index(index)?; + let offset = self.inner.pe_sub_offset_for_dist_index(pe, index)?; + Some((pe, offset)) + } else { + self.inner.full_pe_and_offset_for_global_index(index) + } } fn first_global_index_for_pe(&self, pe: usize) -> Option { @@ -1040,6 +1063,7 @@ impl SubArray for UnsafeArray { let mut inner = self.inner.clone(); inner.offset += start; inner.size = end - start; + inner.sub = true; UnsafeArray { inner: inner, phantom: PhantomData, @@ -1304,9 +1328,11 @@ impl UnsafeArrayInnerWeak { data: data, distribution: self.distribution.clone(), orig_elem_per_pe: self.orig_elem_per_pe, + orig_remaining_elems: self.orig_remaining_elems, elem_size: self.elem_size, offset: self.offset, size: self.size, + sub: self.sub, }) } else { None @@ -1320,9 +1346,51 @@ impl UnsafeArrayInner { data: Darc::downgrade(&array.data), distribution: array.distribution.clone(), orig_elem_per_pe: array.orig_elem_per_pe, + orig_remaining_elems: array.orig_remaining_elems, elem_size: array.elem_size, offset: array.offset, size: array.size, + sub: array.sub, + } + } + + pub(crate) fn full_pe_and_offset_for_global_index( + &self, + index: usize, + ) -> Option<(usize, usize)> { + if self.size > index { + let mut global_index = index; + match self.distribution { + Distribution::Block => { + let rem_index = self.orig_remaining_elems * (self.orig_elem_per_pe + 1); + let mut elem_per_pe = self.orig_elem_per_pe; + if rem_index < self.size { + elem_per_pe += 1; + } else { + global_index = global_index - 
rem_index; + } + let (pe, offset) = if global_index < rem_index { + (global_index / elem_per_pe, global_index % elem_per_pe) + } else { + ( + rem_index / elem_per_pe + + (global_index - rem_index) / self.orig_elem_per_pe, + global_index % self.orig_elem_per_pe, + ) + }; + + Some((pe, offset)) + } + Distribution::Cyclic => { + let res = Some(( + global_index % self.data.num_pes, + global_index / self.data.num_pes, + )); + res + } + } + } else { + None } } @@ -1330,15 +1398,21 @@ impl UnsafeArrayInner { // //#[tracing::instrument(skip_all)] pub(crate) fn pe_for_dist_index(&self, index: usize) -> Option { if self.size > index { - let global_index = index + self.offset; + let mut global_index = index + self.offset; match self.distribution { Distribution::Block => { - let mut pe = ((global_index) as f64 / self.orig_elem_per_pe).floor() as usize; - let end_index = (self.orig_elem_per_pe * (pe + 1) as f64).round() as usize; - // println!("pe {:?} size: {:?} index {:?} end_index {:?} global_index {:?}",pe,self.size,index,end_index,global_index); - if global_index >= end_index { - pe += 1; + let rem_index = self.orig_remaining_elems * (self.orig_elem_per_pe + 1); + let mut elem_per_pe = self.orig_elem_per_pe; + if rem_index < self.size { + elem_per_pe += 1; + } else { + global_index = global_index - rem_index; } + let pe = if global_index < rem_index { + global_index / elem_per_pe + } else { + rem_index / elem_per_pe + (global_index - rem_index) / self.orig_elem_per_pe + }; Some(pe) } Distribution::Cyclic => Some(global_index % self.data.num_pes), @@ -1350,19 +1424,24 @@ impl UnsafeArrayInner { //index relative to subarray, return offset relative to subarray // //#[tracing::instrument(skip_all)] - pub fn pe_offset_for_dist_index(&self, pe: usize, index: usize) -> Option { - let global_index = self.offset + index; - let num_elems_local = self.num_elems_pe(pe); + pub fn pe_full_offset_for_dist_index(&self, pe: usize, index: usize) -> Option { + let mut global_index = self.offset + index; + match self.distribution { Distribution::Block => { - // println!("{:?} {:?} {:?}",pe,index,num_elems_local); - let pe_start_index = self.start_index_for_pe(pe)?; - let pe_end_index = pe_start_index + num_elems_local; - if pe_start_index <= index && index < pe_end_index { - Some(index - pe_start_index) + let rem_index = self.orig_remaining_elems * (self.orig_elem_per_pe + 1); + let mut elem_per_pe = self.orig_elem_per_pe; + if rem_index < self.size { + elem_per_pe += 1; } else { - None + global_index = global_index - rem_index; } + let offset = if global_index < rem_index { + global_index % elem_per_pe + } else { + global_index % self.orig_elem_per_pe + }; + Some(offset) } Distribution::Cyclic => { let num_pes = self.data.num_pes; @@ -1375,46 +1454,40 @@ impl UnsafeArrayInner { } } - //index relative to subarray, return local offset relative to full array - // pub fn pe_full_offset_for_dist_index(&self, pe: usize, index: usize) -> Option { - // let global_index = self.offset + index; - // println!("{:?} {:?} {:?}",global_index, self.offset, index); - // match self.distribution { - // Distribution::Block => { - // let pe_start_index = (self.orig_elem_per_pe * pe as f64).round() as usize; - // let pe_end_index = (self.orig_elem_per_pe * (pe+1) as f64).round() as usize; - // println!("{:?} {:?}",pe_start_index,pe_end_index); - // if pe_start_index <= global_index && global_index < pe_end_index{ - // Some(global_index - pe_start_index) - // } - // else{ - // None - // } - // } - // Distribution::Cyclic => { - // let 
num_pes = self.data.num_pes; - // if global_index% num_pes == pe{ - // Some(global_index/num_pes) - // } - // else{ - // None - // } - // } - // } - // } + //index relative to subarray, return offset relative to subarray + pub fn pe_sub_offset_for_dist_index(&self, pe: usize, index: usize) -> Option { + let offset = self.pe_full_offset_for_dist_index(pe, index)?; + match self.distribution { + Distribution::Block => { + if self.offset <= offset { + Some(offset - self.offset) + } else { + None + } + } + Distribution::Cyclic => { + let num_pes = self.data.num_pes; + if (index + self.offset) % num_pes == pe { + Some(index / num_pes) + } else { + None + } + } + } + } //index is local with respect to subarray //returns local offset relative to full array // //#[tracing::instrument(skip_all)] pub fn pe_full_offset_for_local_index(&self, pe: usize, index: usize) -> Option { - // let global_index = self.offset + index; let global_index = self.global_index_from_local(index)?; - // println!("{:?} {:?} {:?}",global_index, self.offset, index); match self.distribution { Distribution::Block => { - let pe_start_index = (self.orig_elem_per_pe * pe as f64).round() as usize; - let pe_end_index = (self.orig_elem_per_pe * (pe + 1) as f64).round() as usize; - // println!("{:?} {:?}",pe_start_index,pe_end_index); + let pe_start_index = self.global_start_index_for_pe(pe); + let mut pe_end_index = pe_start_index + self.orig_elem_per_pe; + if pe < self.orig_remaining_elems { + pe_end_index += 1; + } if pe_start_index <= global_index && global_index < pe_end_index { Some(global_index - pe_start_index) } else { @@ -1439,7 +1512,7 @@ impl UnsafeArrayInner { let my_pe = self.data.my_pe; match self.distribution { Distribution::Block => { - let global_start = (self.orig_elem_per_pe * my_pe as f64).round() as usize; + let global_start = self.global_start_index_for_pe(my_pe); let start = global_start as isize - self.offset as isize; if start >= 0 { //the (sub)array starts before my pe @@ -1452,7 +1525,10 @@ impl UnsafeArrayInner { } } else { //inner starts on or after my pe - let global_end = (self.orig_elem_per_pe * (my_pe + 1) as f64).round() as usize; + let mut global_end = global_start + self.orig_elem_per_pe; + if my_pe < self.orig_remaining_elems { + global_end += 1; + } if self.offset < global_end { //the (sub)array starts on my pe Some(self.offset + index) @@ -1539,12 +1615,22 @@ impl UnsafeArrayInner { } } + // return index relative to the full array + pub(crate) fn global_start_index_for_pe(&self, pe: usize) -> usize { + match self.distribution { + Distribution::Block => { + let mut global_start = self.orig_elem_per_pe * pe; + global_start + std::cmp::min(pe, self.orig_remaining_elems) + } + Distribution::Cyclic => pe, + } + } //return index relative to the subarray // //#[tracing::instrument(skip_all)] pub(crate) fn start_index_for_pe(&self, pe: usize) -> Option { match self.distribution { Distribution::Block => { - let global_start = (self.orig_elem_per_pe * pe as f64).round() as usize; + let global_start = self.global_start_index_for_pe(pe); let start = global_start as isize - self.offset as isize; if start >= 0 { //the (sub)array starts before my pe @@ -1556,7 +1642,10 @@ impl UnsafeArrayInner { None } } else { - let global_end = (self.orig_elem_per_pe * (pe + 1) as f64).round() as usize; + let mut global_end = global_start + self.orig_elem_per_pe; + if pe < self.orig_remaining_elems { + global_end += 1; + } if self.offset < global_end { //the (sub)array starts on my pe Some(0) @@ -1586,20 +1675,25 @@ impl 
UnsafeArrayInner { } } + pub(crate) fn global_end_index_for_pe(&self, pe: usize) -> usize { + self.global_start_index_for_pe(pe) + self.num_elems_pe(pe) + } + //return index relative to the subarray // //#[tracing::instrument(skip_all)] pub(crate) fn end_index_for_pe(&self, pe: usize) -> Option { - self.start_index_for_pe(pe)?; + let start_i = self.start_index_for_pe(pe)?; match self.distribution { Distribution::Block => { + //(sub)array ends on our pe if pe == self.pe_for_dist_index(self.size - 1)? { Some(self.size - 1) } else { + // (sub)array ends on another pe Some(self.start_index_for_pe(pe + 1)? - 1) } } Distribution::Cyclic => { - let start_i = self.start_index_for_pe(pe)?; let num_elems = self.num_elems_pe(pe); let num_pes = self.data.num_pes; let end_i = start_i + (num_elems - 1) * num_pes; @@ -1684,13 +1778,16 @@ impl UnsafeArrayInner { // println!("spe {:?} epe {:?}",start_pe,end_pe); let start_index = if my_pe == start_pe { //inner starts on my pe - let global_start = (self.orig_elem_per_pe * my_pe as f64).round() as usize; + let global_start = self.global_start_index_for_pe(my_pe); self.offset - global_start } else { 0 }; let end_index = start_index + num_elems_local; - // println!("nel {:?} sao {:?} as slice si: {:?} ei {:?} elemsize {:?}",num_elems_local,self.offset,start_index,end_index,self.elem_size); + // println!( + // "nel {:?} sao {:?} as slice si: {:?} ei {:?} elemsize {:?}", + // num_elems_local, self.offset, start_index, end_index, self.elem_size + // ); &mut slice[start_index * self.elem_size..end_index * self.elem_size] } Distribution::Cyclic => { @@ -1727,7 +1824,7 @@ impl UnsafeArrayInner { // println!("spe {:?} epe {:?}",start_pe,end_pe); let start_index = if my_pe == start_pe { //inner starts on my pe - let global_start = (self.orig_elem_per_pe * my_pe as f64).round() as usize; + let global_start = self.global_start_index_for_pe(my_pe); self.offset - global_start } else { 0 diff --git a/src/array/unsafe/operations.rs b/src/array/unsafe/operations.rs index b7ecd855..df0a319b 100644 --- a/src/array/unsafe/operations.rs +++ b/src/array/unsafe/operations.rs @@ -1,13 +1,13 @@ use crate::active_messaging::LamellarArcAm; +use crate::array::operations::handle::*; use crate::array::operations::*; use crate::array::r#unsafe::UnsafeArray; use crate::array::{AmDist, Dist, LamellarArray, LamellarByteArray, LamellarEnv}; use crate::env_var::{config, IndexType}; -use futures_util::Future; +use crate::AmHandle; use parking_lot::Mutex; use std::any::TypeId; -use std::collections::HashMap; -use std::pin::Pin; +use std::collections::{HashMap, VecDeque}; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use std::sync::Arc; @@ -103,6 +103,141 @@ impl IndexSize { }, } } + + fn create_buf(&self, num_elems: usize) -> IndexBuf { + let num_bytes = num_elems * self.len(); + match self { + IndexSize::U8 => { + let mut vec = Vec::with_capacity(num_bytes); + unsafe { + vec.set_len(num_bytes); + } + IndexBuf::U8(0, vec) + } + IndexSize::U16 => { + let mut vec = Vec::with_capacity(num_bytes); + unsafe { + vec.set_len(num_bytes); + } + IndexBuf::U16(0, vec) + } + IndexSize::U32 => { + let mut vec = Vec::with_capacity(num_bytes); + unsafe { + vec.set_len(num_bytes); + } + IndexBuf::U32(0, vec) + } + IndexSize::U64 => { + let mut vec = Vec::with_capacity(num_bytes); + unsafe { + vec.set_len(num_bytes); + } + IndexBuf::U64(0, vec) + } + IndexSize::Usize => { + let mut vec = Vec::with_capacity(num_bytes); + unsafe { + vec.set_len(num_bytes); + } + IndexBuf::Usize(0, vec) + } + } + } +} 
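
The unsafe.rs hunks above replace the old f64 rounding with integer block-with-remainder arithmetic: elem_per_pe = size / num_pes, and the first (size % num_pes) PEs each hold one extra element. A minimal standalone sketch of that layout follows; the helper names are illustrative and are not part of the crate's API.

// Map a global index to (pe, local offset) under a block-with-remainder layout.
fn pe_and_offset(index: usize, n: usize, p: usize) -> (usize, usize) {
    assert!(index < n && p > 0);
    let elem_per_pe = n / p;
    let rem = n % p;
    // first global index owned by a PE without an extra element
    let split = rem * (elem_per_pe + 1);
    if index < split {
        (index / (elem_per_pe + 1), index % (elem_per_pe + 1))
    } else {
        let idx = index - split;
        (rem + idx / elem_per_pe, idx % elem_per_pe)
    }
}

// First global index owned by a PE, mirroring the global_start_index_for_pe logic above.
fn start_index_for_pe(pe: usize, n: usize, p: usize) -> usize {
    (n / p) * pe + pe.min(n % p)
}

fn main() {
    // 10 elements over 4 PEs gives per-PE sizes [3, 3, 2, 2]
    assert_eq!(pe_and_offset(5, 10, 4), (1, 2));
    assert_eq!(pe_and_offset(6, 10, 4), (2, 0));
    assert_eq!(start_index_for_pe(2, 10, 4), 6);
}
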
+ +#[derive(Debug, Clone)] +enum IndexBuf { + U8(usize, Vec), + U16(usize, Vec), + U32(usize, Vec), + U64(usize, Vec), + Usize(usize, Vec), +} + +impl IndexBuf { + fn push(&mut self, val: usize) { + match self { + IndexBuf::U8(i, vec) => { + let vec_ptr = vec.as_mut_ptr() as *mut u8; + unsafe { + std::ptr::write(vec_ptr.offset(*i as isize), val as u8); + } + *i += 1; + } + IndexBuf::U16(i, vec) => { + let vec_ptr = vec.as_mut_ptr() as *mut u8 as *mut u16; + unsafe { + std::ptr::write(vec_ptr.offset(*i as isize), val as u16); + } + *i += 1; + } + IndexBuf::U32(i, vec) => { + let vec_ptr = vec.as_mut_ptr() as *mut u8 as *mut u32; + unsafe { + std::ptr::write(vec_ptr.offset(*i as isize), val as u32); + } + *i += 1; + } + IndexBuf::U64(i, vec) => { + let vec_ptr = vec.as_mut_ptr() as *mut u8 as *mut u64; + unsafe { + std::ptr::write(vec_ptr.offset(*i as isize), val as u64); + } + *i += 1; + } + IndexBuf::Usize(i, vec) => { + let vec_ptr = vec.as_mut_ptr() as *mut u8 as *mut usize; + unsafe { + std::ptr::write(vec_ptr.offset(*i as isize), val as usize); + } + *i += 1; + } + } + } + fn len(&self) -> usize { + match self { + IndexBuf::U8(i, _) => *i, + IndexBuf::U16(i, _) => *i, + IndexBuf::U32(i, _) => *i, + IndexBuf::U64(i, _) => *i, + IndexBuf::Usize(i, _) => *i, + } + } + fn to_vec(self) -> Vec { + match self { + IndexBuf::U8(i, mut vec) => { + unsafe { + vec.set_len(i); + } + vec + } + IndexBuf::U16(i, mut vec) => { + unsafe { + vec.set_len(i * std::mem::size_of::()); + } + vec + } + IndexBuf::U32(i, mut vec) => { + unsafe { + vec.set_len(i * std::mem::size_of::()); + } + vec + } + IndexBuf::U64(i, mut vec) => { + unsafe { + vec.set_len(i * std::mem::size_of::()); + } + vec + } + IndexBuf::Usize(i, mut vec) => { + unsafe { + vec.set_len(i * std::mem::size_of::()); + } + vec + } + } + } } type IdGen = fn(BatchReturnType) -> (TypeId, TypeId, BatchReturnType); @@ -157,7 +292,7 @@ impl UnsafeArray { index: impl OpInput<'a, usize>, op: ArrayOpCmd, byte_array: LamellarByteArray, - ) -> Pin + Send>> { + ) -> ArrayBatchOpHandle { let (indices, i_len) = index.as_op_input(); let (vals, v_len) = val.as_op_input(); @@ -166,60 +301,54 @@ impl UnsafeArray { .max() .unwrap(); let index_size = IndexSize::from(max_local_size); - println!("index_size: {:?}", index_size); - let data_copied = Arc::new(AtomicBool::new(false)); - let res: Pin)>> + Send>> = - if v_len == 1 && i_len == 1 { - //one to one - self.single_val_single_index::<()>( - byte_array, - vals[0].first(), - indices[0].first(), - op, - BatchReturnType::None, - data_copied.clone(), - ) - } else if v_len > 1 && i_len == 1 { - //many vals one index - self.multi_val_one_index::<()>( - byte_array, - vals, - indices[0].first(), - op, - BatchReturnType::None, - index_size, - data_copied.clone(), - ) - } else if v_len == 1 && i_len > 1 { - //one val many indices - self.one_val_multi_indices::<()>( - byte_array, - vals[0].first(), - indices, - op, - BatchReturnType::None, - index_size, - data_copied.clone(), - ) - } else if v_len > 1 && i_len > 1 { - //many vals many indices - self.multi_val_multi_index::<()>( - byte_array, - vals, - indices, - op, - BatchReturnType::None, - index_size, - data_copied.clone(), - ) - } else { - //no vals no indices - Box::pin(async { Vec::new() }) - }; - Box::pin(async { - res.await; - () - }) + let res = if v_len == 1 && i_len == 1 { + //one to one + self.single_val_single_index::<()>( + byte_array.clone(), + vals[0].first(), + indices[0].first(), + op, + BatchReturnType::None, + ) + } else if v_len > 1 && i_len == 1 { + 
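
The batched entry points above pick an index encoding width from the largest local slice size so that batched payloads stay compact. A rough sketch of that selection, assuming the same u8/u16/u32/u64 tiers as IndexSize (the crate's exact boundary handling may differ):

// Smallest number of bytes able to represent every local index.
fn bytes_per_index(max_local_index: usize) -> usize {
    if max_local_index <= u8::MAX as usize {
        1
    } else if max_local_index <= u16::MAX as usize {
        2
    } else if max_local_index <= u32::MAX as usize {
        4
    } else {
        8
    }
}

fn main() {
    assert_eq!(bytes_per_index(200), 1);
    assert_eq!(bytes_per_index(70_000), 4);
}
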
//many vals one index + self.multi_val_one_index::<()>( + byte_array.clone(), + vals, + indices[0].first(), + op, + BatchReturnType::None, + index_size, + ) + } else if v_len == 1 && i_len > 1 { + //one val many indices + self.one_val_multi_indices::<()>( + byte_array.clone(), + vals[0].first(), + indices, + op, + BatchReturnType::None, + index_size, + ) + .into() + } else if v_len > 1 && i_len > 1 { + //many vals many indices + self.multi_val_multi_index::<()>( + byte_array.clone(), + vals, + indices, + op, + BatchReturnType::None, + index_size, + ) + } else { + //no vals no indices + VecDeque::new() + }; + ArrayBatchOpHandle { + _array: byte_array, + reqs: res, + } } //#[tracing::instrument(skip_all)] @@ -229,7 +358,7 @@ impl UnsafeArray { index: impl OpInput<'a, usize>, op: ArrayOpCmd, byte_array: LamellarByteArray, - ) -> Pin> + Send>> { + ) -> ArrayFetchBatchOpHandle { // println!("here in batch fetch op 2"); let (indices, i_len) = index.as_op_input(); let (vals, v_len) = val.as_op_input(); @@ -238,71 +367,49 @@ impl UnsafeArray { .max() .unwrap(); let index_size = IndexSize::from(max_local_size); - let data_copied = Arc::new(AtomicBool::new(false)); - // println!("i_len {:?} v_len {:?}", i_len, v_len); - let res: Pin, Vec)>> + Send>> = - if v_len == 1 && i_len == 1 { - //one to one - self.single_val_single_index::>( - byte_array, - vals[0].first(), - indices[0].first(), - op, - BatchReturnType::Vals, - data_copied.clone(), - ) - } else if v_len > 1 && i_len == 1 { - //many vals one index - self.multi_val_one_index::>( - byte_array, - vals, - indices[0].first(), - op, - BatchReturnType::Vals, - index_size, - data_copied.clone(), - ) - } else if v_len == 1 && i_len > 1 { - //one val many indices - self.one_val_multi_indices::>( - byte_array, - vals[0].first(), - indices, - op, - BatchReturnType::Vals, - index_size, - data_copied.clone(), - ) - } else if v_len > 1 && i_len > 1 { - //many vals many indices - self.multi_val_multi_index::>( - byte_array, - vals, - indices, - op, - BatchReturnType::Vals, - index_size, - data_copied.clone(), - ) - } else { - //no vals no indices - panic!("should not be here"); - // Box::pin(async { Vec::new() }) - }; - Box::pin(async move { - let mut results = Vec::with_capacity(std::cmp::max(i_len, v_len)); - unsafe { - results.set_len(std::cmp::max(i_len, v_len)); - } - for (mut vals, mut idxs) in res.await.into_iter() { - // println!("vals {:?} idx {:?}", vals.len(), idxs); - for (v, i) in vals.drain(..).zip(idxs.drain(..)) { - results[i] = v; - } - } - results - // res.await.into_iter().flatten().collect::<(Vec>, Vec() - }) + let res = if v_len == 1 && i_len == 1 { + //one to one + self.single_val_single_index::>( + byte_array.clone(), + vals[0].first(), + indices[0].first(), + op, + BatchReturnType::Vals, + ) + } else if v_len > 1 && i_len == 1 { + //many vals one index + self.multi_val_one_index::>( + byte_array.clone(), + vals, + indices[0].first(), + op, + BatchReturnType::Vals, + index_size, + ) + } else if v_len == 1 && i_len > 1 { + //one val many indices + self.one_val_multi_indices::>( + byte_array.clone(), + vals[0].first(), + indices, + op, + BatchReturnType::Vals, + index_size, + ) + } else if v_len > 1 && i_len > 1 { + //many vals many indices + self.multi_val_multi_index::>( + byte_array.clone(), + vals, + indices, + op, + BatchReturnType::Vals, + index_size, + ) + } else { + VecDeque::new() + }; + ArrayFetchBatchOpHandle::new(byte_array, res, std::cmp::max(i_len, v_len)) } //#[tracing::instrument(skip_all)] @@ -312,7 +419,7 @@ impl 
UnsafeArray { index: impl OpInput<'a, usize>, op: ArrayOpCmd, byte_array: LamellarByteArray, - ) -> Pin>> + Send>> { + ) -> ArrayResultBatchOpHandle { let (indices, i_len) = index.as_op_input(); let (vals, v_len) = val.as_op_input(); let max_local_size = (0..self.num_pes()) @@ -320,68 +427,50 @@ impl UnsafeArray { .max() .unwrap(); let index_size = IndexSize::from(max_local_size); - let data_copied = Arc::new(AtomicBool::new(false)); - let res: Pin>, Vec)>> + Send>> = - if v_len == 1 && i_len == 1 { - //one to one - self.single_val_single_index::>>( - byte_array, - vals[0].first(), - indices[0].first(), - op, - BatchReturnType::Result, - data_copied.clone(), - ) - } else if v_len > 1 && i_len == 1 { - //many vals one index - self.multi_val_one_index::>>( - byte_array, - vals, - indices[0].first(), - op, - BatchReturnType::Result, - index_size, - data_copied.clone(), - ) - } else if v_len == 1 && i_len > 1 { - //one val many indices - self.one_val_multi_indices::>>( - byte_array, - vals[0].first(), - indices, - op, - BatchReturnType::Result, - index_size, - data_copied.clone(), - ) - } else if v_len > 1 && i_len > 1 { - //many vals many indices - self.multi_val_multi_index::>>( - byte_array, - vals, - indices, - op, - BatchReturnType::Result, - index_size, - data_copied.clone(), - ) - } else { - //no vals no indices - Box::pin(async { Vec::new() }) - }; - Box::pin(async move { - let mut results = Vec::with_capacity(std::cmp::max(i_len, v_len)); - unsafe { - results.set_len(std::cmp::max(i_len, v_len)); - } - for (mut vals, mut idxs) in res.await.into_iter() { - for (v, i) in vals.drain(..).zip(idxs.drain(..)) { - results[i] = v; - } - } - results - // res.await.into_iter().flatten().collect::<(Vec>, Vec() - }) + let res = if v_len == 1 && i_len == 1 { + //one to one + self.single_val_single_index::>>( + byte_array.clone(), + vals[0].first(), + indices[0].first(), + op, + BatchReturnType::Result, + ) + } else if v_len > 1 && i_len == 1 { + //many vals one index + self.multi_val_one_index::>>( + byte_array.clone(), + vals, + indices[0].first(), + op, + BatchReturnType::Result, + index_size, + ) + } else if v_len == 1 && i_len > 1 { + //one val many indices + self.one_val_multi_indices::>>( + byte_array.clone(), + vals[0].first(), + indices, + op, + BatchReturnType::Result, + index_size, + ) + } else if v_len > 1 && i_len > 1 { + //many vals many indices + self.multi_val_multi_index::>>( + byte_array.clone(), + vals, + indices, + op, + BatchReturnType::Result, + index_size, + ) + } else { + //no vals no indices + VecDeque::new() + }; + ArrayResultBatchOpHandle::new(byte_array, res, std::cmp::max(i_len, v_len)) } fn one_val_multi_indices( @@ -392,16 +481,14 @@ impl UnsafeArray { op: ArrayOpCmd, ret: BatchReturnType, index_size: IndexSize, - data_copied: Arc, - ) -> Pin)>> + Send>> { - let num_per_batch = config().batch_op_size; - // let num_per_batch = match std::env::var("LAMELLAR_OP_BATCH") { - // Ok(n) => n.parse::().unwrap(), - // Err(_) => 10000, - // }; + ) -> VecDeque<(AmHandle, Vec)> { + let num_per_batch = + (config().batch_am_size as f32 / index_size.len() as f32).ceil() as usize; + let num_pes = self.inner.data.team.num_pes(); + // let my_pe = self.inner.data.team.my_pe(); let cnt = Arc::new(AtomicUsize::new(0)); - let futures = Arc::new(Mutex::new(Vec::new())); + let futures = Arc::new(Mutex::new(VecDeque::new())); let num_reqs = indices.len(); let mut start_i = 0; @@ -415,19 +502,15 @@ impl UnsafeArray { self.inner.data.array_counters.add_send_req(1); 
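
Note that num_per_batch above is now derived from a byte budget rather than a fixed element count, where config().batch_am_size is a byte threshold (as the env_var.rs comments later in this series describe). A small sketch of that calculation:

// ceil(batch_bytes / bytes_per_elem), matching the f32 ceil used above.
fn elems_per_batch(batch_bytes: usize, bytes_per_elem: usize) -> usize {
    (batch_bytes as f32 / bytes_per_elem as f32).ceil() as usize
}

fn main() {
    // for example, a 10_000 byte budget with 4 byte indices allows 2500 indices per message
    assert_eq!(elems_per_batch(10_000, 4), 2500);
    assert_eq!(elems_per_batch(10_000, 3), 3334);
}
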
self.inner.data.team.inc_counters(1); let index_vec = index.to_vec(); - // println!("index vec: {:?}", index_vec); let the_array: UnsafeArray = self.clone(); - // println!("num_reqs {:?}", num_reqs); self.inner .data .team .scheduler .submit_immediate_task(async move { - let mut buffs = - vec![Vec::with_capacity(num_per_batch * index_size.len()); num_pes]; + let mut buffs = vec![index_size.create_buf(num_per_batch); num_pes]; let mut res_buffs = vec![Vec::with_capacity(num_per_batch); num_pes]; - let mut reqs: Vec)> + Send>>> = - Vec::new(); + let mut reqs: Vec<(AmHandle, Vec)> = Vec::new(); for (ii, idx) in index_vec.iter().enumerate() { let j = ii + start_i; @@ -439,15 +522,10 @@ impl UnsafeArray { the_array.inner.size ), }; - // println!( - // "pe: {:?} index: {:?} local_index: {:?}", - // pe, *idx, local_index - // ); - buffs[pe].extend_from_slice(index_size.as_bytes(&local_index)); + buffs[pe].push(local_index); res_buffs[pe].push(j); if buffs[pe].len() >= num_per_batch { - let mut new_buffer = - Vec::with_capacity(num_per_batch * index_size.len()); + let mut new_buffer = index_size.create_buf(num_per_batch); std::mem::swap(&mut buffs[pe], &mut new_buffer); let mut new_res_buffer = Vec::with_capacity(num_per_batch); std::mem::swap(&mut res_buffs[pe], &mut new_res_buffer); @@ -455,7 +533,7 @@ impl UnsafeArray { let am = SingleValMultiIndex::new_with_vec( byte_array2.clone(), op, - new_buffer, + new_buffer.to_vec(), val, index_size, ) @@ -465,7 +543,8 @@ impl UnsafeArray { am, Some(the_array.inner.data.array_counters.clone()), ); - reqs.push(Box::pin(async move { (req.await, new_res_buffer) })); + + reqs.push((req, new_res_buffer)); } } for (pe, (buff, res_buff)) in @@ -475,7 +554,7 @@ impl UnsafeArray { let am = SingleValMultiIndex::new_with_vec( byte_array2.clone(), op, - buff, + buff.to_vec(), val, index_size, ) @@ -485,10 +564,10 @@ impl UnsafeArray { am, Some(the_array.inner.data.array_counters.clone()), ); - reqs.push(Box::pin(async move { (req.await, res_buff) })); + + reqs.push((req, res_buff)); } } - // println!("reqs len {:?}", reqs.len()); futures2.lock().extend(reqs); cnt2.fetch_add(1, Ordering::SeqCst); the_array @@ -501,16 +580,12 @@ impl UnsafeArray { }); start_i += len; } - // We need this loop so that we ensure all the internal AMs have launched so calls like wait_all work properly while cnt.load(Ordering::SeqCst) < num_reqs { self.inner.data.team.scheduler.exec_task(); - } - // println!("futures len {:?}", futures.lock().len()); - Box::pin(async move { - // println!("futures len {:?}", futures.lock().len()); - futures_util::future::join_all(futures.lock().drain(..)).await - }) + }; + let res = std::mem::take(&mut *futures.lock()); + res } // in general this type of operation will likely incur terrible cache performance, the obvious optimization is to apply the updates locally then send it over, @@ -523,17 +598,13 @@ impl UnsafeArray { op: ArrayOpCmd, ret: BatchReturnType, _index_size: IndexSize, - data_copied: Arc, - ) -> Pin)>> + Send>> { - let num_per_batch = config().batch_op_size; - // let num_per_batch = match std::env::var("LAMELLAR_OP_BATCH") { - // Ok(n) => n.parse::().unwrap(), //+ 1 to account for main thread - // Err(_) => 10000, //+ 1 to account for main thread - // }; + ) -> VecDeque<(AmHandle, Vec)> { + let num_per_batch = + (config().batch_am_size as f32 / std::mem::size_of::() as f32).ceil() as usize; + // println!("multi_val_one_index"); - // let num_pes = self.inner.data.team.num_pes(); let cnt = Arc::new(AtomicUsize::new(0)); - let futures = 
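
Each batched variant here uses the same flush-on-full pattern: indices bound for the same PE accumulate in a per-PE buffer that is shipped as one message once it reaches the batch size, with any remainder flushed at the end. A simplified standalone model of that pattern (the send closure stands in for launching an active message):

fn buffer_and_flush(
    dests: &[usize],
    num_pes: usize,
    batch: usize,
    mut send: impl FnMut(usize, Vec<usize>),
) {
    let mut buffs: Vec<Vec<usize>> = vec![Vec::new(); num_pes];
    for (i, &pe) in dests.iter().enumerate() {
        buffs[pe].push(i);
        if buffs[pe].len() >= batch {
            // ship the full batch and start a fresh buffer for this PE
            send(pe, std::mem::take(&mut buffs[pe]));
        }
    }
    for (pe, buf) in buffs.into_iter().enumerate() {
        if !buf.is_empty() {
            send(pe, buf); // flush remainders
        }
    }
}

fn main() {
    buffer_and_flush(&[0, 1, 0, 1, 0], 2, 2, |pe, idxs| {
        println!("send {idxs:?} to PE {pe}");
    });
}
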
Arc::new(Mutex::new(Vec::new())); + let futures = Arc::new(Mutex::new(VecDeque::new())); let (pe, local_index) = match self.pe_and_offset_for_global_index(index) { Some((pe, local_index)) => (pe, local_index), None => panic!( @@ -542,7 +613,6 @@ impl UnsafeArray { ), }; let num_reqs = vals.len(); - // println!("num_reqs {:?}",num_reqs); let mut start_i = 0; let scheduler = self.inner.data.team.scheduler.clone(); for val in vals.drain(..) { @@ -556,8 +626,7 @@ impl UnsafeArray { let val_chunks = val.into_vec_chunks(num_per_batch); scheduler.submit_immediate_task(async move { let mut inner_start_i = start_i; - let mut reqs: Vec)> + Send>>> = - Vec::new(); + let mut reqs: Vec<(AmHandle, Vec)> = Vec::new(); val_chunks.into_iter().for_each(|val| { let val_len = val.len(); let am = MultiValSingleIndex::new_with_vec( @@ -572,13 +641,11 @@ impl UnsafeArray { am, Some(the_array.inner.data.array_counters.clone()), ); - // println!("start_i: {:?} inner_start_i {:?} val_len: {:?}",start_i,inner_start_i,val_len); let res_buffer = (inner_start_i..inner_start_i + val_len).collect::>(); - reqs.push(Box::pin(async move { (req.await, res_buffer) })); + reqs.push((req, res_buffer)); inner_start_i += val_len; }); - // println!("reqs len {:?}",reqs.len()); futures2.lock().extend(reqs); cnt2.fetch_add(1, Ordering::SeqCst); the_array @@ -596,11 +663,8 @@ impl UnsafeArray { while cnt.load(Ordering::SeqCst) < num_reqs { self.inner.data.team.scheduler.exec_task(); } - // println!("futures len {:?}",futures.lock().len()); - Box::pin(async move { - // println!("futures len {:?}",futures.lock().len()); - futures_util::future::join_all(futures.lock().drain(..)).await - }) + let res = std::mem::take(&mut *futures.lock()); + res } fn multi_val_multi_index( @@ -611,24 +675,20 @@ impl UnsafeArray { op: ArrayOpCmd, ret: BatchReturnType, index_size: IndexSize, - data_copied: Arc, - ) -> Pin)>> + Send>> { - let num_per_batch = config().batch_op_size; - // let num_per_batch = match std::env::var("LAMELLAR_OP_BATCH") { - // Ok(n) => n.parse::().unwrap(), //+ 1 to account for main thread - // Err(_) => 10000, //+ 1 to account for main thread - // }; - let bytes_per_batch = match index_size { - IndexSize::U8 => num_per_batch * std::mem::size_of::>(), - IndexSize::U16 => num_per_batch * std::mem::size_of::>(), - IndexSize::U32 => num_per_batch * std::mem::size_of::>(), - IndexSize::U64 => num_per_batch * std::mem::size_of::>(), - IndexSize::Usize => num_per_batch * std::mem::size_of::>(), + ) -> VecDeque<(AmHandle, Vec)> { + let idx_val_bytes = match index_size { + IndexSize::U8 => std::mem::size_of::>(), + IndexSize::U16 => std::mem::size_of::>(), + IndexSize::U32 => std::mem::size_of::>(), + IndexSize::U64 => std::mem::size_of::>(), + IndexSize::Usize => std::mem::size_of::>(), }; + let num_per_batch = (config().batch_am_size as f32 / idx_val_bytes as f32).ceil() as usize; + let bytes_per_batch = num_per_batch * idx_val_bytes; let num_pes = self.inner.data.team.num_pes(); let cnt = Arc::new(AtomicUsize::new(0)); - let futures = Arc::new(Mutex::new(Vec::new())); + let futures = Arc::new(Mutex::new(VecDeque::new())); let num_reqs = vals.len(); // println!("num_reqs {:?}", num_reqs); @@ -644,18 +704,14 @@ impl UnsafeArray { let index_vec = index.to_vec(); let vals_vec = val.to_vec(); let the_array: UnsafeArray = self.clone(); - // println!("trying to submit immediate task"); self.inner .data .team .scheduler .submit_immediate_task(async move { - // println!("in immediate task"); let mut buffs = 
vec![Vec::with_capacity(bytes_per_batch); num_pes]; let mut res_buffs = vec![Vec::with_capacity(num_per_batch); num_pes]; - let mut reqs: Vec)> + Send>>> = - Vec::new(); - // let mut res_index = 0; + let mut reqs: Vec<(AmHandle, Vec)> = Vec::new(); for (ii, (idx, val)) in index_vec.into_iter().zip(vals_vec.into_iter()).enumerate() { @@ -711,8 +767,6 @@ impl UnsafeArray { std::mem::swap(&mut buffs[pe], &mut new_buffer); let mut new_res_buffer = Vec::with_capacity(num_per_batch); std::mem::swap(&mut res_buffs[pe], &mut new_res_buffer); - - // println!("buff len {}",new_buffer.len()); let am = MultiValMultiIndex::new_with_vec( byte_array2.clone(), op, @@ -725,14 +779,13 @@ impl UnsafeArray { am, Some(the_array.inner.data.array_counters.clone()), ); - reqs.push(Box::pin(async move { (req.await, new_res_buffer) })); + reqs.push((req, new_res_buffer)); } } for (pe, (buff, res_buff)) in buffs.into_iter().zip(res_buffs.into_iter()).enumerate() { if buff.len() > 0 { - // println!("buff len {}",buff.len()); let am = MultiValMultiIndex::new_with_vec( byte_array2.clone(), op, @@ -745,7 +798,7 @@ impl UnsafeArray { am, Some(the_array.inner.data.array_counters.clone()), ); - reqs.push(Box::pin(async move { (req.await, res_buff) })); + reqs.push((req, res_buff)); } } futures2.lock().extend(reqs); @@ -764,11 +817,8 @@ impl UnsafeArray { while cnt.load(Ordering::SeqCst) < num_reqs { self.inner.data.team.scheduler.exec_task(); } - // println!("futures len {:?}", futures.lock().len()); - Box::pin(async move { - // println!("futures len: {:?}", futures.lock().len()); - futures_util::future::join_all(futures.lock().drain(..)).await - }) + let res = std::mem::take(&mut *futures.lock()); + res } fn single_val_single_index( @@ -778,8 +828,7 @@ impl UnsafeArray { index: usize, op: ArrayOpCmd, ret: BatchReturnType, - data_copied: Arc, - ) -> Pin)>> + Send>> { + ) -> VecDeque<(AmHandle, Vec)> { let (pe, local_index) = match self.pe_and_offset_for_global_index(index) { Some((pe, local_index)) => (pe, local_index), None => panic!( @@ -803,9 +852,7 @@ impl UnsafeArray { am, Some(self.inner.data.array_counters.clone()), ); - let mut reqs = vec![Box::pin(async move { (req.await, res_buff) })]; - - Box::pin(async move { futures_util::future::join_all(reqs.drain(..)).await }) + VecDeque::from(vec![(req, res_buff)]) } } @@ -840,7 +887,7 @@ impl SingleValMultiIndex { val: unsafe { std::slice::from_raw_parts(val_u8, std::mem::size_of::()) }.to_vec(), op: op.into(), index_size: index_size, - } //, type_id: TypeId::of::() } + } } fn into_am(self, ret: BatchReturnType) -> LamellarArcAm { @@ -881,7 +928,7 @@ impl MultiValSingleIndex { } .to_vec(), op: op.into(), - } //, type_id: TypeId::of::() } + } } fn into_am(self, ret: BatchReturnType) -> LamellarArcAm { diff --git a/src/array/unsafe/rdma.rs b/src/array/unsafe/rdma.rs index a3fa7677..2d531925 100644 --- a/src/array/unsafe/rdma.rs +++ b/src/array/unsafe/rdma.rs @@ -33,19 +33,32 @@ impl UnsafeArray { ), }; // .expect("index out of bounds"); //(((index + buf.len()) as f64) / self.elem_per_pe).round() as usize; - // println!("block_op {:?} {:?}",start_pe,end_pe); + // println!( + // "block_op index: {:?} global_index: {:?} spe: {:?} epe: {:?}", + // index, global_index, start_pe, end_pe + // ); let mut dist_index = global_index; // let mut subarray_index = index; let mut buf_index = 0; let mut reqs = VecDeque::new(); for pe in start_pe..=end_pe { - let num_elems_on_pe = (self.inner.orig_elem_per_pe * (pe + 1) as f64).round() as usize - - (self.inner.orig_elem_per_pe * pe as 
f64).round() as usize; - let pe_start_index = (self.inner.orig_elem_per_pe * pe as f64).round() as usize; - let offset = dist_index - pe_start_index; - let len = std::cmp::min(num_elems_on_pe - offset, buf.len() - buf_index); + let mut full_num_elems_on_pe = self.inner.orig_elem_per_pe; + if pe < self.inner.orig_remaining_elems { + full_num_elems_on_pe += 1; + } + let pe_full_start_index = self.inner.global_start_index_for_pe(pe); + // let mut pe_full_start_index = self.inner.orig_elem_per_pe * pe; + // if pe < self.inner.orig_remaining_elems { + // pe_full_start_index += pe; + // } + + // let full_num_elems_on_pe = (self.inner.orig_elem_per_pe * (pe + 1) as f64).round() as usize + // - (self.inner.orig_elem_per_pe * pe as f64).round() as usize; + // let pe_start_index = (self.inner.orig_elem_per_pe * pe as f64).round() as usize; + let offset = dist_index - pe_full_start_index; + let len = std::cmp::min(full_num_elems_on_pe - offset, buf.len() - buf_index); if len > 0 { - // println!("pe {:?} offset {:?} range: {:?}-{:?} dist_index {:?} pe_start_index {:?} num_elems {:?} len {:?}", pe, offset, buf_index, buf_index+len, dist_index, pe_start_index, num_elems_on_pe, len); + // println!("pe {:?} offset {:?} range: {:?}-{:?} dist_index {:?} pe_full_start_index {:?} num_elems {:?} len {:?}", pe, offset, buf_index, buf_index+len, dist_index, pe_full_start_index, full_num_elems_on_pe, len); match op { ArrayRdmaCmd::Put => unsafe { self.inner.data.mem_region.blocking_put( @@ -74,9 +87,7 @@ impl UnsafeArray { // unsafe{ // println!("{:?} {:?},",buf.clone().to_base::().as_slice(), buf.sub_region(buf_index..(buf_index + len)).to_base::().as_slice()); // } - if buf.len() * std::mem::size_of::() - > crate::active_messaging::BATCH_AM_SIZE - { + if buf.len() * std::mem::size_of::() > config().batch_am_size { let am = UnsafePutAm { array: self.clone().into(), start_index: index, @@ -106,7 +117,7 @@ impl UnsafeArray { } } ArrayRdmaCmd::GetAm => { - // if buf.len()*std::mem::size_of::() > crate::active_messaging::BATCH_AM_SIZE{ + // if buf.len()*std::mem::size_of::() > config().batch_am_size{ let am = UnsafeBlockGetAm { array: self.clone().into(), offset: offset, @@ -200,8 +211,7 @@ impl UnsafeArray { // println!("{:?}",temp_memreg.clone().to_base::().as_slice()); // println!("si: {:?} ei {:?}",offset,offset+k); - if buf.len() * std::mem::size_of::() > crate::active_messaging::BATCH_AM_SIZE - { + if buf.len() * std::mem::size_of::() > config().batch_am_size { let am = UnsafePutAm { array: self.clone().into(), start_index: index, @@ -273,7 +283,7 @@ impl UnsafeArray { } } ArrayRdmaCmd::GetAm => { - // if buf.len()*std::mem::size_of::() > crate::active_messaging::BATCH_AM_SIZE{ + // if buf.len()*std::mem::size_of::() > config().batch_am_size{ let rem = buf.len() % num_pes; for i in 0..std::cmp::min(buf.len(), num_pes) { let temp_memreg = self @@ -706,8 +716,7 @@ impl LamellarArrayInternalGet for UnsafeArray { buf: U, ) -> ArrayRdmaHandle { let buf = buf.into(); - let reqs = if buf.len() * std::mem::size_of::() > crate::active_messaging::BATCH_AM_SIZE - { + let reqs = if buf.len() * std::mem::size_of::() > config().batch_am_size { match self.inner.distribution { Distribution::Block => self.block_op(ArrayRdmaCmd::GetAm, index, buf), Distribution::Cyclic => self.cyclic_op(ArrayRdmaCmd::GetAm, index, buf), @@ -825,7 +834,10 @@ impl UnsafeArrayInner { ), }; - // println!("i {:?} len {:?} spe {:?} epe {:?} ",index,len,start_pe,end_pe); + // println!( + // "i {:?} len {:?} spe {:?} epe {:?} ", + // index, len, 
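
The block_op changes above compute, for each PE touched by a transfer, a starting offset into that PE's slab and a chunk length using the same integer block-with-remainder layout. A standalone sketch of that carving (illustrative only; the real block_op also handles sub-array offsets and the different RDMA commands):

// Split the contiguous range [index, index + len) over n elements and p PEs
// into (pe, offset_within_pe, chunk_len) pieces.
fn split_range(index: usize, len: usize, n: usize, p: usize) -> Vec<(usize, usize, usize)> {
    let elem_per_pe = n / p;
    let rem = n % p;
    let start_of = |pe: usize| elem_per_pe * pe + pe.min(rem);
    let size_of = |pe: usize| elem_per_pe + if pe < rem { 1 } else { 0 };
    let mut chunks = Vec::new();
    let (mut cur, end) = (index, index + len);
    for pe in 0..p {
        if cur >= end {
            break;
        }
        let pe_start = start_of(pe);
        let pe_end = pe_start + size_of(pe);
        if cur < pe_end && pe_start < end {
            let (lo, hi) = (cur.max(pe_start), end.min(pe_end));
            chunks.push((pe, lo - pe_start, hi - lo));
            cur = hi;
        }
    }
    chunks
}

fn main() {
    // 10 elements over 4 PEs (sizes [3, 3, 2, 2]); the range [2, 8) touches PEs 0, 1 and 2
    assert_eq!(split_range(2, 6, 10, 4), vec![(0, 2, 1), (1, 0, 3), (2, 0, 2)]);
}
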
start_pe, end_pe + // ); match self.distribution { Distribution::Block => { let num_elems_local = self.num_elems_local(); @@ -846,7 +858,10 @@ impl UnsafeArrayInner { } else { num_elems_local }; - // println!("ssi {:?} si {:?} ei {:?} nel {:?} es {:?}",subarray_start_index,start_index,end_index,num_elems_local,self.elem_size); + // println!( + // "ssi {:?} si {:?} ei {:?} nel {:?} es {:?}", + // subarray_start_index, start_index, end_index, num_elems_local, self.elem_size + // ); Some(( &mut self.local_as_mut_slice() [start_index * self.elem_size..end_index * self.elem_size], From 6e1baa1abf8162ba72e32d0cd8487f84c12fe8e2 Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Thu, 16 May 2024 12:01:34 -0700 Subject: [PATCH 032/116] set result vector capacity for batched array ops --- impl/src/array_ops.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/impl/src/array_ops.rs b/impl/src/array_ops.rs index bb5b6fe9..f39452ce 100644 --- a/impl/src/array_ops.rs +++ b/impl/src/array_ops.rs @@ -1290,10 +1290,11 @@ fn create_buf_ops( // println!("in single val multi idx exec"); #slice let val = self.val; - let mut res = Vec::new(); + let mut res; match self.index_size{ 1 => { let indices = unsafe {std::slice::from_raw_parts(self.indices.as_ptr() as *const u8, self.indices.len()/std::mem::size_of::())}; + res = Vec::with_capacity(self.indices.len()/std::mem::size_of::()); // println!("indices: {:?}", indices); match self.op { #single_val_multi_idx_fetch_match_stmts @@ -1301,24 +1302,28 @@ fn create_buf_ops( } 2 => { let indices = unsafe {std::slice::from_raw_parts(self.indices.as_ptr() as *const u16, self.indices.len()/std::mem::size_of::())}; + res = Vec::with_capacity(self.indices.len()/std::mem::size_of::()); match self.op { #single_val_multi_idx_fetch_match_stmts } } 4 => { let indices = unsafe {std::slice::from_raw_parts(self.indices.as_ptr() as *const u32, self.indices.len()/std::mem::size_of::())}; + res = Vec::with_capacity(self.indices.len()/std::mem::size_of::()); match self.op { #single_val_multi_idx_fetch_match_stmts } } 8 => { let indices = unsafe {std::slice::from_raw_parts(self.indices.as_ptr() as *const u64, self.indices.len()/std::mem::size_of::())}; + res = Vec::with_capacity(self.indices.len()/std::mem::size_of::()); match self.op { #single_val_multi_idx_fetch_match_stmts } } _ => { let indices = unsafe {std::slice::from_raw_parts(self.indices.as_ptr() as *const usize, self.indices.len()/std::mem::size_of::())}; + res = Vec::with_capacity(self.indices.len()/std::mem::size_of::()); match self.op { #single_val_multi_idx_fetch_match_stmts } From 51a97062b969f7bf46b4a068e91b47476587ab22 Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Thu, 16 May 2024 12:03:15 -0700 Subject: [PATCH 033/116] Some clean up and comments --- src/array.rs | 2 ++ src/barrier.rs | 6 ++++-- src/env_var.rs | 8 ++++---- src/scheduler.rs | 23 ----------------------- 4 files changed, 10 insertions(+), 29 deletions(-) diff --git a/src/array.rs b/src/array.rs index 55a454b0..35276813 100644 --- a/src/array.rs +++ b/src/array.rs @@ -106,6 +106,8 @@ use std::sync::Arc; /// Alternatively, if you plan to derive all the above traits you can simply supply `All` as the single argument to [ArrayOps] pub use lamellar_impl::ArrayOps; +use crate::memregion::RemoteMemoryRegion; + #[doc(hidden)] pub mod prelude; diff --git a/src/barrier.rs b/src/barrier.rs index f637bd56..14e10713 100644 --- a/src/barrier.rs +++ b/src/barrier.rs @@ -31,7 +31,7 @@ impl Barrier { scheduler: Arc, panic: Arc, ) -> Barrier { - let num_pes = arch.num_pes; + let num_pes = arch.num_pes(); // let mut n = std::env::var("LAMELLAR_BARRIER_DISSEMNATION_FACTOR") let mut n = config().barrier_dissemination_factor; let num_rounds = if n > 1 && num_pes > 2 { @@ -42,7 +42,7 @@ impl Barrier { }; let (buffs, send_buf) = if let Ok(_my_index) = arch.team_pe(my_pe) { if num_pes > 1 { - let alloc = if global_pes == arch.num_pes { + let alloc = if global_pes == arch.num_pes() { AllocationType::Global } else { let mut pes = arch.team_iter().collect::>(); @@ -263,6 +263,8 @@ impl Barrier { } } } + // println!("leaving barrier"); + // self.print_bar(); // self.lamellae.flush(); } diff --git a/src/env_var.rs b/src/env_var.rs index a67828e7..640e80ab 100644 --- a/src/env_var.rs +++ b/src/env_var.rs @@ -88,7 +88,7 @@ pub struct Config { #[serde(default = "default_deadlock_timeout")] pub deadlock_timeout: f64, #[serde(default = "default_op_batch")] - pub batch_op_size: usize, + pub batch_op_size: usize, // am group batch size #[serde(default = "default_dissemination_factor")] pub barrier_dissemination_factor: usize, // #[serde(default=true)] @@ -101,7 +101,7 @@ pub struct Config { pub batcher: String, #[serde(default = "default_threads")] pub threads: usize, - pub batch_op_threads: Option, + pub batch_op_threads: Option,//number of threads used to process array batch ops sending pub heap_size: Option, #[serde(default = "default_heap_mode")] pub heap_mode: HeapMode, @@ -114,14 +114,14 @@ pub struct Config { #[serde(default = "default_cmd_buf_cnt")] pub cmd_buf_cnt: usize, #[serde(default = "default_batch_am_size")] - pub batch_am_size: usize, + pub batch_am_size: usize, //the threshold for an activemessage (in bytes) on whether it will be sent directly or aggregated } pub fn config() -> &'static Config { static CONFIG: OnceLock = OnceLock::new(); CONFIG.get_or_init(|| match envy::prefixed("LAMELLAR_").from_env::() { Ok(config) => { - println!("[LAMELLAR CONFIG]{config:?}"); + // println!("[LAMELLAR CONFIG]{config:?}"); config } Err(error) => panic!("{}", error), diff --git a/src/scheduler.rs b/src/scheduler.rs index d485d156..82d95f24 100755 --- a/src/scheduler.rs +++ b/src/scheduler.rs @@ -511,29 +511,6 @@ pub(crate) fn create_scheduler( _ => panic!("[LAMELLAR ERROR] unexpected batcher type please set LAMELLAR_BATCHER to one of 'simple' or 'team_am'") }; - // let batcher = match std::env::var("LAMELLAR_BATCHER") { - // Ok(n) => { - // let n = n.parse::().unwrap(); - // if n == 1 { - // BatcherType::Simple(SimpleBatcher::new( - // num_pes, - // am_stall_mark.clone(), - // executor.clone(), - // )) - // } else { - // BatcherType::TeamAm(TeamAmBatcher::new( - // num_pes, - // am_stall_mark.clone(), - 
// executor.clone(), - // )) - // } - // } - // Err(_) => BatcherType::TeamAm(TeamAmBatcher::new( - // num_pes, - // am_stall_mark.clone(), - // executor.clone(), - // )), - // }; Scheduler::new( executor.clone(), RegisteredActiveMessages::new(batcher, executor), From f541bf956d2095173a54d3cdc1f494631de70f73 Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Thu, 16 May 2024 12:03:57 -0700 Subject: [PATCH 034/116] yet more changes to darc await on outstanding algorithm --- src/darc.rs | 212 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 156 insertions(+), 56 deletions(-) diff --git a/src/darc.rs b/src/darc.rs index bedd32be..7c5c0183 100644 --- a/src/darc.rs +++ b/src/darc.rs @@ -82,7 +82,6 @@ static DARC_ID: AtomicUsize = AtomicUsize::new(0); #[repr(u8)] #[derive(PartialEq, Debug, Copy, Clone)] pub(crate) enum DarcMode { - Dropped, Darc, LocalRw, GlobalRw, @@ -94,6 +93,9 @@ pub(crate) enum DarcMode { NativeAtomicArray, LocalLockArray, GlobalLockArray, + Dropping, + Dropped, + RestartDrop, } #[lamellar_impl::AmDataRT(Debug)] @@ -300,7 +302,7 @@ impl crate::active_messaging::DarcSerde for Darc { } } -impl DarcInner { +impl DarcInner { fn team(&self) -> Pin> { unsafe { Arc::increment_strong_count(self.team); @@ -413,10 +415,82 @@ impl DarcInner { // ); // } + async fn wait_on_state( + inner: WrappedInner, + mode_refs: &[u8], + state: u8, + extra_cnt: usize, + reset: bool, + ) -> bool { + for pe in mode_refs.iter() { + let timer = std::time::Instant::now(); + while *pe != state as u8 { + if inner.local_cnt.load(Ordering::SeqCst) == 1 + extra_cnt { + join_all(inner.send_finished()).await; + } + if !reset && timer.elapsed().as_secs_f64() > config().deadlock_timeout { + let ref_cnts_slice = unsafe { + std::slice::from_raw_parts_mut( + inner.ref_cnt_addr as *mut usize, + inner.num_pes, + ) + }; + println!("[{:?}][{:?}][WARNING] -- Potential deadlock detected.\n\ + The runtime is currently waiting for all remaining references to this distributed object to be dropped.\n\ + The object is likely a {:?} with {:?} remaining local references and {:?} remaining remote references, ref cnts by pe {ref_cnts_slice:?}\n\ + An example where this can occur can be found at https://docs.rs/lamellar/latest/lamellar/array/struct.ReadOnlyArray.html#method.into_local_lock\n\ + The deadlock timeout can be set via the LAMELLAR_DEADLOCK_TIMEOUT environment variable, the current timeout is {} seconds\n\ + To view backtrace set RUST_LIB_BACKTRACE=1\n\ + {}", + inner.my_pe, + std::thread::current().id(), + unsafe { + &std::slice::from_raw_parts_mut(inner.mode_addr as *mut DarcMode, inner.num_pes) + }, + inner.local_cnt.load(Ordering::SeqCst), + inner.dist_cnt.load(Ordering::SeqCst), + config().deadlock_timeout, + std::backtrace::Backtrace::capture() + ); + } + if reset && timer.elapsed().as_secs_f64() > config().deadlock_timeout / 2.0 { + return false; + } + if reset && mode_refs.iter().any(|x| *x == DarcMode::RestartDrop as u8) { + return false; + } + async_std::task::yield_now().await; + } + } + true + } + + fn broadcast_state( + inner: WrappedInner, + team: Pin>, + mode_refs: &mut [u8], + state: u8, + ) { + unsafe { + (*(((&mut mode_refs[inner.my_pe]) as *mut u8) as *mut AtomicU8)) //this should be fine given that DarcMode uses Repr(u8) + .store(state as u8, Ordering::SeqCst) + }; + let rdma = &team.lamellae; + for pe in team.arch.team_iter() { + // println!("darc block_on_outstanding put 3"); + rdma.put( + pe, + &mode_refs[inner.my_pe..=inner.my_pe], + inner.mode_addr + inner.my_pe * 
std::mem::size_of::(), + ); + } + } + async fn block_on_outstanding(inner: WrappedInner, state: DarcMode, extra_cnt: usize) { let team = inner.team(); let mode_refs = unsafe { std::slice::from_raw_parts_mut(inner.mode_addr as *mut u8, inner.num_pes) }; + let orig_state = mode_refs[inner.my_pe]; if team.num_pes() == 1 { while inner.local_cnt.load(Ordering::SeqCst) > 1 + extra_cnt { async_std::task::yield_now().await; @@ -450,11 +524,15 @@ impl DarcInner { while inner.local_cnt.load(Ordering::SeqCst) > 1 + extra_cnt { async_std::task::yield_now().await; } + join_all(inner.send_finished()).await; // println!( // "[{:?}] entering initial block_on barrier()", // std::thread::current().id() // ); + if !Self::wait_on_state(inner.clone(), mode_refs, orig_state, extra_cnt, false).await { + panic!("deadlock waiting for original state"); + } let barrier_fut = unsafe { inner.barrier.as_ref().unwrap().async_barrier() }; barrier_fut.await; // println!( @@ -463,6 +541,35 @@ impl DarcInner { // ); while outstanding_refs { + if mode_refs.iter().any(|x| *x == DarcMode::RestartDrop as u8) { + Self::broadcast_state( + inner.clone(), + team.clone(), + mode_refs, + DarcMode::RestartDrop as u8, + ); + if !(Self::wait_on_state( + inner.clone(), + mode_refs, + DarcMode::RestartDrop as u8, + extra_cnt, + false, + ) + .await) + { + panic!("deadlock"); + } + Self::broadcast_state(inner.clone(), team.clone(), mode_refs, orig_state); + // team.scheduler.submit_task(async move { + Box::pin(DarcInner::block_on_outstanding( + inner.clone(), + state, + extra_cnt, + )) + .await; + // }); + return; + } outstanding_refs = false; // these hopefully all get set to non zero later otherwise we still need to wait for id in &mut *barrier_slice { @@ -501,7 +608,7 @@ impl DarcInner { // inner.mode_ref_cnt_addr + inner.my_pe * std::mem::size_of::() // ); // println!("darc block_on_outstanding put 1"); - rdma.put( + rdma.iput( send_pe, ref_cnt_u8, inner.mode_ref_cnt_addr + inner.my_pe * std::mem::size_of::(), //this is barrier_ref_cnt_slice @@ -571,7 +678,7 @@ impl DarcInner { ) }; // println!("darc block_on_outstanding put 2"); - rdma.put( + rdma.iput( send_pe, barrier_id_slice, inner.mode_barrier_addr + inner.my_pe * std::mem::size_of::(), @@ -595,6 +702,7 @@ impl DarcInner { // dist_cnts_changed: {dist_cnts_changed:?} barrier_sum: {barrier_sum:?} old_barrier_id: {old_barrier_id:?} ", std::thread::current().id(), inner.total_local_cnt.load(Ordering::SeqCst), inner.total_dist_cnt.load(Ordering::SeqCst)); // } barrier_id = old_barrier_id + 1; + // if outstanding_refs { // // println!( // // "[{:?}] still outstanding, exec a task!", @@ -613,54 +721,35 @@ impl DarcInner { // ); // inner.debug_print(); // println!("[{:?}] {:?}", std::thread::current().id(), inner); - - unsafe { - (*(((&mut mode_refs[inner.my_pe]) as *mut u8) as *mut AtomicU8)) //this should be fine given that DarcMode uses Repr(u8) - .store(state as u8, Ordering::SeqCst) - }; - let rdma = &team.lamellae; - for pe in team.arch.team_iter() { - // println!("darc block_on_outstanding put 3"); - rdma.put( - pe, - &mode_refs[inner.my_pe..=inner.my_pe], - inner.mode_addr + inner.my_pe * std::mem::size_of::(), + Self::broadcast_state(inner.clone(), team.clone(), mode_refs, state as u8); + if !Self::wait_on_state(inner.clone(), mode_refs, state as u8, extra_cnt, true).await { + Self::broadcast_state( + inner.clone(), + team.clone(), + mode_refs, + DarcMode::RestartDrop as u8, ); - } - for pe in mode_refs.iter() { - let mut timer = std::time::Instant::now(); - while *pe != state as 
u8 { - if inner.local_cnt.load(Ordering::SeqCst) == 1 + extra_cnt { - join_all(inner.send_finished()).await; - } - if timer.elapsed().as_secs_f64() > config().deadlock_timeout { - let ref_cnts_slice = unsafe { - std::slice::from_raw_parts_mut( - inner.ref_cnt_addr as *mut usize, - inner.num_pes, - ) - }; - println!("[{:?}][{:?}][WARNING] -- Potential deadlock detected.\n\ - The runtime is currently waiting for all remaining references to this distributed object to be dropped.\n\ - The object is likely a {:?} with {:?} remaining local references and {:?} remaining remote references, ref cnts by pe {ref_cnts_slice:?}\n\ - An example where this can occur can be found at https://docs.rs/lamellar/latest/lamellar/array/struct.ReadOnlyArray.html#method.into_local_lock\n\ - The deadlock timeout can be set via the LAMELLAR_DEADLOCK_TIMEOUT environment variable, the current timeout is {} seconds\n\ - To view backtrace set RUST_LIB_BACKTRACE=1\n\ - {}", - inner.my_pe, - std::thread::current().id(), - unsafe { - &std::slice::from_raw_parts_mut(inner.mode_addr as *mut DarcMode, inner.num_pes) - }, - inner.local_cnt.load(Ordering::SeqCst), - inner.dist_cnt.load(Ordering::SeqCst), - config().deadlock_timeout, - std::backtrace::Backtrace::capture() - ); - timer = std::time::Instant::now(); - } - async_std::task::yield_now().await; + if !(Self::wait_on_state( + inner.clone(), + mode_refs, + DarcMode::RestartDrop as u8, + extra_cnt, + false, + ) + .await) + { + panic!("deadlock"); } + Self::broadcast_state(inner.clone(), team.clone(), mode_refs, orig_state); + // team.scheduler.submit_task(async move { + Box::pin(DarcInner::block_on_outstanding( + inner.clone(), + state, + extra_cnt, + )) + .await; + // }); + return; } // self.debug_print(); @@ -698,7 +787,7 @@ impl DarcInner { // } } -impl fmt::Debug for DarcInner { +impl fmt::Debug for DarcInner { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( f, @@ -1249,7 +1338,7 @@ macro_rules! local_mode { (*(((&mut $mode_refs[$inner.my_pe]) as *mut DarcMode) as *mut AtomicU8)) .compare_exchange( $mode as u8, - DarcMode::Dropped as u8, + DarcMode::Dropping as u8, Ordering::SeqCst, Ordering::SeqCst, ) @@ -1262,6 +1351,17 @@ macro_rules! 
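
At a high level, the darc.rs rework above makes the drop handshake restartable: every PE must observe every other PE in the target mode, and any PE can raise RestartDrop to roll the whole group back to its original mode and try again later. A toy model of that control flow (purely illustrative; it elides the barriers, reference counting, and RDMA mode broadcasts that do the real work):

#[derive(Clone, Copy, PartialEq, Debug)]
enum Mode {
    Darc,
    Dropping,
    Dropped,
    RestartDrop,
}

// One attempt at finishing a drop across all PEs' observed modes.
fn try_drop(modes: &mut [Mode]) -> bool {
    if modes.iter().any(|m| *m == Mode::RestartDrop) {
        // someone needs the object again; roll everyone back and retry later
        for m in modes.iter_mut() {
            *m = Mode::Darc;
        }
        return false;
    }
    for m in modes.iter_mut() {
        *m = Mode::Dropped;
    }
    true
}

fn main() {
    let mut modes = vec![Mode::Dropping, Mode::RestartDrop, Mode::Dropping];
    assert!(!try_drop(&mut modes)); // first attempt aborted
    assert!(try_drop(&mut modes)); // second attempt completes
    println!("final modes: {modes:?}");
}
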
launch_drop { ($mode:ty, $inner:ident, $inner_addr:expr) => { // println!("launching drop task as {}", stringify!($mode)); let team = $inner.team(); + let mode_refs = + unsafe { std::slice::from_raw_parts_mut($inner.mode_addr as *mut u8, $inner.num_pes) }; + let rdma = &team.lamellae; + for pe in team.arch.team_iter() { + // println!("darc block_on_outstanding put 3"); + rdma.put( + pe, + &mode_refs[$inner.my_pe..=$inner.my_pe], + $inner.mode_addr + $inner.my_pe * std::mem::size_of::(), + ); + } // team.print_cnt(); team.exec_am_local(DroppedWaitAM { inner_addr: $inner_addr as *const u8 as usize, @@ -1355,15 +1455,15 @@ unsafe impl Sync for DroppedWaitAM {} pub(crate) struct WrappedInner { inner: NonNull>, } -unsafe impl Send for WrappedInner {} +unsafe impl Send for WrappedInner {} -impl Clone for WrappedInner { +impl Clone for WrappedInner { fn clone(&self) -> Self { WrappedInner { inner: self.inner } } } -impl std::fmt::Debug for WrappedInner { +impl std::fmt::Debug for WrappedInner { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "WrappedInner {{ inner: {:?} }}", unsafe { self.inner.as_ref() @@ -1371,7 +1471,7 @@ impl std::fmt::Debug for WrappedInner { } } -impl std::ops::Deref for WrappedInner { +impl std::ops::Deref for WrappedInner { type Target = DarcInner; fn deref(&self) -> &Self::Target { unsafe { self.inner.as_ref() } From 706dc6fdc0e06bcea13a127ab9debb5add45b3a2 Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Thu, 16 May 2024 12:04:24 -0700 Subject: [PATCH 035/116] temporarily expose async barrier and flush operations for testing --- src/lamellar_task_group.rs | 5 ++++- src/lamellar_world.rs | 6 +++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/lamellar_task_group.rs b/src/lamellar_task_group.rs index 18bec1d8..d2783f5f 100644 --- a/src/lamellar_task_group.rs +++ b/src/lamellar_task_group.rs @@ -667,7 +667,7 @@ impl LamellarTaskGroup { } } - pub(crate) async fn await_all(&self) { + pub async fn await_all(&self) { let mut temp_now = Instant::now(); while self.counters.outstanding_reqs.load(Ordering::SeqCst) > 0 { // self.team.flush(); @@ -1525,6 +1525,9 @@ pub struct TypedAmGroupResultIter<'a, T> { impl<'a, T> Iterator for TypedAmGroupResultIter<'a, T> { type Item = AmGroupResult<'a, T>; fn next(&mut self) -> Option { + if self.index % 10000 == 0 { + println!("TypedAmGroupResultIter index: {}", self.index); + } if self.index < self.results.len() { let index = self.index; self.index += 1; diff --git a/src/lamellar_world.rs b/src/lamellar_world.rs index 6a1e9f0c..987c674e 100644 --- a/src/lamellar_world.rs +++ b/src/lamellar_world.rs @@ -209,6 +209,10 @@ impl LamellarWorld { pub fn num_threads_per_pe(&self) -> usize { self.team.num_threads_per_pe() } + + pub fn flush(&self) { + self.team_rt.flush(); + } } impl LamellarEnv for LamellarWorld { @@ -433,7 +437,7 @@ impl LamellarWorldBuilder { // } // } // }; - println!("executor: {:?}", executor); + // println!("executor: {:?}", executor); let num_threads = config().threads; // let num_threads = match std::env::var("LAMELLAR_THREADS") { From e6ec87bb17f5eb8df2a994e64e092061a3c60668 Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Thu, 16 May 2024 12:05:12 -0700 Subject: [PATCH 036/116] expose to_base for onesided memregions (internally) --- src/memregion/one_sided.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/memregion/one_sided.rs b/src/memregion/one_sided.rs index 1e268a91..e248e83d 100644 --- a/src/memregion/one_sided.rs +++ b/src/memregion/one_sided.rs @@ -691,6 +691,18 @@ impl OneSidedMemoryRegion { false } } + + pub(crate) unsafe fn to_base(self) -> OneSidedMemoryRegion { + let u8_offset = self.sub_region_offset * std::mem::size_of::(); + let u8_size = self.sub_region_size * std::mem::size_of::(); + OneSidedMemoryRegion { + mr: self.mr.clone(), + pe: self.pe, + sub_region_offset: u8_offset / std::mem::size_of::(), + sub_region_size: u8_size / std::mem::size_of::(), + phantom: PhantomData, + } + } } // This could be useful for if we want to transfer the actual data instead of the pointer From cb36c5379e4afd677919dad40ed8cdbd7d6a64e7 Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Thu, 16 May 2024 12:05:44 -0700 Subject: [PATCH 037/116] add ping_pong example --- Cargo.toml | 4 + examples/misc/ping_pong.rs | 429 +++++++++++++++++++++++++++++++++++++ 2 files changed, 433 insertions(+) create mode 100644 examples/misc/ping_pong.rs diff --git a/Cargo.toml b/Cargo.toml index fa297f34..678c5091 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -436,6 +436,10 @@ path="examples/misc/simple_ptp.rs" name="lamellar_env" path="examples/misc/lamellar_env.rs" +[[example]] +name="ping_pong" +path="examples/misc/ping_pong.rs" + ##------------ Darc examples ------------------## [[example]] diff --git a/examples/misc/ping_pong.rs b/examples/misc/ping_pong.rs new file mode 100644 index 00000000..6f072490 --- /dev/null +++ b/examples/misc/ping_pong.rs @@ -0,0 +1,429 @@ +use std::sync::Arc; + +use lamellar::active_messaging::prelude::*; +use lamellar::memregion::prelude::*; +// use parking_lot::Mutex; +use async_lock::{Mutex, Semaphore}; +use rand::prelude::*; +use std::collections::VecDeque; +use std::sync::atomic::AtomicU8; +use std::sync::atomic::Ordering; + +const UPDATES_PER_CORE: usize = 10_000_000; + +#[derive(Clone)] +struct IdxAmBuffer { + idx_send_buffer: SharedMemoryRegion, + idx_recv_buffer: SharedMemoryRegion, + res_recv_buffer: SharedMemoryRegion, +} + +#[derive(Clone)] +struct ResAmBuffer { + idx_recv_buffer: SharedMemoryRegion, + res_send_buffer: SharedMemoryRegion, + res_recv_buffer: SharedMemoryRegion, +} + +#[AmLocalData] +struct RecvAm { + buffer: ResAmBuffer, + remote_pe: usize, + buffer_size: usize, + finished: Arc, +} + +#[local_am] +impl LamellarAm for RecvAm { + async fn exec(self) { + unsafe { + let mut cnt = 0; + + let start = self.remote_pe * self.buffer_size; + let end = start + self.buffer_size; + let my_start = lamellar::current_pe * self.buffer_size; + let res_send_buf = self.buffer.res_send_buffer.sub_region(start..end); + // let res_send_slice = res_send_buf.as_mut_slice().unwrap(); + let idx_recv_buf = self.buffer.idx_recv_buffer.sub_region(start..end); + let idx_recv_slice = idx_recv_buf.as_mut_slice().unwrap(); + + while self.finished.load(Ordering::SeqCst) == 0 { + // let mut first = true; + + while idx_recv_slice[self.buffer_size - 1] == usize::MAX { + if self.finished.load(Ordering::SeqCst) != 0 { + break; + } + async_std::task::yield_now().await; + } + idx_recv_slice[self.buffer_size - 1] = usize::MAX; + // for (i, (r, s)) in idx_recv_slice + // .iter_mut() + // .zip(res_send_slice.iter_mut()) + // .enumerate() + // { + // // let mut timer = 
std::time::Instant::now(); + // while *r == usize::MAX { + // if self.finished.load(Ordering::SeqCst) != 0 { + // break; + // } + // async_std::task::yield_now().await; + // // if timer.elapsed().as_secs_f64() > 1.0 { + // // if i != 0 { + // // let s = std::cmp::max(i as isize - 5 as isize, 0isize) as usize; + // // let e = std::cmp::min(i + 5, end - start); + // // println!( + // // "waiting for idx data from: {} at elem {} {} {s}-{e} {:?}", + // // self.remote_pe, + // // i, + // // *r == usize::MAX, + // // &self.buffer.idx_recv_buffer.as_mut_slice().unwrap()[s..e] + // // ); + // // } + // // timer = std::time::Instant::now(); + // // } + // } + // // if first { + // // first = false; + // // println!( + // // "recived something from: {} {} {}", + // // self.remote_pe, + // // *r, + // // start / self.buffer_size + // // ); + // // } + // *s = lamellar::current_pe; // data[*r]; + // *r = usize::MAX; + // cnt += 1; + // } + // println!( + // "[pe:{:?}] sending back to: {} at {} {:?} {:?}", + // lamellar::current_pe, + // self.remote_pe, + // my_start / self.buffer_size, + // &res_send_buf.as_mut_slice().unwrap()[0..5], + // &res_send_slice[0..5] + // ); + self.buffer + .res_recv_buffer + .put(self.remote_pe, my_start, res_send_buf.clone()); + } + println!("{} recv_cnt: {}", self.remote_pe, cnt); + } + } +} + +#[AmLocalData] +struct SendAm { + indices: Vec, + buffers: Arc>>>, + remote_pe: usize, + buffer_size: usize, + comm_lock: Arc, +} + +#[local_am] +impl LamellarAm for SendAm { + async fn exec(self) { + let mut buffers: Option = None; + + { + while buffers.is_none() { + buffers = self.buffers[self.remote_pe].lock().await.pop_front(); + async_std::task::yield_now().await; + + // match &mut lock { + // None => { + // async_std::task::yield_now().await; + // lock = self.buffers[self.remote_pe].try_lock(); + // } + // Some(lock) => { + // buffers = lock.pop_front(); + // } + // } + } + } + let buffer = buffers.unwrap(); + let start = self.remote_pe * self.buffer_size; + let end = start + self.buffer_size; + let my_start = lamellar::current_pe * self.buffer_size; + + unsafe { + std::ptr::copy_nonoverlapping( + self.indices.as_ptr(), + buffer + .idx_send_buffer + .sub_region(start..end) + .as_mut_ptr() + .unwrap(), + self.indices.len(), + ); + // println!( + // "sending to {} at {}", + // self.remote_pe, + // my_start / self.buffer_size + // ); + let _comm = self.comm_lock.acquire().await; + buffer.idx_recv_buffer.put( + self.remote_pe, + my_start, + buffer.idx_send_buffer.sub_region(start..end), + ); + + while buffer + .res_recv_buffer + .sub_region(start..end) + .as_mut_slice() + .unwrap()[self.buffer_size - 1] + == usize::MAX + { + async_std::task::yield_now().await; + } + buffer + .res_recv_buffer + .sub_region(start..end) + .as_mut_slice() + .unwrap()[self.buffer_size - 1] = usize::MAX; + // for _i in 0..self.indices.len() { + // let mut first = true; + // for (i, elem) in buffer + // .res_recv_buffer + // .sub_region(start..end) + // .as_mut_slice() + // .unwrap() + // .iter_mut() + // .enumerate() + // { + // // let mut timer = std::time::Instant::now(); + // while *elem == usize::MAX { + // async_std::task::yield_now().await; + // // if timer.elapsed().as_secs_f64() > 1.0 { + // // let s = std::cmp::max(i as isize - 5 as isize, 0isize) as usize; + // // let e = std::cmp::min(i + 5, end - start); + // // for pe in 0..lamellar::num_pes { + // // let pe_start = pe * self.buffer_size; + // // let pe_end = pe_start + self.buffer_size; + // // println!( + // // "waiting for response 
data from: {} at elem {} {s}-{e} [pe:{pe}] {:?}", + // // self.remote_pe, + // // i, + // // &buffer + // // .res_recv_buffer + // // .sub_region(pe_start..pe_end) + // // .as_mut_slice() + // // .unwrap()[s..e] + // // ); + // // } + // // timer = std::time::Instant::now(); + // // } + // } + // // if first { + // // first = false; + // // println!( + // // "recived response from: {} {} at {}", + // // self.remote_pe, + // // *elem, + // // start / self.buffer_size + // // ); + // // } + // *elem = usize::MAX; + // } + // // for elem in buffer + // // .res_recv_buffer + // // .sub_region(start..end) + // // .as_mut_slice() + // // .unwrap() + // // .iter_mut() + // // { + // // *elem = usize::MAX; + // // } + // println!("response back from {}", self.remote_pe); + // // } + // // let mut lock = self.buffers[self.remote_pe].try_lock(); + self.buffers[self.remote_pe].lock().await.push_back(buffer); + // println!("Done with send {}", self.remote_pe); + } + } +} + +#[AmLocalData] +struct MyAm { + indices: SharedMemoryRegion, + buffers: Arc>>>, + buffer_size: usize, + table_size_per_pe: usize, + comm_lock: Arc, +} + +#[local_am] +impl LamellarAm for MyAm { + async fn exec(self) { + // let timer = std::time::Instant::now(); + let indices_slice = unsafe { self.indices.as_mut_slice().unwrap() }; + + println!("my_am: {:?} {:?}", indices_slice.len(), self.buffer_size); + + // let my_pe = lamellar::current_pe; + let num_pes = lamellar::num_pes; + let mut pe_bufs = vec![vec![]; num_pes]; + let timer = std::time::Instant::now(); + let mut cnt = 0; + let task_group = LamellarTaskGroup::new(lamellar::team.clone()); + // let mut reqs = Vec::new(); + // for _i in 0..num_pes { + // reqs.push(VecDeque::new()); + // } + for i in indices_slice.iter() { + let pe = i / self.table_size_per_pe; + let offset = lamellar::current_pe; //i % self.table_size_per_pe; + pe_bufs[pe].push(offset); + if pe_bufs[pe].len() > self.buffer_size { + let mut indices = vec![]; + std::mem::swap(&mut indices, &mut pe_bufs[pe]); + // if reqs[pe].len() > 10 { + // if let Some(req) = reqs[pe].pop_front() { + // // println!("need to wait for pe: {}", pe); + // req.await; + // } + // } + // if pe != lamellar::current_pe { + // reqs[pe].push_back(lamellar::world.exec_am_local(SendAm { + // indices, + // buffers: self.buffers.clone(), + // remote_pe: pe, + // buffer_size: self.buffer_size, + // comm_lock: self.comm_lock.clone(), + // })); + task_group.exec_am_local(SendAm { + indices, + buffers: self.buffers.clone(), + remote_pe: pe, + buffer_size: self.buffer_size, + comm_lock: self.comm_lock.clone(), + }); + cnt += 1; + // } + } + } + println!("launch time {:?} {:?}", cnt, timer.elapsed()); + // for (pe, bufs) in pe_bufs.iter().enumerate() { + // // println!("pe: {}, cnt: {}", pe, bufs.len()); + // while let Some(req) = reqs[pe].pop_front() { + // req.await; + // } + // } + task_group.await_all().await; + println!("cnt: {} {:?}", cnt, timer.elapsed()); + } +} + +fn main() { + let world = lamellar::LamellarWorldBuilder::new().build(); + let my_pe = world.my_pe(); + let num_pes = world.num_pes(); + + let buffer_size = 16384 * 2; + + let indices = + world.alloc_shared_mem_region::(UPDATES_PER_CORE * world.num_threads_per_pe()); + + let index_send_buffers = world.alloc_shared_mem_region::(buffer_size * num_pes); + world.barrier(); + let index_recv_buffers = world.alloc_shared_mem_region::(buffer_size * num_pes); + world.barrier(); + let result_send_buffers = world.alloc_shared_mem_region::(buffer_size * num_pes); + world.barrier(); + let 
result_recv_buffers = world.alloc_shared_mem_region::(buffer_size * num_pes); + world.barrier(); + let mut rng: StdRng = SeedableRng::seed_from_u64(my_pe as u64); + let table_size_per_pe = 100000 * world.num_threads_per_pe(); + let global_size = table_size_per_pe * num_pes; + + unsafe { + index_send_buffers + .as_mut_slice() + .unwrap() + .iter_mut() + .for_each(|x| *x = usize::MAX); + index_recv_buffers + .as_mut_slice() + .unwrap() + .iter_mut() + .for_each(|x| *x = usize::MAX); + result_send_buffers + .as_mut_slice() + .unwrap() + .iter_mut() + .for_each(|x| *x = my_pe); + result_recv_buffers + .as_mut_slice() + .unwrap() + .iter_mut() + .for_each(|x| *x = usize::MAX); + indices + .as_mut_slice() + .unwrap() + .iter_mut() + .for_each(|x| *x = rng.gen_range(0..global_size)); + } + world.barrier(); + + let mut res_am_buffers = Vec::new(); + let mut send_am_buffers = Vec::new(); + + for i in 0..num_pes { + let mut pe_buffer = VecDeque::new(); + let idx_buffers = IdxAmBuffer { + idx_send_buffer: index_send_buffers.clone(), + idx_recv_buffer: index_recv_buffers.clone(), + res_recv_buffer: result_recv_buffers.clone(), + }; + pe_buffer.push_back(idx_buffers); + send_am_buffers.push(Mutex::new(pe_buffer)); + + let res_buffers = ResAmBuffer { + idx_recv_buffer: index_recv_buffers.clone(), + res_send_buffer: result_send_buffers.clone(), + res_recv_buffer: result_recv_buffers.clone(), + }; + res_am_buffers.push(res_buffers.clone()); + } + let buffers = Arc::new(send_am_buffers); + let finished = Arc::new(AtomicU8::new(0)); + let comm_lock = Arc::new(Semaphore::new(1024)); + world.barrier(); + let timer = std::time::Instant::now(); + for (pe, buffer) in res_am_buffers.iter().enumerate() { + world.exec_am_local(RecvAm { + buffer: buffer.clone(), + remote_pe: pe, + finished: finished.clone(), + buffer_size, + }); + } + let mut reqs = vec![]; + // if my_pe == 0 { + for thread in 0..1 { + //world.num_threads_per_pe() { + reqs.push(world.exec_am_local(MyAm { + indices: indices.clone(), + buffers: buffers.clone(), + buffer_size, + table_size_per_pe: table_size_per_pe, + comm_lock: comm_lock.clone(), + })); + } + for req in reqs { + world.block_on(req); + } + // } + world.barrier(); + println!( + "time {:?} {:?} total updates MUPS: {:?}", + timer.elapsed(), + indices.len() * num_pes, + ((indices.len() * num_pes) as f64 / 1000000.0) / timer.elapsed().as_secs_f64() + ); + finished.store(1, Ordering::SeqCst); + // unsafe { println!("{:?}", recv_buffer.as_slice().unwrap()) }; +} From 5d04ef40025336e93fdf97ab6d482aa2e070ea9c Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Sat, 25 May 2024 15:59:03 -0700 Subject: [PATCH 038/116] use approriate APIs to handle debug version asserts for low level communication --- examples/hello_world/hello_world_am.rs | 3 + examples/team_examples/custom_team_arch.rs | 2 +- lamellar_run.sh | 2 +- run_examples.sh | 2 + src/array.rs | 4 +- src/array/unsafe.rs | 66 ++++++--- src/darc.rs | 162 ++++++++++++++------- src/lamellae.rs | 16 +- src/lamellae/command_queues.rs | 43 ++++-- src/lamellae/rofi/rofi_comm.rs | 79 +++++----- src/lamellar_alloc.rs | 13 +- src/memregion.rs | 29 ++-- src/memregion/one_sided.rs | 7 +- src/memregion/shared.rs | 14 +- src/scheduler.rs | 1 + 15 files changed, 276 insertions(+), 167 deletions(-) diff --git a/examples/hello_world/hello_world_am.rs b/examples/hello_world/hello_world_am.rs index 1e3ea685..301a5ef6 100644 --- a/examples/hello_world/hello_world_am.rs +++ b/examples/hello_world/hello_world_am.rs @@ -37,3 +37,6 @@ fn main() { //wait for the request to complete world.block_on(request); } //when world drops there is an implicit world.barrier() that occurs + + + \ No newline at end of file diff --git a/examples/team_examples/custom_team_arch.rs b/examples/team_examples/custom_team_arch.rs index 6879b99e..72d3c00f 100644 --- a/examples/team_examples/custom_team_arch.rs +++ b/examples/team_examples/custom_team_arch.rs @@ -79,7 +79,7 @@ impl LamellarArch for BlockStridedArch { let block = parent_pe / self.block_size; let start_block = self.start_pe / self.block_size; let remainder = parent_pe % self.block_size; - if (block - start_block) % self.stride == 0 + if block >= start_block && (block - start_block) % self.stride == 0 && self.start_pe <= *parent_pe && *parent_pe <= self.end_pe { diff --git a/lamellar_run.sh b/lamellar_run.sh index 1bb295f0..5f3af138 100755 --- a/lamellar_run.sh +++ b/lamellar_run.sh @@ -34,7 +34,7 @@ for pe in $(seq 0 $ENDPE); do echo "more threads ${E_CORE} than cores ${NPROC} " exit fi - LAMELLAE_BACKEND="shmem" LAMELLAR_MEM_SIZE=$((1*1024*1024*1024)) LAMELLAR_THREADS=$((THREADS)) LAMELLAR_NUM_PES=$NUMPES LAMELLAR_PE_ID=$pe LAMELLAR_JOB_ID=$JOBID $bin "${@:2}" &>> ${pe}_out & + LAMELLAR_BACKEND="shmem" LAMELLAR_MEM_SIZE=$((1*1024*1024*1024)) LAMELLAR_THREADS=$((THREADS)) LAMELLAR_NUM_PES=$NUMPES LAMELLAR_PE_ID=$pe LAMELLAR_JOB_ID=$JOBID $bin "${@:2}" &>> ${pe}_out.txt & S_CORE=$(($E_CORE )) E_CORE=$(($S_CORE + $THREADS)) done diff --git a/run_examples.sh b/run_examples.sh index 2147f82f..bfdeded9 100755 --- a/run_examples.sh +++ b/run_examples.sh @@ -66,6 +66,8 @@ root=$PWD # cd .. 
# done +cargo build --release --features enable-rofi --features tokio-executor --examples -j 20 + local_results_dir=async_backends results_dir=${output_dir}/rofiverbs_lamellae/${local_results_dir} ### test using rofi verbs lamellae diff --git a/src/array.rs b/src/array.rs index 35276813..6406d68a 100644 --- a/src/array.rs +++ b/src/array.rs @@ -198,11 +198,13 @@ pub struct ReduceKey { } crate::inventory::collect!(ReduceKey); +// impl Dist for bool {} // lamellar_impl::generate_reductions_for_type_rt!(true, u8, usize); // lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, usize); // lamellar_impl::generate_reductions_for_type_rt!(false, f32); // lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32); -// impl Dist for bool {} +// lamellar_impl::generate_reductions_for_type_rt!(false, u128); +// lamellar_impl::generate_ops_for_type_rt!(true, false, true, u128); lamellar_impl::generate_reductions_for_type_rt!(true, u8, u16, u32, u64, usize); lamellar_impl::generate_reductions_for_type_rt!(false, u128); diff --git a/src/array/unsafe.rs b/src/array/unsafe.rs index f9753806..bb182938 100644 --- a/src/array/unsafe.rs +++ b/src/array/unsafe.rs @@ -157,17 +157,33 @@ impl UnsafeArray { if remaining_elems > 0 { per_pe_size += 1 } - // println!("new unsafe array {:?} {:?} {:?}", elem_per_pe, num_elems_local, per_pe_size); - let rmr = MemoryRegion::new( - per_pe_size * std::mem::size_of::(), - team.lamellae.clone(), - AllocationType::Global, - ); + // println!("new unsafe array {:?} {:?}", elem_per_pe, per_pe_size); + let rmr_t: MemoryRegion = + MemoryRegion::new(per_pe_size, team.lamellae.clone(), AllocationType::Global); + // let rmr = MemoryRegion::new( + // per_pe_size * std::mem::size_of::(), + // team.lamellae.clone(), + // AllocationType::Global, + // ); + // println!("new array {:?}",rmr_t.as_ptr()); + unsafe { - for elem in rmr.as_mut_slice().expect("data should exist on pe") { - *elem = 0; - } + // for elem in rmr_t.as_mut_slice().expect("data should exist on pe") { + // *elem = std::mem::zeroed(); + // } + if std::mem::needs_drop::() { + // If `T` needs to be dropped then we have to do this one item at a time, in + // case one of the intermediate drops does a panic. + // slice.iter_mut().for_each(write_zeroes); + panic!("need drop not yet supported"); + } else { + // Otherwise we can be really fast and just fill everthing with zeros. 
+ let len = std::mem::size_of_val::<[T]>(rmr_t.as_mut_slice().expect("data should exist on pe")); + unsafe { std::ptr::write_bytes(rmr_t.as_mut_ptr().expect("data should exist on pe") as *mut u8, 0u8, len) } + } } + let rmr = unsafe { rmr_t.to_base::() }; + // println!("new array u8 {:?}",rmr.as_ptr()); let data = Darc::try_new_with_drop( team.clone(), @@ -235,16 +251,30 @@ impl UnsafeArray { if remaining_elems > 0 { per_pe_size += 1 } - let rmr = MemoryRegion::new( - per_pe_size * std::mem::size_of::(), - team.lamellae.clone(), - AllocationType::Global, - ); + let rmr_t: MemoryRegion = + MemoryRegion::new(per_pe_size, team.lamellae.clone(), AllocationType::Global); + // let rmr = MemoryRegion::new( + // per_pe_size * std::mem::size_of::(), + // team.lamellae.clone(), + // AllocationType::Global, + // ); + unsafe { - for elem in rmr.as_mut_slice().expect("data should exist on pe") { - *elem = 0; - } + // for elem in rmr_t.as_mut_slice().expect("data should exist on pe") { + // *elem = std::mem::zeroed(); + // } + if std::mem::needs_drop::() { + // If `T` needs to be dropped then we have to do this one item at a time, in + // case one of the intermediate drops does a panic. + // slice.iter_mut().for_each(write_zeroes); + panic!("need drop not yet supported"); + } else { + // Otherwise we can be really fast and just fill everthing with zeros. + let len = std::mem::size_of_val::<[T]>(rmr_t.as_mut_slice().expect("data should exist on pe")); + unsafe { std::ptr::write_bytes(rmr_t.as_mut_ptr().expect("data should exist on pe") as *mut u8, 0u8, len) } + } } + let rmr = unsafe { rmr_t.to_base::() }; let data = Darc::try_new_with_drop( team.clone(), @@ -427,6 +457,7 @@ impl UnsafeArray { pub(crate) fn local_as_mut_ptr(&self) -> *mut T { let u8_ptr = unsafe { self.inner.local_as_mut_ptr() }; // self.inner.data.mem_region.as_casted_mut_ptr::().unwrap(); + // println!("ptr: {:?} {:?}", u8_ptr, u8_ptr as *const T); u8_ptr as *mut T } @@ -1811,6 +1842,7 @@ impl UnsafeArrayInner { self.data.mem_region.as_casted_mut_ptr::().expect( "memory doesnt exist on this pe (this should not happen for arrays currently)", ); + // println!("u8 ptr: {:?}", ptr); // let len = self.size; let my_pe = self.data.my_pe; let num_pes = self.data.num_pes; diff --git a/src/darc.rs b/src/darc.rs index 7c5c0183..b2614497 100644 --- a/src/darc.rs +++ b/src/darc.rs @@ -912,6 +912,15 @@ impl Darc { } } +fn calc_padding(addr: usize, align: usize) -> usize { + let rem = addr % align; + if rem == 0 { + 0 + } else { + align - rem + } +} + impl Darc { #[doc(alias = "Collective")] /// Constructs a new `Darc` on the PEs specified by team. 
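(Illustrative aside, not part of the patch: a minimal, runnable sketch of how the calc_padding helper added above is used to lay out the per-PE arrays that trail the header struct inside a single allocation. Header, num_pes, and the element types below are stand-ins for DarcInner<T> and its counter/mode arrays.)

fn calc_padding(addr: usize, align: usize) -> usize {
    let rem = addr % align;
    if rem == 0 { 0 } else { align - rem }
}

struct Header { _a: u64, _b: u32 } // stand-in for DarcInner<T>

fn main() {
    let num_pes = 4;
    let mut size = std::mem::size_of::<Header>();

    // pad so the first element of each trailing array is properly aligned
    let pad = calc_padding(size, std::mem::align_of::<usize>());
    let ref_cnt_offset = size + pad;
    size = ref_cnt_offset + num_pes * std::mem::size_of::<usize>();

    let pad = calc_padding(size, std::mem::align_of::<u8>());
    let mode_offset = size + pad;
    size = mode_offset + num_pes * std::mem::size_of::<u8>();

    // the offsets are later added to the base address returned by the allocator
    println!("ref_cnt at +{ref_cnt_offset}, mode at +{mode_offset}, {size} bytes total");
}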
@@ -967,12 +976,34 @@ impl Darc { AllocationType::Sub(team_rt.get_pes()) }; - let size = std::mem::size_of::>() - + team_rt.num_pes * std::mem::size_of::() - + team_rt.num_pes * std::mem::size_of::() - + team_rt.num_pes * std::mem::size_of::() - + team_rt.num_pes * std::mem::size_of::() - + team_rt.num_pes * std::mem::size_of::(); + //The DarcInner data structure + let mut size = std::mem::size_of::>(); + + // Ref Cnt Array + let padding = calc_padding(size, std::mem::align_of::()); + let ref_cnt_offset = size + padding; + size += padding + team_rt.num_pes * std::mem::size_of::(); + + // total ref cnt array + let padding = calc_padding(size, std::mem::align_of::()); + let total_ref_cnt_offset = size + padding; + size += padding + team_rt.num_pes * std::mem::size_of::(); + + // mode array + let padding = calc_padding(size, std::mem::align_of::()); + let mode_offset = size + padding; + size += padding + team_rt.num_pes * std::mem::size_of::(); + + //mode ref cnt array + let padding = calc_padding(size, std::mem::align_of::()); + let mode_ref_cnt_offset = size + padding; + size += padding + team_rt.num_pes * std::mem::size_of::(); + + //mode_barrier array + let padding = calc_padding(size, std::mem::align_of::()); + let mode_barrier_offset = size + padding; + size += padding + team_rt.num_pes * std::mem::size_of::(); + // println!("creating new darc"); team_rt.async_barrier().await; @@ -1008,25 +1039,30 @@ impl Darc { weak_local_cnt: AtomicUsize::new(0), dist_cnt: AtomicUsize::new(0), total_dist_cnt: AtomicUsize::new(0), - ref_cnt_addr: addr + std::mem::size_of::>(), - total_ref_cnt_addr: addr - + std::mem::size_of::>() - + team_rt.num_pes * std::mem::size_of::(), - mode_addr: addr - + std::mem::size_of::>() - + team_rt.num_pes * std::mem::size_of::() - + team_rt.num_pes * std::mem::size_of::(), - mode_ref_cnt_addr: addr - + std::mem::size_of::>() - + team_rt.num_pes * std::mem::size_of::() - + team_rt.num_pes * std::mem::size_of::() - + team_rt.num_pes * std::mem::size_of::(), - mode_barrier_addr: addr - + std::mem::size_of::>() - + team_rt.num_pes * std::mem::size_of::() - + team_rt.num_pes * std::mem::size_of::() - + team_rt.num_pes * std::mem::size_of::() - + team_rt.num_pes * std::mem::size_of::(), + // ref_cnt_addr: addr + std::mem::size_of::>(), + // total_ref_cnt_addr: addr + // + std::mem::size_of::>() + // + team_rt.num_pes * std::mem::size_of::(), + // mode_addr: addr + // + std::mem::size_of::>() + // + team_rt.num_pes * std::mem::size_of::() + // + team_rt.num_pes * std::mem::size_of::(), + // mode_ref_cnt_addr: addr + // + std::mem::size_of::>() + // + team_rt.num_pes * std::mem::size_of::() + // + team_rt.num_pes * std::mem::size_of::() + // + team_rt.num_pes * std::mem::size_of::(), + // mode_barrier_addr: addr + // + std::mem::size_of::>() + // + team_rt.num_pes * std::mem::size_of::() + // + team_rt.num_pes * std::mem::size_of::() + // + team_rt.num_pes * std::mem::size_of::() + // + team_rt.num_pes * std::mem::size_of::(), + ref_cnt_addr: addr + ref_cnt_offset, + total_ref_cnt_addr: addr + total_ref_cnt_offset, + mode_addr: addr + mode_offset, + mode_ref_cnt_addr: addr + mode_ref_cnt_offset, + mode_barrier_addr: addr + mode_barrier_offset, barrier: barrier_ptr, // mode_barrier_rounds: num_rounds, am_counters: am_counters_ptr, @@ -1082,12 +1118,33 @@ impl Darc { AllocationType::Sub(team_rt.get_pes()) }; - let size = std::mem::size_of::>() - + team_rt.num_pes * std::mem::size_of::() - + team_rt.num_pes * std::mem::size_of::() - + team_rt.num_pes * std::mem::size_of::() - + 
team_rt.num_pes * std::mem::size_of::() - + team_rt.num_pes * std::mem::size_of::(); + //The DarcInner data structure + let mut size = std::mem::size_of::>(); + + // Ref Cnt Array + let padding = calc_padding(size, std::mem::align_of::()); + let ref_cnt_offset = size + padding; + size += padding + team_rt.num_pes * std::mem::size_of::(); + + // total ref cnt array + let padding = calc_padding(size, std::mem::align_of::()); + let total_ref_cnt_offset = size + padding; + size += padding + team_rt.num_pes * std::mem::size_of::(); + + // mode array + let padding = calc_padding(size, std::mem::align_of::()); + let mode_offset = size + padding; + size += padding + team_rt.num_pes * std::mem::size_of::(); + + //mode ref cnt array + let padding = calc_padding(size, std::mem::align_of::()); + let mode_ref_cnt_offset = size + padding; + size += padding + team_rt.num_pes * std::mem::size_of::(); + + //mode_barrier array + let padding = calc_padding(size, std::mem::align_of::()); + let mode_barrier_offset = size + padding; + size += padding + team_rt.num_pes * std::mem::size_of::(); // println!("creating new darc"); team_rt.tasking_barrier(); @@ -1123,25 +1180,30 @@ impl Darc { weak_local_cnt: AtomicUsize::new(0), dist_cnt: AtomicUsize::new(0), total_dist_cnt: AtomicUsize::new(0), - ref_cnt_addr: addr + std::mem::size_of::>(), - total_ref_cnt_addr: addr - + std::mem::size_of::>() - + team_rt.num_pes * std::mem::size_of::(), - mode_addr: addr - + std::mem::size_of::>() - + team_rt.num_pes * std::mem::size_of::() - + team_rt.num_pes * std::mem::size_of::(), - mode_ref_cnt_addr: addr - + std::mem::size_of::>() - + team_rt.num_pes * std::mem::size_of::() - + team_rt.num_pes * std::mem::size_of::() - + team_rt.num_pes * std::mem::size_of::(), - mode_barrier_addr: addr - + std::mem::size_of::>() - + team_rt.num_pes * std::mem::size_of::() - + team_rt.num_pes * std::mem::size_of::() - + team_rt.num_pes * std::mem::size_of::() - + team_rt.num_pes * std::mem::size_of::(), + // ref_cnt_addr: addr + std::mem::size_of::>(), + // total_ref_cnt_addr: addr + // + std::mem::size_of::>() + // + team_rt.num_pes * std::mem::size_of::(), + // mode_addr: addr + // + std::mem::size_of::>() + // + team_rt.num_pes * std::mem::size_of::() + // + team_rt.num_pes * std::mem::size_of::(), + // mode_ref_cnt_addr: addr + // + std::mem::size_of::>() + // + team_rt.num_pes * std::mem::size_of::() + // + team_rt.num_pes * std::mem::size_of::() + // + team_rt.num_pes * std::mem::size_of::(), + // mode_barrier_addr: addr + // + std::mem::size_of::>() + // + team_rt.num_pes * std::mem::size_of::() + // + team_rt.num_pes * std::mem::size_of::() + // + team_rt.num_pes * std::mem::size_of::() + // + team_rt.num_pes * std::mem::size_of::(), + ref_cnt_addr: addr + ref_cnt_offset, + total_ref_cnt_addr: addr + total_ref_cnt_offset, + mode_addr: addr + mode_offset, + mode_ref_cnt_addr: addr + mode_ref_cnt_offset, + mode_barrier_addr: addr + mode_barrier_offset, barrier: barrier_ptr, // mode_barrier_rounds: num_rounds, am_counters: am_counters_ptr, diff --git a/src/lamellae.rs b/src/lamellae.rs index 69ce6241..2c7e249c 100755 --- a/src/lamellae.rs +++ b/src/lamellae.rs @@ -228,14 +228,16 @@ pub(crate) fn create_lamellae(backend: Backend) -> LamellaeBuilder { #[cfg(feature = "enable-rofi")] Backend::Rofi => { let provider = match std::env::var("LAMELLAR_ROFI_PROVIDER") { - Ok(p) => match p.as_str() { - "verbs" => "verbs", - "shm" => "shm", - _ => "verbs", - }, - Err(_) => "verbs", + Ok(p) => p, + Err(_) => "verbs".to_owned(), + // Ok(p) => match 
p.as_str() { + // "verbs" => "verbs", + // "tcp" => "tcp", + // _ => "verbs", + // }, + // Err(_) => "verbs", }; - LamellaeBuilder::RofiBuilder(RofiBuilder::new(provider)) + LamellaeBuilder::RofiBuilder(RofiBuilder::new(&provider)) } Backend::Shmem => LamellaeBuilder::ShmemBuilder(ShmemBuilder::new()), Backend::Local => LamellaeBuilder::Local(Local::new()), diff --git a/src/lamellae/command_queues.rs b/src/lamellae/command_queues.rs index f86640d9..ac32e1f6 100644 --- a/src/lamellae/command_queues.rs +++ b/src/lamellae/command_queues.rs @@ -16,7 +16,7 @@ use std::sync::Arc; //use tracing::*; // const CMD_BUF_LEN: usize = 50000; // this is the number of slots for each PE - // const NUM_REQ_SLOTS: usize = CMD_Q_LEN; // max requests at any given time -- probably have this be a multiple of num PES +// const NUM_REQ_SLOTS: usize = CMD_Q_LEN; // max requests at any given time -- probably have this be a multiple of num PES const CMD_BUFS_PER_PE: usize = 2; // lazy_static! { @@ -62,24 +62,35 @@ enum Cmd { //#[tracing::instrument(skip_all)] fn calc_hash(addr: usize, len: usize) -> usize { //we split into a u64 slice and a u8 slice as u64 seems to compute faster. - let num_u64s = len / std::mem::size_of::(); - let u64_slice = unsafe { std::slice::from_raw_parts(addr as *const u64, num_u64s) }; - let num_u8s = len % std::mem::size_of::(); + let num_usizes = len / std::mem::size_of::(); + //let u64_slice = unsafe { std::slice::from_raw_parts(addr as *const u64, num_u64s) }; + let num_u8s = len % std::mem::size_of::(); let u8_slice = unsafe { std::slice::from_raw_parts( - (addr + num_u64s * std::mem::size_of::()) as *const u8, + (addr + num_usizes * std::mem::size_of::()) as *const u8, num_u8s, ) }; - (u64_slice - .iter() - .map(|x| Wrapping(*x as usize)) + ((0..num_usizes) + .map(|x| unsafe { Wrapping((addr as *const usize).offset(x as isize).read_unaligned()) }) .sum::>() + u8_slice .iter() .map(|x| Wrapping(*x as usize)) .sum::>()) .0 + + // let u8_slice = unsafe { + // std::slice::from_raw_parts( + // (addr) as *const u8, + // num_u8s, + // ) + // }; + // u8_slice + // .iter() + // .map(|x| Wrapping(*x as usize)) + // .sum::>() + // .0 } impl CmdMsg { @@ -1065,10 +1076,13 @@ impl InnerCQ { } let ser_data = ser_data.unwrap(); // if print{ - // ser_data.print(); + // ser_data.print(); // } self.get_serialized_data(src, cmd, &ser_data).await; - // println!("received data {:?}",ser_data.header_and_data_as_bytes()[0..10]); + // println!( + // "received data {:?}", + // &ser_data.header_and_data_as_bytes()[0..10] + // ); self.recv_cnt.fetch_add(1, Ordering::SeqCst); // println!("received: {:?} {:?} cmd: {:?} {:?}",cmd.dsize,ser_data.len(),cmd,&ser_data.header_and_data_as_bytes()[0..20]); // SerializedData::RofiData(ser_data) @@ -1079,7 +1093,7 @@ impl InnerCQ { async fn get_cmd_buf(&self, src: usize, cmd: CmdMsg) -> usize { let mut data = self .comm - .rt_alloc(cmd.dsize as usize, std::mem::align_of::()); + .rt_alloc(cmd.dsize as usize, std::mem::align_of::()); let mut timer = std::time::Instant::now(); while data.is_err() && self.active.load(Ordering::SeqCst) != CmdQStatus::Panic as u8 { async_std::task::yield_now().await; @@ -1087,7 +1101,7 @@ impl InnerCQ { self.send_alloc(cmd.dsize); data = self .comm - .rt_alloc(cmd.dsize as usize, std::mem::align_of::()); + .rt_alloc(cmd.dsize as usize, std::mem::align_of::()); // println!("cq 874 data {:?}",data.is_ok()); if timer.elapsed().as_secs_f64() > config().deadlock_timeout { println!("get cmd buf stuck waiting for alloc"); @@ -1328,7 +1342,8 @@ impl 
CommandQueue { // &data.header_and_data_as_bytes()[0..20] // ); data.increment_cnt(); //or we could implement something like an into_raw here... - // println!("sending data {:?}",data.header_and_data_as_bytes()); + // println!("sending data {:?}", data.header_and_data_as_bytes()); + self.cq.send(data.relative_addr, data.len, dst, hash).await; } SerializedData::ShmemData(ref data) => { @@ -1516,7 +1531,7 @@ impl CommandQueue { //#[tracing::instrument(skip_all)] pub fn mem_per_pe() -> usize { - (config().cmd_buf_len * config().cmd_buf_cnt + 4) * std::mem::size_of::() + (config().cmd_buf_len * config().cmd_buf_cnt + 4) * std::mem::size_of::() } } diff --git a/src/lamellae/rofi/rofi_comm.rs b/src/lamellae/rofi/rofi_comm.rs index ab2fc34c..ff847f6c 100644 --- a/src/lamellae/rofi/rofi_comm.rs +++ b/src/lamellae/rofi/rofi_comm.rs @@ -94,13 +94,10 @@ impl RofiComm { //#[tracing::instrument(skip_all)] unsafe fn fill_buffer(&self, dst_addr: &mut [T], val: R) { - // println!("{:?} {:?} {:?} {:?} {:?}",std::mem::size_of::(),std::mem::size_of::(),(dst_addr.len()*std::mem::size_of::()),(dst_addr.len()*std::mem::size_of::())/std::mem::size_of::(),(dst_addr.len()*std::mem::size_of::())%std::mem::size_of::()); - let bytes = std::slice::from_raw_parts_mut( - dst_addr.as_ptr() as *mut T as *mut R, - (dst_addr.len() * std::mem::size_of::()) / std::mem::size_of::(), - ); - for elem in bytes.iter_mut() { - *elem = val; + let num_r = (dst_addr.len() * std::mem::size_of::()) / std::mem::size_of::(); + let r_ptr = dst_addr.as_ptr() as *mut T as *mut R; + for i in 0..num_r { + r_ptr.offset(i as isize).write_unaligned(val); } } //#[tracing::instrument(skip_all)] @@ -120,24 +117,26 @@ impl RofiComm { } } //#[tracing::instrument(skip_all)] - unsafe fn check_buffer_elems( + unsafe fn check_buffer_elems( &self, dst_addr: &mut [T], val: R, ) -> TxResult<()> { - let bytes = std::slice::from_raw_parts_mut( - dst_addr.as_ptr() as *mut T as *mut R, - (dst_addr.len() * std::mem::size_of::()) / std::mem::size_of::(), - ); + let num_r = (dst_addr.len() * std::mem::size_of::()) / std::mem::size_of::(); + let r_ptr = dst_addr.as_ptr() as *mut T as *mut R; + let mut timer = std::time::Instant::now(); - for i in 0..(bytes.len() as isize - 2) { - while bytes[i as usize] == val && bytes[i as usize + 1] == val { + for i in 0..num_r - 2 { + while r_ptr.offset(i as isize).read_unaligned() == val + && r_ptr.offset(i as isize + 1).read_unaligned() == val + { if timer.elapsed().as_secs_f64() > 1.0 { // println!( - // "{:?}: {:?} {:?} {:?}", + // "{:?}/{:?}: {:?} {:?} {:?}", // i, - // bytes[i as usize], - // bytes[i as usize + 1], + // num_r, + // r_ptr.offset(i as isize).read_unaligned(), + // r_ptr.offset(i as isize + 1).read_unaligned(), // val // ); return Err(TxError::GetError); @@ -145,9 +144,10 @@ impl RofiComm { //hopefully magic number doesnt appear twice in a row std::thread::yield_now(); } + timer = std::time::Instant::now(); } timer = std::time::Instant::now(); - while bytes[bytes.len() - 1] == val { + while r_ptr.offset(num_r as isize - 1).read_unaligned() == val { if timer.elapsed().as_secs_f64() > 1.0 { // println!("{:?}", bytes[bytes.len() - 1]); return Err(TxError::GetError); @@ -263,6 +263,7 @@ impl CommOps for RofiComm { } //#[tracing::instrument(skip_all)] fn rt_alloc(&self, size: usize, align: usize) -> AllocResult { + // println!("rt_alloc size {size} align {align}"); // let size = size + size%8; let allocs = self.alloc.read(); for alloc in allocs.iter() { @@ -360,12 +361,16 @@ impl CommOps for RofiComm { // } } else 
{ unsafe { - // println!("[{:?}]-({:?}) memcopy {:?} into {:x}",pe,src_addr.as_ptr(),src_addr.len(),dst_addr); - std::ptr::copy_nonoverlapping( - src_addr.as_ptr(), - dst_addr as *mut T, - src_addr.len(), - ); + // println!( + // "[{:?}]-({:?}) memcopy {:?} into {:x} src align {:?} dst align {:?}", + // pe, + // src_addr.as_ptr(), + // src_addr.len(), + // dst_addr, + // src_addr.as_ptr().align_offset(std::mem::align_of::()), + // (dst_addr as *mut T).align_offset(std::mem::align_of::()), + // ); + std::ptr::copy(src_addr.as_ptr(), dst_addr as *mut T, src_addr.len()); } } // req @@ -394,11 +399,7 @@ impl CommOps for RofiComm { } else { unsafe { // println!("[{:?}]-({:?}) memcopy {:?}",pe,src_addr.as_ptr()); - std::ptr::copy_nonoverlapping( - src_addr.as_ptr(), - dst_addr as *mut T, - src_addr.len(), - ); + std::ptr::copy(src_addr.as_ptr(), dst_addr as *mut T, src_addr.len()); } } // req @@ -431,7 +432,7 @@ impl CommOps for RofiComm { } // drop(lock); unsafe { - std::ptr::copy_nonoverlapping(src_addr.as_ptr(), dst_addr as *mut T, src_addr.len()); + std::ptr::copy(src_addr.as_ptr(), dst_addr as *mut T, src_addr.len()); } self.put_amt.fetch_add( src_addr.len() * (self.num_pes - 1) * std::mem::size_of::(), @@ -487,11 +488,7 @@ impl CommOps for RofiComm { } else { // println!("[{:?}]-{:?} {:?} {:?}",self.my_pe,src_addr as *const T,dst_addr.as_mut_ptr(),dst_addr.len()); unsafe { - std::ptr::copy_nonoverlapping( - src_addr as *const T, - dst_addr.as_mut_ptr(), - dst_addr.len(), - ); + std::ptr::copy(src_addr as *const T, dst_addr.as_mut_ptr(), dst_addr.len()); } } // req @@ -567,11 +564,7 @@ impl CommOps for RofiComm { } } else { unsafe { - std::ptr::copy_nonoverlapping( - src_addr as *const T, - dst_addr.as_mut_ptr(), - dst_addr.len(), - ); + std::ptr::copy(src_addr as *const T, dst_addr.as_mut_ptr(), dst_addr.len()); } } } @@ -617,11 +610,7 @@ impl CommOps for RofiComm { // }; } else { unsafe { - std::ptr::copy_nonoverlapping( - src_addr as *const T, - dst_addr.as_mut_ptr(), - dst_addr.len(), - ); + std::ptr::copy(src_addr as *const T, dst_addr.as_mut_ptr(), dst_addr.len()); } } // req diff --git a/src/lamellar_alloc.rs b/src/lamellar_alloc.rs index 1a6471ab..8069382a 100644 --- a/src/lamellar_alloc.rs +++ b/src/lamellar_alloc.rs @@ -362,8 +362,15 @@ impl LamellarAlloc for BTreeAlloc { // a + padding, // self.free_space.load(Ordering::SeqCst) // ); - - Some(a + padding) + let new_addr = a + padding; + // let rem = new_addr % align; + // let rem_16 = new_addr % 16; + // println!( + // "alloc addr {:x?} {:x?} {new_addr} {a} {padding} {rem} {align} {rem_16}", + // a + padding, + // new_addr, + // ); + Some(new_addr) } else { None }; @@ -390,7 +397,7 @@ impl LamellarAlloc for BTreeAlloc { fn free(&self, addr: usize) -> Result<(), usize> { let &(ref lock, ref _cvar) = &*self.allocated_addrs; let mut allocated_addrs = lock.lock(); - + // println!("trying to free: {:x?} {:?}", addr, addr); if let Some((size, padding)) = allocated_addrs.remove(&addr) { // println!("allocated_addrs: {:?}", allocated_addrs); let full_size = size + padding; diff --git a/src/memregion.rs b/src/memregion.rs index 1d44a1ab..dfdbfa6e 100644 --- a/src/memregion.rs +++ b/src/memregion.rs @@ -730,8 +730,9 @@ impl MemoryRegion { alloc: AllocationType, ) -> Result, anyhow::Error> { // println!( - // "creating new lamellar memory region {:?}", - // size * std::mem::size_of::() + // "creating new lamellar memory region size: {:?} align: {:?}", + // size * std::mem::size_of::(), + // std::mem::align_of::() // ); let mut mode = 
Mode::Shared; let addr = if size > 0 { @@ -791,23 +792,25 @@ impl MemoryRegion { #[allow(dead_code)] //#[tracing::instrument(skip_all)] - pub(crate) unsafe fn to_base(self) -> MemoryRegion { + pub(crate) unsafe fn to_base(mut self) -> MemoryRegion { //this is allowed as we consume the old object.. assert_eq!( self.num_bytes % std::mem::size_of::(), 0, "Error converting memregion to new base, does not align" ); - MemoryRegion { - addr: self.addr, //TODO: out of memory... - pe: self.pe, - size: self.num_bytes / std::mem::size_of::(), - num_bytes: self.num_bytes, - backend: self.backend, - rdma: self.rdma.clone(), - mode: self.mode, - phantom: PhantomData, - } + // MemoryRegion { + // addr: self.addr, //TODO: out of memory... + // pe: self.pe, + // size: self.num_bytes / std::mem::size_of::(), + // num_bytes: self.num_bytes, + // backend: self.backend, + // rdma: self.rdma.clone(), + // mode: self.mode, + // phantom: PhantomData, + // } + self.size = self.num_bytes / std::mem::size_of::(); + std::mem::transmute(self) //we do this because other wise self gets dropped and frees the underlying data (we could also set addr to 0 in self) } // } diff --git a/src/memregion/one_sided.rs b/src/memregion/one_sided.rs index e248e83d..dcdedff2 100644 --- a/src/memregion/one_sided.rs +++ b/src/memregion/one_sided.rs @@ -355,11 +355,8 @@ impl OneSidedMemoryRegion { team: &std::pin::Pin>, lamellae: Arc, ) -> Result, anyhow::Error> { - let mr = MemoryRegion::try_new( - size * std::mem::size_of::(), - lamellae, - AllocationType::Local, - )?; + let mr_t: MemoryRegion = MemoryRegion::try_new(size, lamellae, AllocationType::Local)?; + let mr = unsafe { mr_t.to_base::() }; let pe = mr.pe; let id = ID_COUNTER.fetch_add(1, Ordering::Relaxed); diff --git a/src/memregion/shared.rs b/src/memregion/shared.rs index 3390588b..39d975a7 100644 --- a/src/memregion/shared.rs +++ b/src/memregion/shared.rs @@ -99,17 +99,11 @@ impl SharedMemoryRegion { alloc: AllocationType, ) -> Result, anyhow::Error> { // println!("creating new shared mem region {:?} {:?}",size,alloc); + let mr_t: MemoryRegion = MemoryRegion::try_new(size, team.lamellae.clone(), alloc)?; + let mr = unsafe { mr_t.to_base::() }; Ok(SharedMemoryRegion { - mr: Darc::try_new( - team.clone(), - MemoryRegion::try_new( - size * std::mem::size_of::(), - team.lamellae.clone(), - alloc, - )?, - crate::darc::DarcMode::Darc, - ) - .expect("memregions can only be created on a member of the team"), + mr: Darc::try_new(team.clone(), mr, crate::darc::DarcMode::Darc) + .expect("memregions can only be created on a member of the team"), sub_region_offset: 0, sub_region_size: size, phantom: PhantomData, diff --git a/src/scheduler.rs b/src/scheduler.rs index 82d95f24..7774654b 100755 --- a/src/scheduler.rs +++ b/src/scheduler.rs @@ -168,6 +168,7 @@ impl Scheduler { let max_ams = self.max_ams.clone(); let am_stall_mark = self.am_stall_mark.fetch_add(1, Ordering::Relaxed); let ame = self.active_message_engine.clone(); + // println!("am ptr {:p} ", &am); let am_future = async move { // let start_tid = thread::current().id(); num_ams.fetch_add(1, Ordering::Relaxed); From 3ba2e850a345327132240f1eb5b912745e915e90 Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Sat, 25 May 2024 18:26:00 -0700 Subject: [PATCH 039/116] premature commit and push Revert "use approriate APIs to handle debug version asserts for low level communication" This reverts commit 5d04ef40025336e93fdf97ab6d482aa2e070ea9c. 
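For context, a minimal sketch (illustrative names only, not code from either commit) of the unaligned-access pattern the reverted commit had introduced in fill_buffer/calc_hash: writing a wider value through a possibly misaligned pointer with write_unaligned instead of building a &mut [R] slice, which debug builds can flag as misaligned. The revert restores the slice-based form.

use std::mem::size_of;

unsafe fn fill_unaligned<R: Copy>(dst: *mut u8, len_bytes: usize, val: R) {
    let n = len_bytes / size_of::<R>();
    let p = dst as *mut R;
    for i in 0..n {
        // write_unaligned works even when `p` is not aligned for R
        p.add(i).write_unaligned(val);
    }
}

fn main() {
    let mut buf = [0u8; 17];
    // start at offset 1 so the destination is deliberately misaligned for u64
    unsafe { fill_unaligned::<u64>(buf.as_mut_ptr().add(1), 16, u64::MAX) };
    assert!(buf[1..].iter().all(|b| *b == 0xFF));
}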
--- examples/hello_world/hello_world_am.rs | 3 - examples/team_examples/custom_team_arch.rs | 2 +- lamellar_run.sh | 2 +- run_examples.sh | 2 - src/array.rs | 4 +- src/array/unsafe.rs | 66 +++------ src/darc.rs | 162 +++++++-------------- src/lamellae.rs | 16 +- src/lamellae/command_queues.rs | 43 ++---- src/lamellae/rofi/rofi_comm.rs | 79 +++++----- src/lamellar_alloc.rs | 13 +- src/memregion.rs | 29 ++-- src/memregion/one_sided.rs | 7 +- src/memregion/shared.rs | 14 +- src/scheduler.rs | 1 - 15 files changed, 167 insertions(+), 276 deletions(-) diff --git a/examples/hello_world/hello_world_am.rs b/examples/hello_world/hello_world_am.rs index 301a5ef6..1e3ea685 100644 --- a/examples/hello_world/hello_world_am.rs +++ b/examples/hello_world/hello_world_am.rs @@ -37,6 +37,3 @@ fn main() { //wait for the request to complete world.block_on(request); } //when world drops there is an implicit world.barrier() that occurs - - - \ No newline at end of file diff --git a/examples/team_examples/custom_team_arch.rs b/examples/team_examples/custom_team_arch.rs index 72d3c00f..6879b99e 100644 --- a/examples/team_examples/custom_team_arch.rs +++ b/examples/team_examples/custom_team_arch.rs @@ -79,7 +79,7 @@ impl LamellarArch for BlockStridedArch { let block = parent_pe / self.block_size; let start_block = self.start_pe / self.block_size; let remainder = parent_pe % self.block_size; - if block >= start_block && (block - start_block) % self.stride == 0 + if (block - start_block) % self.stride == 0 && self.start_pe <= *parent_pe && *parent_pe <= self.end_pe { diff --git a/lamellar_run.sh b/lamellar_run.sh index 5f3af138..1bb295f0 100755 --- a/lamellar_run.sh +++ b/lamellar_run.sh @@ -34,7 +34,7 @@ for pe in $(seq 0 $ENDPE); do echo "more threads ${E_CORE} than cores ${NPROC} " exit fi - LAMELLAR_BACKEND="shmem" LAMELLAR_MEM_SIZE=$((1*1024*1024*1024)) LAMELLAR_THREADS=$((THREADS)) LAMELLAR_NUM_PES=$NUMPES LAMELLAR_PE_ID=$pe LAMELLAR_JOB_ID=$JOBID $bin "${@:2}" &>> ${pe}_out.txt & + LAMELLAE_BACKEND="shmem" LAMELLAR_MEM_SIZE=$((1*1024*1024*1024)) LAMELLAR_THREADS=$((THREADS)) LAMELLAR_NUM_PES=$NUMPES LAMELLAR_PE_ID=$pe LAMELLAR_JOB_ID=$JOBID $bin "${@:2}" &>> ${pe}_out & S_CORE=$(($E_CORE )) E_CORE=$(($S_CORE + $THREADS)) done diff --git a/run_examples.sh b/run_examples.sh index bfdeded9..2147f82f 100755 --- a/run_examples.sh +++ b/run_examples.sh @@ -66,8 +66,6 @@ root=$PWD # cd .. 
# done -cargo build --release --features enable-rofi --features tokio-executor --examples -j 20 - local_results_dir=async_backends results_dir=${output_dir}/rofiverbs_lamellae/${local_results_dir} ### test using rofi verbs lamellae diff --git a/src/array.rs b/src/array.rs index 6406d68a..35276813 100644 --- a/src/array.rs +++ b/src/array.rs @@ -198,13 +198,11 @@ pub struct ReduceKey { } crate::inventory::collect!(ReduceKey); -// impl Dist for bool {} // lamellar_impl::generate_reductions_for_type_rt!(true, u8, usize); // lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, usize); // lamellar_impl::generate_reductions_for_type_rt!(false, f32); // lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32); -// lamellar_impl::generate_reductions_for_type_rt!(false, u128); -// lamellar_impl::generate_ops_for_type_rt!(true, false, true, u128); +// impl Dist for bool {} lamellar_impl::generate_reductions_for_type_rt!(true, u8, u16, u32, u64, usize); lamellar_impl::generate_reductions_for_type_rt!(false, u128); diff --git a/src/array/unsafe.rs b/src/array/unsafe.rs index bb182938..f9753806 100644 --- a/src/array/unsafe.rs +++ b/src/array/unsafe.rs @@ -157,33 +157,17 @@ impl UnsafeArray { if remaining_elems > 0 { per_pe_size += 1 } - // println!("new unsafe array {:?} {:?}", elem_per_pe, per_pe_size); - let rmr_t: MemoryRegion = - MemoryRegion::new(per_pe_size, team.lamellae.clone(), AllocationType::Global); - // let rmr = MemoryRegion::new( - // per_pe_size * std::mem::size_of::(), - // team.lamellae.clone(), - // AllocationType::Global, - // ); - // println!("new array {:?}",rmr_t.as_ptr()); - + // println!("new unsafe array {:?} {:?} {:?}", elem_per_pe, num_elems_local, per_pe_size); + let rmr = MemoryRegion::new( + per_pe_size * std::mem::size_of::(), + team.lamellae.clone(), + AllocationType::Global, + ); unsafe { - // for elem in rmr_t.as_mut_slice().expect("data should exist on pe") { - // *elem = std::mem::zeroed(); - // } - if std::mem::needs_drop::() { - // If `T` needs to be dropped then we have to do this one item at a time, in - // case one of the intermediate drops does a panic. - // slice.iter_mut().for_each(write_zeroes); - panic!("need drop not yet supported"); - } else { - // Otherwise we can be really fast and just fill everthing with zeros. - let len = std::mem::size_of_val::<[T]>(rmr_t.as_mut_slice().expect("data should exist on pe")); - unsafe { std::ptr::write_bytes(rmr_t.as_mut_ptr().expect("data should exist on pe") as *mut u8, 0u8, len) } - } + for elem in rmr.as_mut_slice().expect("data should exist on pe") { + *elem = 0; + } } - let rmr = unsafe { rmr_t.to_base::() }; - // println!("new array u8 {:?}",rmr.as_ptr()); let data = Darc::try_new_with_drop( team.clone(), @@ -251,30 +235,16 @@ impl UnsafeArray { if remaining_elems > 0 { per_pe_size += 1 } - let rmr_t: MemoryRegion = - MemoryRegion::new(per_pe_size, team.lamellae.clone(), AllocationType::Global); - // let rmr = MemoryRegion::new( - // per_pe_size * std::mem::size_of::(), - // team.lamellae.clone(), - // AllocationType::Global, - // ); - + let rmr = MemoryRegion::new( + per_pe_size * std::mem::size_of::(), + team.lamellae.clone(), + AllocationType::Global, + ); unsafe { - // for elem in rmr_t.as_mut_slice().expect("data should exist on pe") { - // *elem = std::mem::zeroed(); - // } - if std::mem::needs_drop::() { - // If `T` needs to be dropped then we have to do this one item at a time, in - // case one of the intermediate drops does a panic. 
- // slice.iter_mut().for_each(write_zeroes); - panic!("need drop not yet supported"); - } else { - // Otherwise we can be really fast and just fill everthing with zeros. - let len = std::mem::size_of_val::<[T]>(rmr_t.as_mut_slice().expect("data should exist on pe")); - unsafe { std::ptr::write_bytes(rmr_t.as_mut_ptr().expect("data should exist on pe") as *mut u8, 0u8, len) } - } + for elem in rmr.as_mut_slice().expect("data should exist on pe") { + *elem = 0; + } } - let rmr = unsafe { rmr_t.to_base::() }; let data = Darc::try_new_with_drop( team.clone(), @@ -457,7 +427,6 @@ impl UnsafeArray { pub(crate) fn local_as_mut_ptr(&self) -> *mut T { let u8_ptr = unsafe { self.inner.local_as_mut_ptr() }; // self.inner.data.mem_region.as_casted_mut_ptr::().unwrap(); - // println!("ptr: {:?} {:?}", u8_ptr, u8_ptr as *const T); u8_ptr as *mut T } @@ -1842,7 +1811,6 @@ impl UnsafeArrayInner { self.data.mem_region.as_casted_mut_ptr::().expect( "memory doesnt exist on this pe (this should not happen for arrays currently)", ); - // println!("u8 ptr: {:?}", ptr); // let len = self.size; let my_pe = self.data.my_pe; let num_pes = self.data.num_pes; diff --git a/src/darc.rs b/src/darc.rs index b2614497..7c5c0183 100644 --- a/src/darc.rs +++ b/src/darc.rs @@ -912,15 +912,6 @@ impl Darc { } } -fn calc_padding(addr: usize, align: usize) -> usize { - let rem = addr % align; - if rem == 0 { - 0 - } else { - align - rem - } -} - impl Darc { #[doc(alias = "Collective")] /// Constructs a new `Darc` on the PEs specified by team. @@ -976,34 +967,12 @@ impl Darc { AllocationType::Sub(team_rt.get_pes()) }; - //The DarcInner data structure - let mut size = std::mem::size_of::>(); - - // Ref Cnt Array - let padding = calc_padding(size, std::mem::align_of::()); - let ref_cnt_offset = size + padding; - size += padding + team_rt.num_pes * std::mem::size_of::(); - - // total ref cnt array - let padding = calc_padding(size, std::mem::align_of::()); - let total_ref_cnt_offset = size + padding; - size += padding + team_rt.num_pes * std::mem::size_of::(); - - // mode array - let padding = calc_padding(size, std::mem::align_of::()); - let mode_offset = size + padding; - size += padding + team_rt.num_pes * std::mem::size_of::(); - - //mode ref cnt array - let padding = calc_padding(size, std::mem::align_of::()); - let mode_ref_cnt_offset = size + padding; - size += padding + team_rt.num_pes * std::mem::size_of::(); - - //mode_barrier array - let padding = calc_padding(size, std::mem::align_of::()); - let mode_barrier_offset = size + padding; - size += padding + team_rt.num_pes * std::mem::size_of::(); - + let size = std::mem::size_of::>() + + team_rt.num_pes * std::mem::size_of::() + + team_rt.num_pes * std::mem::size_of::() + + team_rt.num_pes * std::mem::size_of::() + + team_rt.num_pes * std::mem::size_of::() + + team_rt.num_pes * std::mem::size_of::(); // println!("creating new darc"); team_rt.async_barrier().await; @@ -1039,30 +1008,25 @@ impl Darc { weak_local_cnt: AtomicUsize::new(0), dist_cnt: AtomicUsize::new(0), total_dist_cnt: AtomicUsize::new(0), - // ref_cnt_addr: addr + std::mem::size_of::>(), - // total_ref_cnt_addr: addr - // + std::mem::size_of::>() - // + team_rt.num_pes * std::mem::size_of::(), - // mode_addr: addr - // + std::mem::size_of::>() - // + team_rt.num_pes * std::mem::size_of::() - // + team_rt.num_pes * std::mem::size_of::(), - // mode_ref_cnt_addr: addr - // + std::mem::size_of::>() - // + team_rt.num_pes * std::mem::size_of::() - // + team_rt.num_pes * std::mem::size_of::() - // + team_rt.num_pes * 
std::mem::size_of::(), - // mode_barrier_addr: addr - // + std::mem::size_of::>() - // + team_rt.num_pes * std::mem::size_of::() - // + team_rt.num_pes * std::mem::size_of::() - // + team_rt.num_pes * std::mem::size_of::() - // + team_rt.num_pes * std::mem::size_of::(), - ref_cnt_addr: addr + ref_cnt_offset, - total_ref_cnt_addr: addr + total_ref_cnt_offset, - mode_addr: addr + mode_offset, - mode_ref_cnt_addr: addr + mode_ref_cnt_offset, - mode_barrier_addr: addr + mode_barrier_offset, + ref_cnt_addr: addr + std::mem::size_of::>(), + total_ref_cnt_addr: addr + + std::mem::size_of::>() + + team_rt.num_pes * std::mem::size_of::(), + mode_addr: addr + + std::mem::size_of::>() + + team_rt.num_pes * std::mem::size_of::() + + team_rt.num_pes * std::mem::size_of::(), + mode_ref_cnt_addr: addr + + std::mem::size_of::>() + + team_rt.num_pes * std::mem::size_of::() + + team_rt.num_pes * std::mem::size_of::() + + team_rt.num_pes * std::mem::size_of::(), + mode_barrier_addr: addr + + std::mem::size_of::>() + + team_rt.num_pes * std::mem::size_of::() + + team_rt.num_pes * std::mem::size_of::() + + team_rt.num_pes * std::mem::size_of::() + + team_rt.num_pes * std::mem::size_of::(), barrier: barrier_ptr, // mode_barrier_rounds: num_rounds, am_counters: am_counters_ptr, @@ -1118,33 +1082,12 @@ impl Darc { AllocationType::Sub(team_rt.get_pes()) }; - //The DarcInner data structure - let mut size = std::mem::size_of::>(); - - // Ref Cnt Array - let padding = calc_padding(size, std::mem::align_of::()); - let ref_cnt_offset = size + padding; - size += padding + team_rt.num_pes * std::mem::size_of::(); - - // total ref cnt array - let padding = calc_padding(size, std::mem::align_of::()); - let total_ref_cnt_offset = size + padding; - size += padding + team_rt.num_pes * std::mem::size_of::(); - - // mode array - let padding = calc_padding(size, std::mem::align_of::()); - let mode_offset = size + padding; - size += padding + team_rt.num_pes * std::mem::size_of::(); - - //mode ref cnt array - let padding = calc_padding(size, std::mem::align_of::()); - let mode_ref_cnt_offset = size + padding; - size += padding + team_rt.num_pes * std::mem::size_of::(); - - //mode_barrier array - let padding = calc_padding(size, std::mem::align_of::()); - let mode_barrier_offset = size + padding; - size += padding + team_rt.num_pes * std::mem::size_of::(); + let size = std::mem::size_of::>() + + team_rt.num_pes * std::mem::size_of::() + + team_rt.num_pes * std::mem::size_of::() + + team_rt.num_pes * std::mem::size_of::() + + team_rt.num_pes * std::mem::size_of::() + + team_rt.num_pes * std::mem::size_of::(); // println!("creating new darc"); team_rt.tasking_barrier(); @@ -1180,30 +1123,25 @@ impl Darc { weak_local_cnt: AtomicUsize::new(0), dist_cnt: AtomicUsize::new(0), total_dist_cnt: AtomicUsize::new(0), - // ref_cnt_addr: addr + std::mem::size_of::>(), - // total_ref_cnt_addr: addr - // + std::mem::size_of::>() - // + team_rt.num_pes * std::mem::size_of::(), - // mode_addr: addr - // + std::mem::size_of::>() - // + team_rt.num_pes * std::mem::size_of::() - // + team_rt.num_pes * std::mem::size_of::(), - // mode_ref_cnt_addr: addr - // + std::mem::size_of::>() - // + team_rt.num_pes * std::mem::size_of::() - // + team_rt.num_pes * std::mem::size_of::() - // + team_rt.num_pes * std::mem::size_of::(), - // mode_barrier_addr: addr - // + std::mem::size_of::>() - // + team_rt.num_pes * std::mem::size_of::() - // + team_rt.num_pes * std::mem::size_of::() - // + team_rt.num_pes * std::mem::size_of::() - // + team_rt.num_pes * 
std::mem::size_of::(), - ref_cnt_addr: addr + ref_cnt_offset, - total_ref_cnt_addr: addr + total_ref_cnt_offset, - mode_addr: addr + mode_offset, - mode_ref_cnt_addr: addr + mode_ref_cnt_offset, - mode_barrier_addr: addr + mode_barrier_offset, + ref_cnt_addr: addr + std::mem::size_of::>(), + total_ref_cnt_addr: addr + + std::mem::size_of::>() + + team_rt.num_pes * std::mem::size_of::(), + mode_addr: addr + + std::mem::size_of::>() + + team_rt.num_pes * std::mem::size_of::() + + team_rt.num_pes * std::mem::size_of::(), + mode_ref_cnt_addr: addr + + std::mem::size_of::>() + + team_rt.num_pes * std::mem::size_of::() + + team_rt.num_pes * std::mem::size_of::() + + team_rt.num_pes * std::mem::size_of::(), + mode_barrier_addr: addr + + std::mem::size_of::>() + + team_rt.num_pes * std::mem::size_of::() + + team_rt.num_pes * std::mem::size_of::() + + team_rt.num_pes * std::mem::size_of::() + + team_rt.num_pes * std::mem::size_of::(), barrier: barrier_ptr, // mode_barrier_rounds: num_rounds, am_counters: am_counters_ptr, diff --git a/src/lamellae.rs b/src/lamellae.rs index 2c7e249c..69ce6241 100755 --- a/src/lamellae.rs +++ b/src/lamellae.rs @@ -228,16 +228,14 @@ pub(crate) fn create_lamellae(backend: Backend) -> LamellaeBuilder { #[cfg(feature = "enable-rofi")] Backend::Rofi => { let provider = match std::env::var("LAMELLAR_ROFI_PROVIDER") { - Ok(p) => p, - Err(_) => "verbs".to_owned(), - // Ok(p) => match p.as_str() { - // "verbs" => "verbs", - // "tcp" => "tcp", - // _ => "verbs", - // }, - // Err(_) => "verbs", + Ok(p) => match p.as_str() { + "verbs" => "verbs", + "shm" => "shm", + _ => "verbs", + }, + Err(_) => "verbs", }; - LamellaeBuilder::RofiBuilder(RofiBuilder::new(&provider)) + LamellaeBuilder::RofiBuilder(RofiBuilder::new(provider)) } Backend::Shmem => LamellaeBuilder::ShmemBuilder(ShmemBuilder::new()), Backend::Local => LamellaeBuilder::Local(Local::new()), diff --git a/src/lamellae/command_queues.rs b/src/lamellae/command_queues.rs index ac32e1f6..f86640d9 100644 --- a/src/lamellae/command_queues.rs +++ b/src/lamellae/command_queues.rs @@ -16,7 +16,7 @@ use std::sync::Arc; //use tracing::*; // const CMD_BUF_LEN: usize = 50000; // this is the number of slots for each PE -// const NUM_REQ_SLOTS: usize = CMD_Q_LEN; // max requests at any given time -- probably have this be a multiple of num PES + // const NUM_REQ_SLOTS: usize = CMD_Q_LEN; // max requests at any given time -- probably have this be a multiple of num PES const CMD_BUFS_PER_PE: usize = 2; // lazy_static! { @@ -62,35 +62,24 @@ enum Cmd { //#[tracing::instrument(skip_all)] fn calc_hash(addr: usize, len: usize) -> usize { //we split into a u64 slice and a u8 slice as u64 seems to compute faster. 
- let num_usizes = len / std::mem::size_of::(); - //let u64_slice = unsafe { std::slice::from_raw_parts(addr as *const u64, num_u64s) }; - let num_u8s = len % std::mem::size_of::(); + let num_u64s = len / std::mem::size_of::(); + let u64_slice = unsafe { std::slice::from_raw_parts(addr as *const u64, num_u64s) }; + let num_u8s = len % std::mem::size_of::(); let u8_slice = unsafe { std::slice::from_raw_parts( - (addr + num_usizes * std::mem::size_of::()) as *const u8, + (addr + num_u64s * std::mem::size_of::()) as *const u8, num_u8s, ) }; - ((0..num_usizes) - .map(|x| unsafe { Wrapping((addr as *const usize).offset(x as isize).read_unaligned()) }) + (u64_slice + .iter() + .map(|x| Wrapping(*x as usize)) .sum::>() + u8_slice .iter() .map(|x| Wrapping(*x as usize)) .sum::>()) .0 - - // let u8_slice = unsafe { - // std::slice::from_raw_parts( - // (addr) as *const u8, - // num_u8s, - // ) - // }; - // u8_slice - // .iter() - // .map(|x| Wrapping(*x as usize)) - // .sum::>() - // .0 } impl CmdMsg { @@ -1076,13 +1065,10 @@ impl InnerCQ { } let ser_data = ser_data.unwrap(); // if print{ - // ser_data.print(); + // ser_data.print(); // } self.get_serialized_data(src, cmd, &ser_data).await; - // println!( - // "received data {:?}", - // &ser_data.header_and_data_as_bytes()[0..10] - // ); + // println!("received data {:?}",ser_data.header_and_data_as_bytes()[0..10]); self.recv_cnt.fetch_add(1, Ordering::SeqCst); // println!("received: {:?} {:?} cmd: {:?} {:?}",cmd.dsize,ser_data.len(),cmd,&ser_data.header_and_data_as_bytes()[0..20]); // SerializedData::RofiData(ser_data) @@ -1093,7 +1079,7 @@ impl InnerCQ { async fn get_cmd_buf(&self, src: usize, cmd: CmdMsg) -> usize { let mut data = self .comm - .rt_alloc(cmd.dsize as usize, std::mem::align_of::()); + .rt_alloc(cmd.dsize as usize, std::mem::align_of::()); let mut timer = std::time::Instant::now(); while data.is_err() && self.active.load(Ordering::SeqCst) != CmdQStatus::Panic as u8 { async_std::task::yield_now().await; @@ -1101,7 +1087,7 @@ impl InnerCQ { self.send_alloc(cmd.dsize); data = self .comm - .rt_alloc(cmd.dsize as usize, std::mem::align_of::()); + .rt_alloc(cmd.dsize as usize, std::mem::align_of::()); // println!("cq 874 data {:?}",data.is_ok()); if timer.elapsed().as_secs_f64() > config().deadlock_timeout { println!("get cmd buf stuck waiting for alloc"); @@ -1342,8 +1328,7 @@ impl CommandQueue { // &data.header_and_data_as_bytes()[0..20] // ); data.increment_cnt(); //or we could implement something like an into_raw here... 
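
Note: the `calc_hash` hunks in this series alternate between building a `&[u64]` slice over the buffer and doing per-word `read_unaligned` loads; the latter avoids constructing a slice from a possibly misaligned pointer, which trips debug assertions. Either way the checksum is the same idea: a wrapping sum of word-sized chunks plus the leftover bytes. A minimal, self-contained sketch of that idea over a byte slice (illustrative only, not the crate's implementation):

use std::num::Wrapping;

// A wrapping checksum in the same spirit as `calc_hash`: sum the buffer as
// 8-byte words, then fold in the leftover bytes. Taking a slice instead of a
// raw (addr, len) pair means no unsafe code and no alignment concerns here.
fn checksum(buf: &[u8]) -> usize {
    let mut chunks = buf.chunks_exact(std::mem::size_of::<u64>());
    let words: Wrapping<usize> = chunks
        .by_ref()
        .map(|c| {
            let mut w = [0u8; 8];
            w.copy_from_slice(c); // each chunk is exactly 8 bytes
            Wrapping(u64::from_ne_bytes(w) as usize)
        })
        .sum();
    let tail: Wrapping<usize> = chunks
        .remainder()
        .iter()
        .map(|&b| Wrapping(b as usize))
        .sum();
    (words + tail).0
}

fn main() {
    let data = vec![1u8; 20]; // two full 8-byte words plus 4 leftover bytes
    println!("checksum = {}", checksum(&data));
}
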
- // println!("sending data {:?}", data.header_and_data_as_bytes()); - + // println!("sending data {:?}",data.header_and_data_as_bytes()); self.cq.send(data.relative_addr, data.len, dst, hash).await; } SerializedData::ShmemData(ref data) => { @@ -1531,7 +1516,7 @@ impl CommandQueue { //#[tracing::instrument(skip_all)] pub fn mem_per_pe() -> usize { - (config().cmd_buf_len * config().cmd_buf_cnt + 4) * std::mem::size_of::() + (config().cmd_buf_len * config().cmd_buf_cnt + 4) * std::mem::size_of::() } } diff --git a/src/lamellae/rofi/rofi_comm.rs b/src/lamellae/rofi/rofi_comm.rs index ff847f6c..ab2fc34c 100644 --- a/src/lamellae/rofi/rofi_comm.rs +++ b/src/lamellae/rofi/rofi_comm.rs @@ -94,10 +94,13 @@ impl RofiComm { //#[tracing::instrument(skip_all)] unsafe fn fill_buffer(&self, dst_addr: &mut [T], val: R) { - let num_r = (dst_addr.len() * std::mem::size_of::()) / std::mem::size_of::(); - let r_ptr = dst_addr.as_ptr() as *mut T as *mut R; - for i in 0..num_r { - r_ptr.offset(i as isize).write_unaligned(val); + // println!("{:?} {:?} {:?} {:?} {:?}",std::mem::size_of::(),std::mem::size_of::(),(dst_addr.len()*std::mem::size_of::()),(dst_addr.len()*std::mem::size_of::())/std::mem::size_of::(),(dst_addr.len()*std::mem::size_of::())%std::mem::size_of::()); + let bytes = std::slice::from_raw_parts_mut( + dst_addr.as_ptr() as *mut T as *mut R, + (dst_addr.len() * std::mem::size_of::()) / std::mem::size_of::(), + ); + for elem in bytes.iter_mut() { + *elem = val; } } //#[tracing::instrument(skip_all)] @@ -117,26 +120,24 @@ impl RofiComm { } } //#[tracing::instrument(skip_all)] - unsafe fn check_buffer_elems( + unsafe fn check_buffer_elems( &self, dst_addr: &mut [T], val: R, ) -> TxResult<()> { - let num_r = (dst_addr.len() * std::mem::size_of::()) / std::mem::size_of::(); - let r_ptr = dst_addr.as_ptr() as *mut T as *mut R; - + let bytes = std::slice::from_raw_parts_mut( + dst_addr.as_ptr() as *mut T as *mut R, + (dst_addr.len() * std::mem::size_of::()) / std::mem::size_of::(), + ); let mut timer = std::time::Instant::now(); - for i in 0..num_r - 2 { - while r_ptr.offset(i as isize).read_unaligned() == val - && r_ptr.offset(i as isize + 1).read_unaligned() == val - { + for i in 0..(bytes.len() as isize - 2) { + while bytes[i as usize] == val && bytes[i as usize + 1] == val { if timer.elapsed().as_secs_f64() > 1.0 { // println!( - // "{:?}/{:?}: {:?} {:?} {:?}", + // "{:?}: {:?} {:?} {:?}", // i, - // num_r, - // r_ptr.offset(i as isize).read_unaligned(), - // r_ptr.offset(i as isize + 1).read_unaligned(), + // bytes[i as usize], + // bytes[i as usize + 1], // val // ); return Err(TxError::GetError); @@ -144,10 +145,9 @@ impl RofiComm { //hopefully magic number doesnt appear twice in a row std::thread::yield_now(); } - timer = std::time::Instant::now(); } timer = std::time::Instant::now(); - while r_ptr.offset(num_r as isize - 1).read_unaligned() == val { + while bytes[bytes.len() - 1] == val { if timer.elapsed().as_secs_f64() > 1.0 { // println!("{:?}", bytes[bytes.len() - 1]); return Err(TxError::GetError); @@ -263,7 +263,6 @@ impl CommOps for RofiComm { } //#[tracing::instrument(skip_all)] fn rt_alloc(&self, size: usize, align: usize) -> AllocResult { - // println!("rt_alloc size {size} align {align}"); // let size = size + size%8; let allocs = self.alloc.read(); for alloc in allocs.iter() { @@ -361,16 +360,12 @@ impl CommOps for RofiComm { // } } else { unsafe { - // println!( - // "[{:?}]-({:?}) memcopy {:?} into {:x} src align {:?} dst align {:?}", - // pe, - // src_addr.as_ptr(), - // 
src_addr.len(), - // dst_addr, - // src_addr.as_ptr().align_offset(std::mem::align_of::()), - // (dst_addr as *mut T).align_offset(std::mem::align_of::()), - // ); - std::ptr::copy(src_addr.as_ptr(), dst_addr as *mut T, src_addr.len()); + // println!("[{:?}]-({:?}) memcopy {:?} into {:x}",pe,src_addr.as_ptr(),src_addr.len(),dst_addr); + std::ptr::copy_nonoverlapping( + src_addr.as_ptr(), + dst_addr as *mut T, + src_addr.len(), + ); } } // req @@ -399,7 +394,11 @@ impl CommOps for RofiComm { } else { unsafe { // println!("[{:?}]-({:?}) memcopy {:?}",pe,src_addr.as_ptr()); - std::ptr::copy(src_addr.as_ptr(), dst_addr as *mut T, src_addr.len()); + std::ptr::copy_nonoverlapping( + src_addr.as_ptr(), + dst_addr as *mut T, + src_addr.len(), + ); } } // req @@ -432,7 +431,7 @@ impl CommOps for RofiComm { } // drop(lock); unsafe { - std::ptr::copy(src_addr.as_ptr(), dst_addr as *mut T, src_addr.len()); + std::ptr::copy_nonoverlapping(src_addr.as_ptr(), dst_addr as *mut T, src_addr.len()); } self.put_amt.fetch_add( src_addr.len() * (self.num_pes - 1) * std::mem::size_of::(), @@ -488,7 +487,11 @@ impl CommOps for RofiComm { } else { // println!("[{:?}]-{:?} {:?} {:?}",self.my_pe,src_addr as *const T,dst_addr.as_mut_ptr(),dst_addr.len()); unsafe { - std::ptr::copy(src_addr as *const T, dst_addr.as_mut_ptr(), dst_addr.len()); + std::ptr::copy_nonoverlapping( + src_addr as *const T, + dst_addr.as_mut_ptr(), + dst_addr.len(), + ); } } // req @@ -564,7 +567,11 @@ impl CommOps for RofiComm { } } else { unsafe { - std::ptr::copy(src_addr as *const T, dst_addr.as_mut_ptr(), dst_addr.len()); + std::ptr::copy_nonoverlapping( + src_addr as *const T, + dst_addr.as_mut_ptr(), + dst_addr.len(), + ); } } } @@ -610,7 +617,11 @@ impl CommOps for RofiComm { // }; } else { unsafe { - std::ptr::copy(src_addr as *const T, dst_addr.as_mut_ptr(), dst_addr.len()); + std::ptr::copy_nonoverlapping( + src_addr as *const T, + dst_addr.as_mut_ptr(), + dst_addr.len(), + ); } } // req diff --git a/src/lamellar_alloc.rs b/src/lamellar_alloc.rs index 8069382a..1a6471ab 100644 --- a/src/lamellar_alloc.rs +++ b/src/lamellar_alloc.rs @@ -362,15 +362,8 @@ impl LamellarAlloc for BTreeAlloc { // a + padding, // self.free_space.load(Ordering::SeqCst) // ); - let new_addr = a + padding; - // let rem = new_addr % align; - // let rem_16 = new_addr % 16; - // println!( - // "alloc addr {:x?} {:x?} {new_addr} {a} {padding} {rem} {align} {rem_16}", - // a + padding, - // new_addr, - // ); - Some(new_addr) + + Some(a + padding) } else { None }; @@ -397,7 +390,7 @@ impl LamellarAlloc for BTreeAlloc { fn free(&self, addr: usize) -> Result<(), usize> { let &(ref lock, ref _cvar) = &*self.allocated_addrs; let mut allocated_addrs = lock.lock(); - // println!("trying to free: {:x?} {:?}", addr, addr); + if let Some((size, padding)) = allocated_addrs.remove(&addr) { // println!("allocated_addrs: {:?}", allocated_addrs); let full_size = size + padding; diff --git a/src/memregion.rs b/src/memregion.rs index dfdbfa6e..1d44a1ab 100644 --- a/src/memregion.rs +++ b/src/memregion.rs @@ -730,9 +730,8 @@ impl MemoryRegion { alloc: AllocationType, ) -> Result, anyhow::Error> { // println!( - // "creating new lamellar memory region size: {:?} align: {:?}", - // size * std::mem::size_of::(), - // std::mem::align_of::() + // "creating new lamellar memory region {:?}", + // size * std::mem::size_of::() // ); let mut mode = Mode::Shared; let addr = if size > 0 { @@ -792,25 +791,23 @@ impl MemoryRegion { #[allow(dead_code)] //#[tracing::instrument(skip_all)] - 
pub(crate) unsafe fn to_base(mut self) -> MemoryRegion { + pub(crate) unsafe fn to_base(self) -> MemoryRegion { //this is allowed as we consume the old object.. assert_eq!( self.num_bytes % std::mem::size_of::(), 0, "Error converting memregion to new base, does not align" ); - // MemoryRegion { - // addr: self.addr, //TODO: out of memory... - // pe: self.pe, - // size: self.num_bytes / std::mem::size_of::(), - // num_bytes: self.num_bytes, - // backend: self.backend, - // rdma: self.rdma.clone(), - // mode: self.mode, - // phantom: PhantomData, - // } - self.size = self.num_bytes / std::mem::size_of::(); - std::mem::transmute(self) //we do this because other wise self gets dropped and frees the underlying data (we could also set addr to 0 in self) + MemoryRegion { + addr: self.addr, //TODO: out of memory... + pe: self.pe, + size: self.num_bytes / std::mem::size_of::(), + num_bytes: self.num_bytes, + backend: self.backend, + rdma: self.rdma.clone(), + mode: self.mode, + phantom: PhantomData, + } } // } diff --git a/src/memregion/one_sided.rs b/src/memregion/one_sided.rs index dcdedff2..e248e83d 100644 --- a/src/memregion/one_sided.rs +++ b/src/memregion/one_sided.rs @@ -355,8 +355,11 @@ impl OneSidedMemoryRegion { team: &std::pin::Pin>, lamellae: Arc, ) -> Result, anyhow::Error> { - let mr_t: MemoryRegion = MemoryRegion::try_new(size, lamellae, AllocationType::Local)?; - let mr = unsafe { mr_t.to_base::() }; + let mr = MemoryRegion::try_new( + size * std::mem::size_of::(), + lamellae, + AllocationType::Local, + )?; let pe = mr.pe; let id = ID_COUNTER.fetch_add(1, Ordering::Relaxed); diff --git a/src/memregion/shared.rs b/src/memregion/shared.rs index 39d975a7..3390588b 100644 --- a/src/memregion/shared.rs +++ b/src/memregion/shared.rs @@ -99,11 +99,17 @@ impl SharedMemoryRegion { alloc: AllocationType, ) -> Result, anyhow::Error> { // println!("creating new shared mem region {:?} {:?}",size,alloc); - let mr_t: MemoryRegion = MemoryRegion::try_new(size, team.lamellae.clone(), alloc)?; - let mr = unsafe { mr_t.to_base::() }; Ok(SharedMemoryRegion { - mr: Darc::try_new(team.clone(), mr, crate::darc::DarcMode::Darc) - .expect("memregions can only be created on a member of the team"), + mr: Darc::try_new( + team.clone(), + MemoryRegion::try_new( + size * std::mem::size_of::(), + team.lamellae.clone(), + alloc, + )?, + crate::darc::DarcMode::Darc, + ) + .expect("memregions can only be created on a member of the team"), sub_region_offset: 0, sub_region_size: size, phantom: PhantomData, diff --git a/src/scheduler.rs b/src/scheduler.rs index 7774654b..82d95f24 100755 --- a/src/scheduler.rs +++ b/src/scheduler.rs @@ -168,7 +168,6 @@ impl Scheduler { let max_ams = self.max_ams.clone(); let am_stall_mark = self.am_stall_mark.fetch_add(1, Ordering::Relaxed); let ame = self.active_message_engine.clone(); - // println!("am ptr {:p} ", &am); let am_future = async move { // let start_tid = thread::current().id(); num_ams.fetch_add(1, Ordering::Relaxed); From 6bdb375099f2fc94a3756592c58296e673e3bc2b Mon Sep 17 00:00:00 2001 From: Ryan Friese Date: Wed, 5 Jun 2024 15:18:06 -0700 Subject: [PATCH 040/116] Update Cargo.toml switch from crates to git for rofi-sys temporarily --- Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 678c5091..51696abf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,8 +13,8 @@ categories = ["asynchronous","concurrency", "network-programming","science"] [dependencies] lamellar-impl = { version = "0.6.0", path = "impl" } 
-rofisys = { version ="0.3", optional = true } -#rofisys = {git = "https://github.com/pnnl/rofi-sys.git", branch = "master", optional = true} +#rofisys = { version ="0.3", optional = true } +rofisys = {git = "https://github.com/pnnl/rofi-sys.git", branch = "master", optional = true} inventory = "0.3" serde = { version = "1.0.147", features = ["derive"] } serde_bytes = "0.11.7" From 994739665e15fdb50547c75a8e87b1272e0a5b8e Mon Sep 17 00:00:00 2001 From: Conghao Liu Date: Tue, 11 Jun 2024 09:10:18 -0500 Subject: [PATCH 041/116] fix batch_load when input is empty --- src/array/operations.rs | 21 +++++++++++++++++++++ src/array/unsafe/operations.rs | 3 +++ 2 files changed, 24 insertions(+) diff --git a/src/array/operations.rs b/src/array/operations.rs index d595c763..37a1cc35 100644 --- a/src/array/operations.rs +++ b/src/array/operations.rs @@ -453,6 +453,9 @@ impl<'a, T: Dist> OpInput<'a, T> for &'a [T] { // println!("slice as op input"); let len = self.len(); let mut iters = vec![]; + if len == 0 { + return (iters,len) + } let num = if len < 1000 { 1 } else { @@ -514,6 +517,9 @@ impl<'a, T: Dist> OpInput<'a, T> for &'a mut [T] { let len = self.len(); let mut iters = vec![]; + if len == 0 { + return (iters, len); + } let num = if len < 1000 { 1 } else { @@ -593,6 +599,9 @@ impl<'a, T: Dist> OpInput<'a, T> for Vec { fn as_op_input(self) -> (Vec>, usize) { // println!("vec as op input"); let len = self.len(); + if len == 0 { + return (vec![], len) + } let num = if len < 1000 { 1 } else { @@ -717,6 +726,9 @@ impl<'a, T: Dist> OpInput<'a, T> for &'a LocalLockLocalData { // println!("LocalLockLocalData as_op_input {:?}", self.deref()); let len = self.len(); let mut iters = vec![]; + if len == 0 { + return (iters, len); + } let my_pe = self.array.my_pe(); if let Some(_start_index) = self.array.array.inner.start_index_for_pe(my_pe) { let num = if len < 1000 { @@ -763,6 +775,9 @@ impl<'a, T: Dist> OpInput<'a, T> for &'a GlobalLockLocalData { // println!("GlobalLockLocalData as_op_input"); let len = self.len(); let mut iters = vec![]; + if len == 0 { + return (iters, len); + } let my_pe = self.array.my_pe(); if let Some(_start_index) = self.array.array.inner.start_index_for_pe(my_pe) { let num = if len < 1000 { @@ -843,6 +858,9 @@ impl<'a, T: Dist + ElementOps> OpInput<'a, T> for &GenericAtomicLocalData { let local_data = self.clone(); let len = local_data.len(); let mut iters = vec![]; + if len == 0 { + return (iters, len); + } let my_pe = self.array.my_pe(); if let Some(_start_index) = self.array.array.inner.start_index_for_pe(my_pe) { let num = if len < 1000 { @@ -894,6 +912,9 @@ impl<'a, T: Dist + ElementOps> OpInput<'a, T> for &NativeAtomicLocalData { let local_data = self.clone(); let len = local_data.len(); let mut iters = vec![]; + if len == 0 { + return (iters, len); + } let my_pe = self.array.my_pe(); if let Some(_start_index) = self.array.array.inner.start_index_for_pe(my_pe) { let num = if len < 1000 { diff --git a/src/array/unsafe/operations.rs b/src/array/unsafe/operations.rs index df0a319b..515cfaa1 100644 --- a/src/array/unsafe/operations.rs +++ b/src/array/unsafe/operations.rs @@ -409,6 +409,9 @@ impl UnsafeArray { } else { VecDeque::new() }; + if res.len() == 0 { + return ArrayFetchBatchOpHandle::new(byte_array, res, 0); + } ArrayFetchBatchOpHandle::new(byte_array, res, std::cmp::max(i_len, v_len)) } From bfac98dea39225bc32c5811492d3a85b1ee089cb Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Sat, 25 May 2024 15:59:03 -0700 Subject: [PATCH 042/116] use approriate APIs to handle debug version asserts for low level communication --- examples/hello_world/hello_world_am.rs | 3 + examples/team_examples/custom_team_arch.rs | 2 +- lamellar_run.sh | 2 +- run_examples.sh | 2 + src/array.rs | 4 +- src/array/unsafe.rs | 66 ++++++--- src/darc.rs | 162 ++++++++++++++------- src/lamellae.rs | 16 +- src/lamellae/command_queues.rs | 43 ++++-- src/lamellae/rofi/rofi_comm.rs | 79 +++++----- src/lamellar_alloc.rs | 13 +- src/memregion.rs | 29 ++-- src/memregion/one_sided.rs | 7 +- src/memregion/shared.rs | 14 +- src/scheduler.rs | 1 + 15 files changed, 276 insertions(+), 167 deletions(-) diff --git a/examples/hello_world/hello_world_am.rs b/examples/hello_world/hello_world_am.rs index 1e3ea685..301a5ef6 100644 --- a/examples/hello_world/hello_world_am.rs +++ b/examples/hello_world/hello_world_am.rs @@ -37,3 +37,6 @@ fn main() { //wait for the request to complete world.block_on(request); } //when world drops there is an implicit world.barrier() that occurs + + + \ No newline at end of file diff --git a/examples/team_examples/custom_team_arch.rs b/examples/team_examples/custom_team_arch.rs index 6879b99e..72d3c00f 100644 --- a/examples/team_examples/custom_team_arch.rs +++ b/examples/team_examples/custom_team_arch.rs @@ -79,7 +79,7 @@ impl LamellarArch for BlockStridedArch { let block = parent_pe / self.block_size; let start_block = self.start_pe / self.block_size; let remainder = parent_pe % self.block_size; - if (block - start_block) % self.stride == 0 + if block >= start_block && (block - start_block) % self.stride == 0 && self.start_pe <= *parent_pe && *parent_pe <= self.end_pe { diff --git a/lamellar_run.sh b/lamellar_run.sh index 1bb295f0..5f3af138 100755 --- a/lamellar_run.sh +++ b/lamellar_run.sh @@ -34,7 +34,7 @@ for pe in $(seq 0 $ENDPE); do echo "more threads ${E_CORE} than cores ${NPROC} " exit fi - LAMELLAE_BACKEND="shmem" LAMELLAR_MEM_SIZE=$((1*1024*1024*1024)) LAMELLAR_THREADS=$((THREADS)) LAMELLAR_NUM_PES=$NUMPES LAMELLAR_PE_ID=$pe LAMELLAR_JOB_ID=$JOBID $bin "${@:2}" &>> ${pe}_out & + LAMELLAR_BACKEND="shmem" LAMELLAR_MEM_SIZE=$((1*1024*1024*1024)) LAMELLAR_THREADS=$((THREADS)) LAMELLAR_NUM_PES=$NUMPES LAMELLAR_PE_ID=$pe LAMELLAR_JOB_ID=$JOBID $bin "${@:2}" &>> ${pe}_out.txt & S_CORE=$(($E_CORE )) E_CORE=$(($S_CORE + $THREADS)) done diff --git a/run_examples.sh b/run_examples.sh index 2147f82f..bfdeded9 100755 --- a/run_examples.sh +++ b/run_examples.sh @@ -66,6 +66,8 @@ root=$PWD # cd .. 
# done +cargo build --release --features enable-rofi --features tokio-executor --examples -j 20 + local_results_dir=async_backends results_dir=${output_dir}/rofiverbs_lamellae/${local_results_dir} ### test using rofi verbs lamellae diff --git a/src/array.rs b/src/array.rs index 35276813..6406d68a 100644 --- a/src/array.rs +++ b/src/array.rs @@ -198,11 +198,13 @@ pub struct ReduceKey { } crate::inventory::collect!(ReduceKey); +// impl Dist for bool {} // lamellar_impl::generate_reductions_for_type_rt!(true, u8, usize); // lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, usize); // lamellar_impl::generate_reductions_for_type_rt!(false, f32); // lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32); -// impl Dist for bool {} +// lamellar_impl::generate_reductions_for_type_rt!(false, u128); +// lamellar_impl::generate_ops_for_type_rt!(true, false, true, u128); lamellar_impl::generate_reductions_for_type_rt!(true, u8, u16, u32, u64, usize); lamellar_impl::generate_reductions_for_type_rt!(false, u128); diff --git a/src/array/unsafe.rs b/src/array/unsafe.rs index f9753806..bb182938 100644 --- a/src/array/unsafe.rs +++ b/src/array/unsafe.rs @@ -157,17 +157,33 @@ impl UnsafeArray { if remaining_elems > 0 { per_pe_size += 1 } - // println!("new unsafe array {:?} {:?} {:?}", elem_per_pe, num_elems_local, per_pe_size); - let rmr = MemoryRegion::new( - per_pe_size * std::mem::size_of::(), - team.lamellae.clone(), - AllocationType::Global, - ); + // println!("new unsafe array {:?} {:?}", elem_per_pe, per_pe_size); + let rmr_t: MemoryRegion = + MemoryRegion::new(per_pe_size, team.lamellae.clone(), AllocationType::Global); + // let rmr = MemoryRegion::new( + // per_pe_size * std::mem::size_of::(), + // team.lamellae.clone(), + // AllocationType::Global, + // ); + // println!("new array {:?}",rmr_t.as_ptr()); + unsafe { - for elem in rmr.as_mut_slice().expect("data should exist on pe") { - *elem = 0; - } + // for elem in rmr_t.as_mut_slice().expect("data should exist on pe") { + // *elem = std::mem::zeroed(); + // } + if std::mem::needs_drop::() { + // If `T` needs to be dropped then we have to do this one item at a time, in + // case one of the intermediate drops does a panic. + // slice.iter_mut().for_each(write_zeroes); + panic!("need drop not yet supported"); + } else { + // Otherwise we can be really fast and just fill everthing with zeros. 
+ let len = std::mem::size_of_val::<[T]>(rmr_t.as_mut_slice().expect("data should exist on pe")); + unsafe { std::ptr::write_bytes(rmr_t.as_mut_ptr().expect("data should exist on pe") as *mut u8, 0u8, len) } + } } + let rmr = unsafe { rmr_t.to_base::() }; + // println!("new array u8 {:?}",rmr.as_ptr()); let data = Darc::try_new_with_drop( team.clone(), @@ -235,16 +251,30 @@ impl UnsafeArray { if remaining_elems > 0 { per_pe_size += 1 } - let rmr = MemoryRegion::new( - per_pe_size * std::mem::size_of::(), - team.lamellae.clone(), - AllocationType::Global, - ); + let rmr_t: MemoryRegion = + MemoryRegion::new(per_pe_size, team.lamellae.clone(), AllocationType::Global); + // let rmr = MemoryRegion::new( + // per_pe_size * std::mem::size_of::(), + // team.lamellae.clone(), + // AllocationType::Global, + // ); + unsafe { - for elem in rmr.as_mut_slice().expect("data should exist on pe") { - *elem = 0; - } + // for elem in rmr_t.as_mut_slice().expect("data should exist on pe") { + // *elem = std::mem::zeroed(); + // } + if std::mem::needs_drop::() { + // If `T` needs to be dropped then we have to do this one item at a time, in + // case one of the intermediate drops does a panic. + // slice.iter_mut().for_each(write_zeroes); + panic!("need drop not yet supported"); + } else { + // Otherwise we can be really fast and just fill everthing with zeros. + let len = std::mem::size_of_val::<[T]>(rmr_t.as_mut_slice().expect("data should exist on pe")); + unsafe { std::ptr::write_bytes(rmr_t.as_mut_ptr().expect("data should exist on pe") as *mut u8, 0u8, len) } + } } + let rmr = unsafe { rmr_t.to_base::() }; let data = Darc::try_new_with_drop( team.clone(), @@ -427,6 +457,7 @@ impl UnsafeArray { pub(crate) fn local_as_mut_ptr(&self) -> *mut T { let u8_ptr = unsafe { self.inner.local_as_mut_ptr() }; // self.inner.data.mem_region.as_casted_mut_ptr::().unwrap(); + // println!("ptr: {:?} {:?}", u8_ptr, u8_ptr as *const T); u8_ptr as *mut T } @@ -1811,6 +1842,7 @@ impl UnsafeArrayInner { self.data.mem_region.as_casted_mut_ptr::().expect( "memory doesnt exist on this pe (this should not happen for arrays currently)", ); + // println!("u8 ptr: {:?}", ptr); // let len = self.size; let my_pe = self.data.my_pe; let num_pes = self.data.num_pes; diff --git a/src/darc.rs b/src/darc.rs index 7c5c0183..b2614497 100644 --- a/src/darc.rs +++ b/src/darc.rs @@ -912,6 +912,15 @@ impl Darc { } } +fn calc_padding(addr: usize, align: usize) -> usize { + let rem = addr % align; + if rem == 0 { + 0 + } else { + align - rem + } +} + impl Darc { #[doc(alias = "Collective")] /// Constructs a new `Darc` on the PEs specified by team. 
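
Note: the `calc_padding` helper added just above replaces the fixed additive offsets with alignment-aware ones: each per-PE array in the Darc allocation is placed at the next offset that satisfies its element type's alignment, and the running size grows by the padding plus the array itself. A self-contained sketch of that layout bookkeeping; `Header`, the element types, and `append_array` are placeholders, not the real Darc fields:

// calc_padding as defined above, repeated here so the sketch compiles on its own.
fn calc_padding(offset: usize, align: usize) -> usize {
    let rem = offset % align;
    if rem == 0 {
        0
    } else {
        align - rem
    }
}

// Place one per-PE array of element type E at the next aligned offset and grow
// the running allocation size accordingly.
fn append_array<E>(size: &mut usize, num_pes: usize) -> usize {
    let padding = calc_padding(*size, std::mem::align_of::<E>());
    let offset = *size + padding;
    *size += padding + num_pes * std::mem::size_of::<E>();
    offset
}

fn main() {
    type Header = [u8; 20]; // stand-in for the DarcInner header, not its real size
    let num_pes = 4;
    let mut size = std::mem::size_of::<Header>();
    let ref_cnt_offset = append_array::<usize>(&mut size, num_pes);
    let mode_offset = append_array::<u8>(&mut size, num_pes);
    // On a typical 64-bit target the 20-byte header is padded to 24 so the
    // usize array starts aligned; the u8 array needs no extra padding.
    assert_eq!(ref_cnt_offset, 24);
    assert_eq!(mode_offset, 24 + num_pes * std::mem::size_of::<usize>());
    println!("total size with padding: {}", size);
}
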
@@ -967,12 +976,34 @@ impl Darc { AllocationType::Sub(team_rt.get_pes()) }; - let size = std::mem::size_of::>() - + team_rt.num_pes * std::mem::size_of::() - + team_rt.num_pes * std::mem::size_of::() - + team_rt.num_pes * std::mem::size_of::() - + team_rt.num_pes * std::mem::size_of::() - + team_rt.num_pes * std::mem::size_of::(); + //The DarcInner data structure + let mut size = std::mem::size_of::>(); + + // Ref Cnt Array + let padding = calc_padding(size, std::mem::align_of::()); + let ref_cnt_offset = size + padding; + size += padding + team_rt.num_pes * std::mem::size_of::(); + + // total ref cnt array + let padding = calc_padding(size, std::mem::align_of::()); + let total_ref_cnt_offset = size + padding; + size += padding + team_rt.num_pes * std::mem::size_of::(); + + // mode array + let padding = calc_padding(size, std::mem::align_of::()); + let mode_offset = size + padding; + size += padding + team_rt.num_pes * std::mem::size_of::(); + + //mode ref cnt array + let padding = calc_padding(size, std::mem::align_of::()); + let mode_ref_cnt_offset = size + padding; + size += padding + team_rt.num_pes * std::mem::size_of::(); + + //mode_barrier array + let padding = calc_padding(size, std::mem::align_of::()); + let mode_barrier_offset = size + padding; + size += padding + team_rt.num_pes * std::mem::size_of::(); + // println!("creating new darc"); team_rt.async_barrier().await; @@ -1008,25 +1039,30 @@ impl Darc { weak_local_cnt: AtomicUsize::new(0), dist_cnt: AtomicUsize::new(0), total_dist_cnt: AtomicUsize::new(0), - ref_cnt_addr: addr + std::mem::size_of::>(), - total_ref_cnt_addr: addr - + std::mem::size_of::>() - + team_rt.num_pes * std::mem::size_of::(), - mode_addr: addr - + std::mem::size_of::>() - + team_rt.num_pes * std::mem::size_of::() - + team_rt.num_pes * std::mem::size_of::(), - mode_ref_cnt_addr: addr - + std::mem::size_of::>() - + team_rt.num_pes * std::mem::size_of::() - + team_rt.num_pes * std::mem::size_of::() - + team_rt.num_pes * std::mem::size_of::(), - mode_barrier_addr: addr - + std::mem::size_of::>() - + team_rt.num_pes * std::mem::size_of::() - + team_rt.num_pes * std::mem::size_of::() - + team_rt.num_pes * std::mem::size_of::() - + team_rt.num_pes * std::mem::size_of::(), + // ref_cnt_addr: addr + std::mem::size_of::>(), + // total_ref_cnt_addr: addr + // + std::mem::size_of::>() + // + team_rt.num_pes * std::mem::size_of::(), + // mode_addr: addr + // + std::mem::size_of::>() + // + team_rt.num_pes * std::mem::size_of::() + // + team_rt.num_pes * std::mem::size_of::(), + // mode_ref_cnt_addr: addr + // + std::mem::size_of::>() + // + team_rt.num_pes * std::mem::size_of::() + // + team_rt.num_pes * std::mem::size_of::() + // + team_rt.num_pes * std::mem::size_of::(), + // mode_barrier_addr: addr + // + std::mem::size_of::>() + // + team_rt.num_pes * std::mem::size_of::() + // + team_rt.num_pes * std::mem::size_of::() + // + team_rt.num_pes * std::mem::size_of::() + // + team_rt.num_pes * std::mem::size_of::(), + ref_cnt_addr: addr + ref_cnt_offset, + total_ref_cnt_addr: addr + total_ref_cnt_offset, + mode_addr: addr + mode_offset, + mode_ref_cnt_addr: addr + mode_ref_cnt_offset, + mode_barrier_addr: addr + mode_barrier_offset, barrier: barrier_ptr, // mode_barrier_rounds: num_rounds, am_counters: am_counters_ptr, @@ -1082,12 +1118,33 @@ impl Darc { AllocationType::Sub(team_rt.get_pes()) }; - let size = std::mem::size_of::>() - + team_rt.num_pes * std::mem::size_of::() - + team_rt.num_pes * std::mem::size_of::() - + team_rt.num_pes * std::mem::size_of::() - + 
team_rt.num_pes * std::mem::size_of::() - + team_rt.num_pes * std::mem::size_of::(); + //The DarcInner data structure + let mut size = std::mem::size_of::>(); + + // Ref Cnt Array + let padding = calc_padding(size, std::mem::align_of::()); + let ref_cnt_offset = size + padding; + size += padding + team_rt.num_pes * std::mem::size_of::(); + + // total ref cnt array + let padding = calc_padding(size, std::mem::align_of::()); + let total_ref_cnt_offset = size + padding; + size += padding + team_rt.num_pes * std::mem::size_of::(); + + // mode array + let padding = calc_padding(size, std::mem::align_of::()); + let mode_offset = size + padding; + size += padding + team_rt.num_pes * std::mem::size_of::(); + + //mode ref cnt array + let padding = calc_padding(size, std::mem::align_of::()); + let mode_ref_cnt_offset = size + padding; + size += padding + team_rt.num_pes * std::mem::size_of::(); + + //mode_barrier array + let padding = calc_padding(size, std::mem::align_of::()); + let mode_barrier_offset = size + padding; + size += padding + team_rt.num_pes * std::mem::size_of::(); // println!("creating new darc"); team_rt.tasking_barrier(); @@ -1123,25 +1180,30 @@ impl Darc { weak_local_cnt: AtomicUsize::new(0), dist_cnt: AtomicUsize::new(0), total_dist_cnt: AtomicUsize::new(0), - ref_cnt_addr: addr + std::mem::size_of::>(), - total_ref_cnt_addr: addr - + std::mem::size_of::>() - + team_rt.num_pes * std::mem::size_of::(), - mode_addr: addr - + std::mem::size_of::>() - + team_rt.num_pes * std::mem::size_of::() - + team_rt.num_pes * std::mem::size_of::(), - mode_ref_cnt_addr: addr - + std::mem::size_of::>() - + team_rt.num_pes * std::mem::size_of::() - + team_rt.num_pes * std::mem::size_of::() - + team_rt.num_pes * std::mem::size_of::(), - mode_barrier_addr: addr - + std::mem::size_of::>() - + team_rt.num_pes * std::mem::size_of::() - + team_rt.num_pes * std::mem::size_of::() - + team_rt.num_pes * std::mem::size_of::() - + team_rt.num_pes * std::mem::size_of::(), + // ref_cnt_addr: addr + std::mem::size_of::>(), + // total_ref_cnt_addr: addr + // + std::mem::size_of::>() + // + team_rt.num_pes * std::mem::size_of::(), + // mode_addr: addr + // + std::mem::size_of::>() + // + team_rt.num_pes * std::mem::size_of::() + // + team_rt.num_pes * std::mem::size_of::(), + // mode_ref_cnt_addr: addr + // + std::mem::size_of::>() + // + team_rt.num_pes * std::mem::size_of::() + // + team_rt.num_pes * std::mem::size_of::() + // + team_rt.num_pes * std::mem::size_of::(), + // mode_barrier_addr: addr + // + std::mem::size_of::>() + // + team_rt.num_pes * std::mem::size_of::() + // + team_rt.num_pes * std::mem::size_of::() + // + team_rt.num_pes * std::mem::size_of::() + // + team_rt.num_pes * std::mem::size_of::(), + ref_cnt_addr: addr + ref_cnt_offset, + total_ref_cnt_addr: addr + total_ref_cnt_offset, + mode_addr: addr + mode_offset, + mode_ref_cnt_addr: addr + mode_ref_cnt_offset, + mode_barrier_addr: addr + mode_barrier_offset, barrier: barrier_ptr, // mode_barrier_rounds: num_rounds, am_counters: am_counters_ptr, diff --git a/src/lamellae.rs b/src/lamellae.rs index 69ce6241..2c7e249c 100755 --- a/src/lamellae.rs +++ b/src/lamellae.rs @@ -228,14 +228,16 @@ pub(crate) fn create_lamellae(backend: Backend) -> LamellaeBuilder { #[cfg(feature = "enable-rofi")] Backend::Rofi => { let provider = match std::env::var("LAMELLAR_ROFI_PROVIDER") { - Ok(p) => match p.as_str() { - "verbs" => "verbs", - "shm" => "shm", - _ => "verbs", - }, - Err(_) => "verbs", + Ok(p) => p, + Err(_) => "verbs".to_owned(), + // Ok(p) => match 
p.as_str() { + // "verbs" => "verbs", + // "tcp" => "tcp", + // _ => "verbs", + // }, + // Err(_) => "verbs", }; - LamellaeBuilder::RofiBuilder(RofiBuilder::new(provider)) + LamellaeBuilder::RofiBuilder(RofiBuilder::new(&provider)) } Backend::Shmem => LamellaeBuilder::ShmemBuilder(ShmemBuilder::new()), Backend::Local => LamellaeBuilder::Local(Local::new()), diff --git a/src/lamellae/command_queues.rs b/src/lamellae/command_queues.rs index f86640d9..ac32e1f6 100644 --- a/src/lamellae/command_queues.rs +++ b/src/lamellae/command_queues.rs @@ -16,7 +16,7 @@ use std::sync::Arc; //use tracing::*; // const CMD_BUF_LEN: usize = 50000; // this is the number of slots for each PE - // const NUM_REQ_SLOTS: usize = CMD_Q_LEN; // max requests at any given time -- probably have this be a multiple of num PES +// const NUM_REQ_SLOTS: usize = CMD_Q_LEN; // max requests at any given time -- probably have this be a multiple of num PES const CMD_BUFS_PER_PE: usize = 2; // lazy_static! { @@ -62,24 +62,35 @@ enum Cmd { //#[tracing::instrument(skip_all)] fn calc_hash(addr: usize, len: usize) -> usize { //we split into a u64 slice and a u8 slice as u64 seems to compute faster. - let num_u64s = len / std::mem::size_of::(); - let u64_slice = unsafe { std::slice::from_raw_parts(addr as *const u64, num_u64s) }; - let num_u8s = len % std::mem::size_of::(); + let num_usizes = len / std::mem::size_of::(); + //let u64_slice = unsafe { std::slice::from_raw_parts(addr as *const u64, num_u64s) }; + let num_u8s = len % std::mem::size_of::(); let u8_slice = unsafe { std::slice::from_raw_parts( - (addr + num_u64s * std::mem::size_of::()) as *const u8, + (addr + num_usizes * std::mem::size_of::()) as *const u8, num_u8s, ) }; - (u64_slice - .iter() - .map(|x| Wrapping(*x as usize)) + ((0..num_usizes) + .map(|x| unsafe { Wrapping((addr as *const usize).offset(x as isize).read_unaligned()) }) .sum::>() + u8_slice .iter() .map(|x| Wrapping(*x as usize)) .sum::>()) .0 + + // let u8_slice = unsafe { + // std::slice::from_raw_parts( + // (addr) as *const u8, + // num_u8s, + // ) + // }; + // u8_slice + // .iter() + // .map(|x| Wrapping(*x as usize)) + // .sum::>() + // .0 } impl CmdMsg { @@ -1065,10 +1076,13 @@ impl InnerCQ { } let ser_data = ser_data.unwrap(); // if print{ - // ser_data.print(); + // ser_data.print(); // } self.get_serialized_data(src, cmd, &ser_data).await; - // println!("received data {:?}",ser_data.header_and_data_as_bytes()[0..10]); + // println!( + // "received data {:?}", + // &ser_data.header_and_data_as_bytes()[0..10] + // ); self.recv_cnt.fetch_add(1, Ordering::SeqCst); // println!("received: {:?} {:?} cmd: {:?} {:?}",cmd.dsize,ser_data.len(),cmd,&ser_data.header_and_data_as_bytes()[0..20]); // SerializedData::RofiData(ser_data) @@ -1079,7 +1093,7 @@ impl InnerCQ { async fn get_cmd_buf(&self, src: usize, cmd: CmdMsg) -> usize { let mut data = self .comm - .rt_alloc(cmd.dsize as usize, std::mem::align_of::()); + .rt_alloc(cmd.dsize as usize, std::mem::align_of::()); let mut timer = std::time::Instant::now(); while data.is_err() && self.active.load(Ordering::SeqCst) != CmdQStatus::Panic as u8 { async_std::task::yield_now().await; @@ -1087,7 +1101,7 @@ impl InnerCQ { self.send_alloc(cmd.dsize); data = self .comm - .rt_alloc(cmd.dsize as usize, std::mem::align_of::()); + .rt_alloc(cmd.dsize as usize, std::mem::align_of::()); // println!("cq 874 data {:?}",data.is_ok()); if timer.elapsed().as_secs_f64() > config().deadlock_timeout { println!("get cmd buf stuck waiting for alloc"); @@ -1328,7 +1342,8 @@ impl 
CommandQueue { // &data.header_and_data_as_bytes()[0..20] // ); data.increment_cnt(); //or we could implement something like an into_raw here... - // println!("sending data {:?}",data.header_and_data_as_bytes()); + // println!("sending data {:?}", data.header_and_data_as_bytes()); + self.cq.send(data.relative_addr, data.len, dst, hash).await; } SerializedData::ShmemData(ref data) => { @@ -1516,7 +1531,7 @@ impl CommandQueue { //#[tracing::instrument(skip_all)] pub fn mem_per_pe() -> usize { - (config().cmd_buf_len * config().cmd_buf_cnt + 4) * std::mem::size_of::() + (config().cmd_buf_len * config().cmd_buf_cnt + 4) * std::mem::size_of::() } } diff --git a/src/lamellae/rofi/rofi_comm.rs b/src/lamellae/rofi/rofi_comm.rs index ab2fc34c..ff847f6c 100644 --- a/src/lamellae/rofi/rofi_comm.rs +++ b/src/lamellae/rofi/rofi_comm.rs @@ -94,13 +94,10 @@ impl RofiComm { //#[tracing::instrument(skip_all)] unsafe fn fill_buffer(&self, dst_addr: &mut [T], val: R) { - // println!("{:?} {:?} {:?} {:?} {:?}",std::mem::size_of::(),std::mem::size_of::(),(dst_addr.len()*std::mem::size_of::()),(dst_addr.len()*std::mem::size_of::())/std::mem::size_of::(),(dst_addr.len()*std::mem::size_of::())%std::mem::size_of::()); - let bytes = std::slice::from_raw_parts_mut( - dst_addr.as_ptr() as *mut T as *mut R, - (dst_addr.len() * std::mem::size_of::()) / std::mem::size_of::(), - ); - for elem in bytes.iter_mut() { - *elem = val; + let num_r = (dst_addr.len() * std::mem::size_of::()) / std::mem::size_of::(); + let r_ptr = dst_addr.as_ptr() as *mut T as *mut R; + for i in 0..num_r { + r_ptr.offset(i as isize).write_unaligned(val); } } //#[tracing::instrument(skip_all)] @@ -120,24 +117,26 @@ impl RofiComm { } } //#[tracing::instrument(skip_all)] - unsafe fn check_buffer_elems( + unsafe fn check_buffer_elems( &self, dst_addr: &mut [T], val: R, ) -> TxResult<()> { - let bytes = std::slice::from_raw_parts_mut( - dst_addr.as_ptr() as *mut T as *mut R, - (dst_addr.len() * std::mem::size_of::()) / std::mem::size_of::(), - ); + let num_r = (dst_addr.len() * std::mem::size_of::()) / std::mem::size_of::(); + let r_ptr = dst_addr.as_ptr() as *mut T as *mut R; + let mut timer = std::time::Instant::now(); - for i in 0..(bytes.len() as isize - 2) { - while bytes[i as usize] == val && bytes[i as usize + 1] == val { + for i in 0..num_r - 2 { + while r_ptr.offset(i as isize).read_unaligned() == val + && r_ptr.offset(i as isize + 1).read_unaligned() == val + { if timer.elapsed().as_secs_f64() > 1.0 { // println!( - // "{:?}: {:?} {:?} {:?}", + // "{:?}/{:?}: {:?} {:?} {:?}", // i, - // bytes[i as usize], - // bytes[i as usize + 1], + // num_r, + // r_ptr.offset(i as isize).read_unaligned(), + // r_ptr.offset(i as isize + 1).read_unaligned(), // val // ); return Err(TxError::GetError); @@ -145,9 +144,10 @@ impl RofiComm { //hopefully magic number doesnt appear twice in a row std::thread::yield_now(); } + timer = std::time::Instant::now(); } timer = std::time::Instant::now(); - while bytes[bytes.len() - 1] == val { + while r_ptr.offset(num_r as isize - 1).read_unaligned() == val { if timer.elapsed().as_secs_f64() > 1.0 { // println!("{:?}", bytes[bytes.len() - 1]); return Err(TxError::GetError); @@ -263,6 +263,7 @@ impl CommOps for RofiComm { } //#[tracing::instrument(skip_all)] fn rt_alloc(&self, size: usize, align: usize) -> AllocResult { + // println!("rt_alloc size {size} align {align}"); // let size = size + size%8; let allocs = self.alloc.read(); for alloc in allocs.iter() { @@ -360,12 +361,16 @@ impl CommOps for RofiComm { // } } else 
{ unsafe { - // println!("[{:?}]-({:?}) memcopy {:?} into {:x}",pe,src_addr.as_ptr(),src_addr.len(),dst_addr); - std::ptr::copy_nonoverlapping( - src_addr.as_ptr(), - dst_addr as *mut T, - src_addr.len(), - ); + // println!( + // "[{:?}]-({:?}) memcopy {:?} into {:x} src align {:?} dst align {:?}", + // pe, + // src_addr.as_ptr(), + // src_addr.len(), + // dst_addr, + // src_addr.as_ptr().align_offset(std::mem::align_of::()), + // (dst_addr as *mut T).align_offset(std::mem::align_of::()), + // ); + std::ptr::copy(src_addr.as_ptr(), dst_addr as *mut T, src_addr.len()); } } // req @@ -394,11 +399,7 @@ impl CommOps for RofiComm { } else { unsafe { // println!("[{:?}]-({:?}) memcopy {:?}",pe,src_addr.as_ptr()); - std::ptr::copy_nonoverlapping( - src_addr.as_ptr(), - dst_addr as *mut T, - src_addr.len(), - ); + std::ptr::copy(src_addr.as_ptr(), dst_addr as *mut T, src_addr.len()); } } // req @@ -431,7 +432,7 @@ impl CommOps for RofiComm { } // drop(lock); unsafe { - std::ptr::copy_nonoverlapping(src_addr.as_ptr(), dst_addr as *mut T, src_addr.len()); + std::ptr::copy(src_addr.as_ptr(), dst_addr as *mut T, src_addr.len()); } self.put_amt.fetch_add( src_addr.len() * (self.num_pes - 1) * std::mem::size_of::(), @@ -487,11 +488,7 @@ impl CommOps for RofiComm { } else { // println!("[{:?}]-{:?} {:?} {:?}",self.my_pe,src_addr as *const T,dst_addr.as_mut_ptr(),dst_addr.len()); unsafe { - std::ptr::copy_nonoverlapping( - src_addr as *const T, - dst_addr.as_mut_ptr(), - dst_addr.len(), - ); + std::ptr::copy(src_addr as *const T, dst_addr.as_mut_ptr(), dst_addr.len()); } } // req @@ -567,11 +564,7 @@ impl CommOps for RofiComm { } } else { unsafe { - std::ptr::copy_nonoverlapping( - src_addr as *const T, - dst_addr.as_mut_ptr(), - dst_addr.len(), - ); + std::ptr::copy(src_addr as *const T, dst_addr.as_mut_ptr(), dst_addr.len()); } } } @@ -617,11 +610,7 @@ impl CommOps for RofiComm { // }; } else { unsafe { - std::ptr::copy_nonoverlapping( - src_addr as *const T, - dst_addr.as_mut_ptr(), - dst_addr.len(), - ); + std::ptr::copy(src_addr as *const T, dst_addr.as_mut_ptr(), dst_addr.len()); } } // req diff --git a/src/lamellar_alloc.rs b/src/lamellar_alloc.rs index 1a6471ab..8069382a 100644 --- a/src/lamellar_alloc.rs +++ b/src/lamellar_alloc.rs @@ -362,8 +362,15 @@ impl LamellarAlloc for BTreeAlloc { // a + padding, // self.free_space.load(Ordering::SeqCst) // ); - - Some(a + padding) + let new_addr = a + padding; + // let rem = new_addr % align; + // let rem_16 = new_addr % 16; + // println!( + // "alloc addr {:x?} {:x?} {new_addr} {a} {padding} {rem} {align} {rem_16}", + // a + padding, + // new_addr, + // ); + Some(new_addr) } else { None }; @@ -390,7 +397,7 @@ impl LamellarAlloc for BTreeAlloc { fn free(&self, addr: usize) -> Result<(), usize> { let &(ref lock, ref _cvar) = &*self.allocated_addrs; let mut allocated_addrs = lock.lock(); - + // println!("trying to free: {:x?} {:?}", addr, addr); if let Some((size, padding)) = allocated_addrs.remove(&addr) { // println!("allocated_addrs: {:?}", allocated_addrs); let full_size = size + padding; diff --git a/src/memregion.rs b/src/memregion.rs index 1d44a1ab..dfdbfa6e 100644 --- a/src/memregion.rs +++ b/src/memregion.rs @@ -730,8 +730,9 @@ impl MemoryRegion { alloc: AllocationType, ) -> Result, anyhow::Error> { // println!( - // "creating new lamellar memory region {:?}", - // size * std::mem::size_of::() + // "creating new lamellar memory region size: {:?} align: {:?}", + // size * std::mem::size_of::(), + // std::mem::align_of::() // ); let mut mode = 
Mode::Shared; let addr = if size > 0 { @@ -791,23 +792,25 @@ impl MemoryRegion { #[allow(dead_code)] //#[tracing::instrument(skip_all)] - pub(crate) unsafe fn to_base(self) -> MemoryRegion { + pub(crate) unsafe fn to_base(mut self) -> MemoryRegion { //this is allowed as we consume the old object.. assert_eq!( self.num_bytes % std::mem::size_of::(), 0, "Error converting memregion to new base, does not align" ); - MemoryRegion { - addr: self.addr, //TODO: out of memory... - pe: self.pe, - size: self.num_bytes / std::mem::size_of::(), - num_bytes: self.num_bytes, - backend: self.backend, - rdma: self.rdma.clone(), - mode: self.mode, - phantom: PhantomData, - } + // MemoryRegion { + // addr: self.addr, //TODO: out of memory... + // pe: self.pe, + // size: self.num_bytes / std::mem::size_of::(), + // num_bytes: self.num_bytes, + // backend: self.backend, + // rdma: self.rdma.clone(), + // mode: self.mode, + // phantom: PhantomData, + // } + self.size = self.num_bytes / std::mem::size_of::(); + std::mem::transmute(self) //we do this because other wise self gets dropped and frees the underlying data (we could also set addr to 0 in self) } // } diff --git a/src/memregion/one_sided.rs b/src/memregion/one_sided.rs index e248e83d..dcdedff2 100644 --- a/src/memregion/one_sided.rs +++ b/src/memregion/one_sided.rs @@ -355,11 +355,8 @@ impl OneSidedMemoryRegion { team: &std::pin::Pin>, lamellae: Arc, ) -> Result, anyhow::Error> { - let mr = MemoryRegion::try_new( - size * std::mem::size_of::(), - lamellae, - AllocationType::Local, - )?; + let mr_t: MemoryRegion = MemoryRegion::try_new(size, lamellae, AllocationType::Local)?; + let mr = unsafe { mr_t.to_base::() }; let pe = mr.pe; let id = ID_COUNTER.fetch_add(1, Ordering::Relaxed); diff --git a/src/memregion/shared.rs b/src/memregion/shared.rs index 3390588b..39d975a7 100644 --- a/src/memregion/shared.rs +++ b/src/memregion/shared.rs @@ -99,17 +99,11 @@ impl SharedMemoryRegion { alloc: AllocationType, ) -> Result, anyhow::Error> { // println!("creating new shared mem region {:?} {:?}",size,alloc); + let mr_t: MemoryRegion = MemoryRegion::try_new(size, team.lamellae.clone(), alloc)?; + let mr = unsafe { mr_t.to_base::() }; Ok(SharedMemoryRegion { - mr: Darc::try_new( - team.clone(), - MemoryRegion::try_new( - size * std::mem::size_of::(), - team.lamellae.clone(), - alloc, - )?, - crate::darc::DarcMode::Darc, - ) - .expect("memregions can only be created on a member of the team"), + mr: Darc::try_new(team.clone(), mr, crate::darc::DarcMode::Darc) + .expect("memregions can only be created on a member of the team"), sub_region_offset: 0, sub_region_size: size, phantom: PhantomData, diff --git a/src/scheduler.rs b/src/scheduler.rs index 82d95f24..7774654b 100755 --- a/src/scheduler.rs +++ b/src/scheduler.rs @@ -168,6 +168,7 @@ impl Scheduler { let max_ams = self.max_ams.clone(); let am_stall_mark = self.am_stall_mark.fetch_add(1, Ordering::Relaxed); let ame = self.active_message_engine.clone(); + // println!("am ptr {:p} ", &am); let am_future = async move { // let start_tid = thread::current().id(); num_ams.fetch_add(1, Ordering::Relaxed); From 7375a84961d835bba3feb4c770c40a5044fdbb6c Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Mon, 15 Jul 2024 16:45:43 -0700 Subject: [PATCH 043/116] documentation + cleanup warnings --- Cargo.toml | 11 +- examples/bandwidths/get_bw.rs | 1 - examples/hello_world/hello_world_array.rs | 4 +- examples/kernels/am_gemm.rs | 2 +- examples/misc/ping_pong.rs | 6 +- impl/src/gen_am_group.rs | 6 +- impl/src/lib.rs | 22 +- run_examples.sh | 94 +------- src/active_messaging.rs | 75 +++++- src/active_messaging/handle.rs | 4 +- src/active_messaging/prelude.rs | 4 +- src/array.rs | 138 +++++------ src/array/atomic.rs | 177 +++++++++++++- src/array/atomic/rdma.rs | 68 +++--- src/array/generic_atomic.rs | 43 ++-- src/array/generic_atomic/iteration.rs | 4 +- src/array/global_lock_atomic.rs | 213 ++++++++++++++++- src/array/global_lock_atomic/iteration.rs | 4 +- src/array/handle.rs | 2 + src/array/iterator/distributed_iterator.rs | 14 +- .../distributed_iterator/consumer/collect.rs | 6 +- .../distributed_iterator/consumer/count.rs | 6 +- .../distributed_iterator/consumer/for_each.rs | 8 +- .../distributed_iterator/consumer/reduce.rs | 6 +- .../distributed_iterator/consumer/sum.rs | 6 +- src/array/iterator/local_iterator.rs | 13 +- .../local_iterator/consumer/collect.rs | 4 +- .../iterator/local_iterator/consumer/count.rs | 6 +- .../local_iterator/consumer/for_each.rs | 8 +- .../local_iterator/consumer/reduce.rs | 6 +- .../iterator/local_iterator/consumer/sum.rs | 4 +- src/array/iterator/mod.rs | 24 +- src/array/iterator/one_sided_iterator.rs | 48 +++- src/array/local_lock_atomic.rs | 224 +++++++++++++++++- src/array/local_lock_atomic/iteration.rs | 4 +- src/array/local_lock_atomic/local_chunks.rs | 78 ++++++ src/array/native_atomic.rs | 78 +++--- src/array/native_atomic/iteration.rs | 4 +- src/array/operations.rs | 11 +- src/array/operations/bitwise.rs | 2 +- src/array/operations/handle.rs | 8 +- src/array/operations/shift.rs | 2 +- src/array/prelude.rs | 6 +- src/array/read_only.rs | 132 ++++++++++- src/array/read_only/local_chunks.rs | 19 ++ src/array/unsafe.rs | 50 ++-- src/array/unsafe/local_chunks.rs | 73 ++++-- src/array/unsafe/operations.rs | 5 +- src/darc.rs | 61 ++++- src/darc/global_rw_darc.rs | 8 +- src/darc/local_rw_darc.rs | 18 +- src/darc/prelude.rs | 6 +- src/env_var.rs | 33 ++- src/lamellae.rs | 25 +- src/lamellae/comm.rs | 6 +- src/lamellae/command_queues.rs | 24 +- src/lamellae/local_lamellae.rs | 35 ++- src/lamellae/rofi/rofi_comm.rs | 94 ++++---- src/lamellae/rofi_lamellae.rs | 54 ++--- src/lamellae/shmem/shmem_comm.rs | 8 +- src/lamellae/shmem_lamellae.rs | 52 ++-- src/lamellar_alloc.rs | 1 + src/lamellar_arch.rs | 24 +- src/lamellar_request.rs | 10 +- src/lamellar_task_group.rs | 14 +- src/lamellar_team.rs | 52 ++++ src/lamellar_world.rs | 20 +- src/lib.rs | 54 +++-- src/memregion.rs | 19 +- src/memregion/one_sided.rs | 2 +- src/memregion/prelude.rs | 2 +- src/scheduler.rs | 5 +- src/scheduler/async_std_executor.rs | 6 +- src/scheduler/tokio_executor.rs | 6 +- src/scheduler/work_stealing.rs | 6 +- src/scheduler/work_stealing2.rs | 136 +++++------ src/scheduler/work_stealing3.rs | 10 +- tests/array/arithmetic_ops/fetch_add_test.rs | 2 +- 78 files changed, 1773 insertions(+), 753 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 51696abf..5e715c1e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,7 +14,8 @@ categories = ["asynchronous","concurrency", "network-programming","science"] [dependencies] lamellar-impl = { version = "0.6.0", path = "impl" } #rofisys = { version ="0.3", optional = true } -rofisys = {git = "https://github.com/pnnl/rofi-sys.git", branch = 
"master", optional = true} +#rofisys = {git = "https://github.com/pnnl/rofi-sys.git", branch = "master", optional = true} +rofisys = { path = "../rofi-sys-junction", optional = true} inventory = "0.3" serde = { version = "1.0.147", features = ["derive"] } serde_bytes = "0.11.7" @@ -64,7 +65,9 @@ tracing-subscriber = "0.3" [workspace] members = ["impl"] - +# Set the settings for build scripts and proc-macros. +[profile.dev.build-override] +opt-level = 3 #features are strictly additive.... can't have mutual exclusitivity [features] @@ -440,6 +443,10 @@ path="examples/misc/lamellar_env.rs" name="ping_pong" path="examples/misc/ping_pong.rs" +[[example]] +name="dist_hashmap" +path="examples/misc/dist_hashmap.rs" + ##------------ Darc examples ------------------## [[example]] diff --git a/examples/bandwidths/get_bw.rs b/examples/bandwidths/get_bw.rs index c14f2144..5b2c5185 100644 --- a/examples/bandwidths/get_bw.rs +++ b/examples/bandwidths/get_bw.rs @@ -82,7 +82,6 @@ fn main() { } let cur_t = timer.elapsed().as_secs_f64(); world.barrier(); - s = Instant::now(); // let cur_t = timer.elapsed().as_secs_f64(); let cur: f64 = world.MB_sent(); let mbs_c = world.MB_sent(); diff --git a/examples/hello_world/hello_world_array.rs b/examples/hello_world/hello_world_array.rs index 0241029b..98428761 100644 --- a/examples/hello_world/hello_world_array.rs +++ b/examples/hello_world/hello_world_array.rs @@ -4,8 +4,6 @@ /// -------------------------------------------------------------------- use lamellar::array::prelude::*; -use lamellar::RemoteMemoryRegion; - fn main() { let timer = std::time::Instant::now(); let world = lamellar::LamellarWorldBuilder::new().build(); @@ -27,7 +25,7 @@ fn main() { println!("array_time: {:?}", array_time); let timer = std::time::Instant::now(); - let one_sided = world.alloc_one_sided_mem_region::(local_length); + // let _one_sided = world.alloc_one_sided_mem_region::(local_length); let one_sided_time = timer.elapsed(); println!("one_sided_time: {:?}", one_sided_time); diff --git a/examples/kernels/am_gemm.rs b/examples/kernels/am_gemm.rs index 84eececb..f9150a35 100644 --- a/examples/kernels/am_gemm.rs +++ b/examples/kernels/am_gemm.rs @@ -10,7 +10,7 @@ /// matrices use row-wise distribution (i.e. 
all elements of a row are local to a pe, /// conversely this means elements of a column are distributed across pes) ///---------------------------------------------------------------------------------- -use futures_util::{Future,future}; +use futures_util::future; use lamellar::active_messaging::prelude::*; use lamellar::memregion::prelude::*; use lazy_static::lazy_static; diff --git a/examples/misc/ping_pong.rs b/examples/misc/ping_pong.rs index 6f072490..34b70892 100644 --- a/examples/misc/ping_pong.rs +++ b/examples/misc/ping_pong.rs @@ -37,7 +37,7 @@ struct RecvAm { impl LamellarAm for RecvAm { async fn exec(self) { unsafe { - let mut cnt = 0; + let cnt = 0; let start = self.remote_pe * self.buffer_size; let end = start + self.buffer_size; @@ -371,7 +371,7 @@ fn main() { let mut res_am_buffers = Vec::new(); let mut send_am_buffers = Vec::new(); - for i in 0..num_pes { + for _i in 0..num_pes { let mut pe_buffer = VecDeque::new(); let idx_buffers = IdxAmBuffer { idx_send_buffer: index_send_buffers.clone(), @@ -403,7 +403,7 @@ fn main() { } let mut reqs = vec![]; // if my_pe == 0 { - for thread in 0..1 { + for _thread in 0..1 { //world.num_threads_per_pe() { reqs.push(world.exec_am_local(MyAm { indices: indices.clone(), diff --git a/impl/src/gen_am_group.rs b/impl/src/gen_am_group.rs index c8f6f89f..1ac5472d 100644 --- a/impl/src/gen_am_group.rs +++ b/impl/src/gen_am_group.rs @@ -305,7 +305,7 @@ fn impl_am_group_user( }; // quote! { - // #[doc(hidden)] + // //#[doc(hidden)] // pub struct #am_group_name_user #impl_generics #where_clause{ // team: std::sync::Arc<#lamellar::LamellarTeam>, // batch_cnt: usize, @@ -317,7 +317,7 @@ fn impl_am_group_user( quote! { impl #am_user_impl_generics #am_group_name_user #am_user_ty_generics #am_user_where_clause{ pub fn new(team: std::sync::Arc<#lamellar::LamellarTeam>) -> Self { - let num_per_batch = #lamellar::config().batch_op_size; + let num_per_batch = #lamellar::config().am_group_batch_size; // match std::env::var("LAMELLAR_OP_BATCH") { // Ok(n) => n.parse::().unwrap(), // Err(_) => 10000, @@ -398,7 +398,7 @@ fn generate_am_group_user_struct( let (_impl_generics, ty_generics, _where_clause) = generics.split_for_impl(); quote! { - #[doc(hidden)] + //#[doc(hidden)] #vis struct #am_group_name_user #am_user_impl_generics #am_user_where_clause{ team: std::sync::Arc, batch_cnt: usize, diff --git a/impl/src/lib.rs b/impl/src/lib.rs index 8f6728d3..0c7b55f5 100644 --- a/impl/src/lib.rs +++ b/impl/src/lib.rs @@ -277,7 +277,7 @@ pub fn AmGroupData(args: TokenStream, input: TokenStream) -> TokenStream { derive_am_data(input, args, quote! {__lamellar}, false, true, false) } -#[doc(hidden)] +//#[doc(hidden)] #[allow(non_snake_case)] #[proc_macro_error] #[proc_macro_attribute] @@ -287,7 +287,7 @@ pub fn AmDataRT(args: TokenStream, input: TokenStream) -> TokenStream { derive_am_data(input, args, quote! 
{crate}, false, false, true) } -#[doc(hidden)] +//#[doc(hidden)] #[allow(non_snake_case)] #[proc_macro_error] #[proc_macro_attribute] @@ -477,7 +477,7 @@ pub fn am(args: TokenStream, input: TokenStream) -> TokenStream { parse_am(args, input, false, false, true) } -#[doc(hidden)] +//#[doc(hidden)] #[proc_macro_error] #[proc_macro_attribute] pub fn am_group(args: TokenStream, input: TokenStream) -> TokenStream { @@ -525,21 +525,21 @@ pub fn local_am(args: TokenStream, input: TokenStream) -> TokenStream { parse_am(args, input, true, false, false) } -#[doc(hidden)] +//#[doc(hidden)] #[proc_macro_error] #[proc_macro_attribute] pub fn rt_am(args: TokenStream, input: TokenStream) -> TokenStream { parse_am(args, input, false, true, false) } -#[doc(hidden)] +//#[doc(hidden)] #[proc_macro_error] #[proc_macro_attribute] pub fn rt_am_local(args: TokenStream, input: TokenStream) -> TokenStream { parse_am(args, input, true, true, false) } -#[doc(hidden)] +//#[doc(hidden)] #[proc_macro_error] #[proc_macro_derive(Dist)] pub fn derive_dist(input: TokenStream) -> TokenStream { @@ -571,7 +571,7 @@ pub fn register_reduction(item: TokenStream) -> TokenStream { // array_reduce::__generate_reductions_for_type(item) // } -#[doc(hidden)] +//#[doc(hidden)] #[proc_macro_error] #[proc_macro] pub fn generate_reductions_for_type_rt(item: TokenStream) -> TokenStream { @@ -599,14 +599,14 @@ pub fn generate_reductions_for_type_rt(item: TokenStream) -> TokenStream { // array_ops::__generate_ops_for_type(item) // } -#[doc(hidden)] +//#[doc(hidden)] #[proc_macro_error] #[proc_macro] pub fn generate_ops_for_type_rt(item: TokenStream) -> TokenStream { array_ops::__generate_ops_for_type_rt(item) } -#[doc(hidden)] +//#[doc(hidden)] #[proc_macro_error] #[proc_macro] pub fn generate_ops_for_bool_rt(_item: TokenStream) -> TokenStream { @@ -876,7 +876,7 @@ impl Parse for AmGroups { /// [2,2] on all PEs /// ``` /// ### Static Members -/// In the above code, the `ExampleAm` stuct contains a member that is a [crate::darc::Darc](Darc) (Distributed Arc). +/// In the above code, the `ExampleAm` stuct contains a member that is a `Darc` (Distributed Arc). /// In order to properly calculate distributed reference counts Darcs implements specialized Serialize and Deserialize operations. /// While, the cost to any single serialization/deserialization operation is small, doing this for every active message containing /// a Darc can become expensive. @@ -885,7 +885,7 @@ impl Parse for AmGroups { /// that every Active Message in the group is using a reference to the same Darc. In this case, we simply would only need /// to serialize the Darc once for each PE it gets sent to. /// -/// This can be accomplished by using the [AmData] attribute macro with the `static` keyword passed in as an argument as illustrated below: +/// This can be accomplished by using the [macro@AmData] attribute macro with the `static` keyword passed in as an argument as illustrated below: /// ``` /// use lamellar::active_messaging::prelude::*; /// use lamellar::darc::prelude::*; diff --git a/run_examples.sh b/run_examples.sh index bfdeded9..881b440f 100755 --- a/run_examples.sh +++ b/run_examples.sh @@ -1,126 +1,46 @@ #!/bin/bash -target_dir=/home/scratch/$USER -output_dir=/home/scratch/$USER +# target_dir=/home/scratch/$USER target_dir=$PWD/target +output_dir=/home/scratch/$USER + root=$PWD . 
$root/../junction-prep.rc - -## test using local lamellae -# mkdir -p local_lamellae -# cd local_lamellae -# for toolchain in stable ; do #nightly; do -# features="" -# if [ "${toolchain}" = "nightly" ]; then -# features="--features nightly" -# fi -# # cargo clean -# cargo +$toolchain build --release ${features} --examples -# mkdir -p ${toolchain} -# cd ${toolchain} -# for dir in `ls $root/examples`; do -# mkdir -p $dir -# cd $dir -# for test in `ls $root/examples/$dir`; do -# test=`basename $test .rs` -# LAMELLAR_THREADS=19 srun -N 1 --partition=all --time 0:5:00 $root/target/release/examples/$test > ${test}.out 2>&1 & -# done -# cd .. -# done -# cd .. -# wait -# done - - -### test using rofi shm lamellae -# mkdir -p shmem_lamellae -# cd shmem_lamellae -# for toolchain in stable; do -# features="" -# # if [ "${toolchain}" = "nightly" ]; then -# # features="--features nightly" -# # fi -# # cargo clean -# # cargo +$toolchain build --release --examples -# mkdir -p ${toolchain} -# cd ${toolchain} -# for mode in debug release; do -# mkdir -p $mode -# cd ${mode} -# for dir in `ls $root/examples`; do -# mkdir -p $dir -# cd $dir -# for test in `ls $root/examples/$dir`; do -# test=`basename $test .rs` -# LAMELLAR_MEM_SIZE=$((5 * 1024 * 1024 * 1024)) srun -n 1 -N 1 -A lamellar --partition=datavortex --time 0:5:00 --mpi=pmi2 $root/lamellar_run.sh -N=1 -T=23 $root/target/${mode}/examples/$test |& tee ${test}_n1.out & -# LAMELLAR_MEM_SIZE=$((5 * 1024 * 1024 * 1024)) srun -n 1 -N 1 -A lamellar --partition=datavortex --time 0:5:00 --mpi=pmi2 $root/lamellar_run.sh -N=2 -T=11 $root/target/${mode}/examples/$test |& tee ${test}_n2.out & -# LAMELLAR_MEM_SIZE=$((5 * 1024 * 1024 * 1024)) srun -n 1 -N 1 -A lamellar --partition=datavortex --time 0:5:00 --mpi=pmi2 $root/lamellar_run.sh -N=8 -T=2 $root/target/${mode}/examples/$test |& tee ${test}_n8.out & -# done -# cd .. -# done -# cd .. -# wait -# done -# cd .. 
-# done - -cargo build --release --features enable-rofi --features tokio-executor --examples -j 20 - local_results_dir=async_backends results_dir=${output_dir}/rofiverbs_lamellae/${local_results_dir} ### test using rofi verbs lamellae rm -r ${results_dir} rm -r rofiverbs_lamellae -# mkdir -p rofiverbs_lamellae mkdir -p ${results_dir} ln -s ${output_dir}/rofiverbs_lamellae rofiverbs_lamellae + +cargo build --release --features enable-rofi --features tokio-executor --examples -j 20 + + cd rofiverbs_lamellae/${local_results_dir} for toolchain in stable; do #nightly; do features="" if [ "${toolchain}" = "nightly" ]; then features="--features nightly" fi - # cargo clean - - # cargo +$toolchain build --release --features enable-rofi --examples mkdir -p ${toolchain} cd ${toolchain} for mode in release ; do - # cargo +$toolchain build --$mode --features enable-rofi --examples mkdir -p $mode cd ${mode} for dir in `ls $root/examples`; do - # for dir in kernels; do - # if [ $dir == "array_examples" ]; then mkdir -p $dir cd $dir - - # for test in `ls $root/examples/$dir`; do - # test=`basename $test .rs` - # echo "performing ${test}" - # LAMELLAE_BACKEND="rofi" LAMELLAR_ROFI_PROVIDER="verbs" LAMELLAR_THREADS=63 srun --cpus-per-task=64 --cpu-bind=ldoms,v -N 2 --time 0:5:00 --mpi=pmi2 $root/target/release/examples/$test > ${test}_n2.out 2>&1 & - # done sbatch --exclude=j004,j005,j036 --cpus-per-task=64 -N 2 --time 0:120:00 $root/batch_runner.sh $root $dir $mode 64 2 $target_dir if [ $dir != "bandwidths" ]; then sbatch --exclude=j004,j005,j036 --cpus-per-task=64 -N 8 --time 0:120:00 $root/batch_runner.sh $root $dir $mode 64 8 $target_dir sbatch --exclude=j004,j005,j036 --cpus-per-task=32 -N 16 -n 32 --time 0:240:00 $root/batch_runner.sh $root $dir $mode 32 32 $target_dir - - # for test in `ls $root/examples/$dir`; do - # test=`basename $test .rs` - # echo "performing ${test}" - # LAMELLAE_BACKEND="rofi" LAMELLAR_ROFI_PROVIDER="verbs" LAMELLAR_THREADS=63 srun --cpus-per-task=64 --cpu-bind=ldoms,v -N 8 --time 0:5:00 --mpi=pmi2 $root/target/release/examples/$test > ${test}_n8.out 2>&1 & - # done - # for test in `ls $root/examples/$dir`; do - # test=`basename $test .rs` - # echo "performing ${test}" - # LAMELLAE_BACKEND="rofi" LAMELLAR_ROFI_PROVIDER="verbs" LAMELLAR_THREADS=31 srun --cpus-per-task=32 --cpu-bind=ldoms,v -n 32 -N 16 --time 0:10:00 --mpi=pmi2 $root/target/release/examples/$test > ${test}_n32.out 2>&1 & - # done fi cd .. 
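The build above enables the new `tokio-executor` cargo feature alongside `enable-rofi`. A minimal sketch of the corresponding application-side pattern, under the assumption that choosing the executor backend stays a build/launch-time concern (the cargo feature above) rather than an API change; `block_on` and `async_barrier` are taken from the trait documentation later in this patch:

```rust
use lamellar::active_messaging::prelude::*;

fn main() {
    // Built with `--features tokio-executor` the runtime's tasks are driven by a
    // tokio runtime; without it the default work-stealing executor is used
    // (assumption about backend selection; the user-facing calls are unchanged).
    let world = lamellar::LamellarWorldBuilder::new().build();
    let world_clone = world.clone();
    world.block_on(async move {
        // any async work: active messages, array operations, barriers, ...
        world_clone.async_barrier().await;
    });
}
```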
sleep 2 diff --git a/src/active_messaging.rs b/src/active_messaging.rs index ca4dbc2f..8e44bbab 100644 --- a/src/active_messaging.rs +++ b/src/active_messaging.rs @@ -648,12 +648,13 @@ use std::pin::Pin; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; -#[doc(hidden)] +// //#[doc(hidden)] +/// The prelude for the active messaging module pub mod prelude; pub(crate) mod registered_active_message; use registered_active_message::RegisteredActiveMessages; -#[doc(hidden)] +// //#[doc(hidden)] pub use registered_active_message::RegisteredAm; pub(crate) mod batching; @@ -696,7 +697,7 @@ pub use lamellar_impl::AmData; /// pub use lamellar_impl::AmLocalData; -#[doc(hidden)] +// //#[doc(hidden)] pub use lamellar_impl::AmGroupData; /// This macro is used to associate an implemenation of [LamellarAM] for type that has used the [AmData] attribute macro @@ -900,8 +901,8 @@ pub(crate) enum Cmd { #[derive(serde::Serialize, serde::Deserialize, Debug, Clone, Copy, Default)] pub(crate) struct Msg { - pub src: u16, - pub cmd: Cmd, + pub(crate) src: u16, + pub(crate) cmd: Cmd, } #[derive(serde::Serialize, serde::Deserialize, Debug)] @@ -943,8 +944,13 @@ impl AMCounters { /// The interface for launching, executing, and managing Lamellar Active Messages . pub trait ActiveMessaging { + /// The handle type for single PE active messages type SinglePeAmHandle; + + /// The handle type for multi PE active messages type MultiAmHandle; + + /// The handle type for local active messages type LocalAmHandle; #[doc(alias("One-sided", "onesided"))] /// launch and execute an active message on every PE (including originating PE). @@ -1116,8 +1122,46 @@ pub trait ActiveMessaging { ///``` fn wait_all(&self); + #[doc(alias("One-sided", "onesided"))] + /// blocks calling task until all remote tasks (e.g. active mesages, array operations) + /// initiated by the calling PE have completed. + /// Intended to be used within an async context. + /// + /// # One-sided Operation + /// this is not a distributed synchronization primitive (i.e. it has no knowledge of a Remote PEs tasks), the calling thread will only wait for tasks + /// to finish that were initiated by the calling PE itself + /// + /// # Examples + ///``` + /// # use lamellar::active_messaging::prelude::*; + /// # + /// # #[lamellar::AmData(Debug,Clone)] + /// # struct Am{ + /// # // can contain anything that impls Sync, Send + /// # val: usize, + /// # } + /// + /// # #[lamellar::am] + /// # impl LamellarAM for Am{ + /// # async fn exec(self) -> usize { //can return nothing or any type that impls Serialize, Deserialize, Sync, Send + /// # //do some remote computation + /// # println!("hello from PE{}",self.val); + /// # lamellar::current_pe //return the executing pe + /// # } + /// # } + /// # + /// # let world = lamellar::LamellarWorldBuilder::new().build(); + /// let world_clone = world.clone(); + /// world.block_on(async move { + /// world_clone.exec_am_all(Am{val: world_clone.my_pe()}); + /// world_clone.await_all().await; //block until the previous am has finished + /// }); + ///``` + fn await_all(&self) -> impl Future + Send; + #[doc(alias = "Collective")] - /// Global synchronization method which blocks calling thread until all PEs in the barrier group (e.g. World, Team, Array) have entered + /// Global synchronization method which blocks the calling thread until all PEs in the barrier group (e.g. 
World, Team, Array) have entered + /// Generally this is intended to be called from the main thread, if a barrier is needed within an active message or async context please see [async_barrier](Self::async_barrier) /// /// # Collective Operation /// Requires all PEs associated with the ActiveMessaging object to enter the barrier, otherwise deadlock will occur @@ -1132,6 +1176,25 @@ pub trait ActiveMessaging { ///``` fn barrier(&self); + #[doc(alias = "Collective")] + /// EXPERIMENTAL: Global synchronization method which blocks the calling task until all PEs in the barrier group (e.g. World, Team, Array) have entered. + /// This function allows for calling barrier in an async context without blocking the worker thread. + /// Care should be taken when using this function to avoid deadlocks,as it is easy to mismatch barrier calls accross threads and PEs. + /// + /// # Collective Operation + /// Requires all PEs associated with the ActiveMessaging object to enter the barrier, otherwise deadlock will occur + /// + /// # Examples + ///``` + /// use lamellar::active_messaging::prelude::*; + /// + /// let world = lamellar::LamellarWorldBuilder::new().build(); + /// let world_clone = world.clone(); + /// world.block_on(async move { + /// //do some work + /// world_clone.async_barrier().await; //block until all PEs have entered the barrier + /// }); + ///``` fn async_barrier(&self) -> impl Future + Send; #[doc(alias("One-sided", "onesided"))] diff --git a/src/active_messaging/handle.rs b/src/active_messaging/handle.rs index 5468c652..55991064 100644 --- a/src/active_messaging/handle.rs +++ b/src/active_messaging/handle.rs @@ -64,6 +64,7 @@ impl LamellarRequestAddResult for AmHandleInner { } } +/// A handle to an active messaging request that executes on a singe PE #[derive(Debug)] #[pin_project(PinnedDrop)] pub struct AmHandle { @@ -171,6 +172,7 @@ impl Future for AmHandle { } } +/// A handle to an active messaging request that executes on the local (originating) PE #[derive(Debug)] #[pin_project(PinnedDrop)] pub struct LocalAmHandle { @@ -282,7 +284,7 @@ pub(crate) struct MultiAmHandleInner { pub(crate) user_handle: AtomicU8, //we can use this flag to optimize what happens when the request returns } -#[doc(hidden)] +/// A handle to an active messaging request that executes on multiple PEs, returned from a call to [exec_am_all][crate::ActiveMessaging::exec_am_all] #[derive(Debug)] #[pin_project(PinnedDrop)] pub struct MultiAmHandle { diff --git a/src/active_messaging/prelude.rs b/src/active_messaging/prelude.rs index daaf60d9..09fdffe1 100644 --- a/src/active_messaging/prelude.rs +++ b/src/active_messaging/prelude.rs @@ -1,4 +1,4 @@ -#[doc(hidden)] +//#[doc(hidden)] // pub use crate::active_messaging::{ // registered_active_message::RegisteredAm, DarcSerde, LamellarActiveMessage, LamellarResultSerde, // LamellarReturn, LamellarSerde, RemoteActiveMessage, Serde, @@ -14,7 +14,7 @@ pub use crate::async_trait; pub use crate::inventory; pub use crate::lamellar_arch::*; pub use crate::lamellar_team::LamellarTeam; -#[doc(hidden)] +//#[doc(hidden)] pub use crate::lamellar_team::{IntoLamellarTeam, LamellarTeamRT}; pub use crate::lamellar_world::LamellarWorld; pub use crate::lamellar_world::LamellarWorldBuilder; diff --git a/src/array.rs b/src/array.rs index 6406d68a..06c8ff30 100644 --- a/src/array.rs +++ b/src/array.rs @@ -43,7 +43,7 @@ //! 
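A minimal end-to-end sketch of the async-context additions above (`await_all`, `async_barrier`) together with awaiting an AM handle directly; `HelloAm` is a hypothetical active message modeled on the doc examples, and `exec_am_pe` is assumed to return an awaitable `AmHandle` as described above:

```rust
use lamellar::active_messaging::prelude::*;

#[lamellar::AmData(Debug, Clone)]
struct HelloAm {
    val: usize,
}

#[lamellar::am]
impl LamellarAM for HelloAm {
    async fn exec(self) -> usize {
        println!("hello from PE{}, launched by PE{}", lamellar::current_pe, self.val);
        lamellar::current_pe
    }
}

fn main() {
    let world = lamellar::LamellarWorldBuilder::new().build();
    let world_clone = world.clone();
    world.block_on(async move {
        // awaiting an AmHandle yields the remote result without blocking the thread
        let from_pe0 = world_clone.exec_am_pe(0, HelloAm { val: world_clone.my_pe() }).await;
        println!("PE0 returned {from_pe0}");

        // launch on all PEs, then quiesce and synchronize from the async context
        world_clone.exec_am_all(HelloAm { val: world_clone.my_pe() });
        world_clone.await_all().await;      // wait for everything launched by this PE
        world_clone.async_barrier().await;  // then synchronize with the other PEs
    });
}
```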
- `collect` and `collect_async` provide functionality analogous to the [collect](https://doc.rust-lang.org/std/iter/trait.Iterator.html#method.collect) method for Rust iterators //! - We also provided access directly to the underlying local data of an array using functions (and container types) that preserve the safety guarantees of a given array type //! -`local_data`, `read_local_data`, `write_local_data`, etc. convert to slices and other data types. -//! - Consequently, these functions can be used to create valid inputs for batched operations, see [OpInput](crate::array::OpInput) for details. +//! - Consequently, these functions can be used to create valid inputs for batched operations, see [OpInput] for details. //! ``` //! use lamellar::array::prelude::*; //! @@ -88,31 +88,32 @@ use std::sync::Arc; /// This macro automatically derives various LamellarArray "Op" traits for user defined types /// /// The following "Op" traits are automatically implemented: -/// - [AccessOps][crate::array::operations::AccessOps] -/// - [ReadOnlyOps][crate::array::operations::ReadOnlyOps] +/// - [AccessOps] +/// - [ReadOnlyOps] /// /// Additionally, it is possible to pass any of the following as a list to [ArrayOps] to derive the associated traits -/// - `Arithmetic` -- [ArithmeticOps][crate::array::operations::ArithmeticOps] +/// - `Arithmetic` -- [ArithmeticOps] /// - requires [AddAssign][std::ops::AddAssign], [SubAssign][std::ops::SubAssign], [MulAssign][std::ops::MulAssign], [DivAssign][std::ops::DivAssign], [RemAssign][std::ops::RemAssign] to be implemented on your data type -/// - `Bitwise` -- [BitWiseOps][crate::array::operations::BitWiseOps] +/// - `Bitwise` -- [BitWiseOps] /// - requires [BitAndAssign][std::ops::BitAndAssign], [BitOrAssign][std::ops::BitOrAssign], [BitXorAssign][std::ops::BitXorAssign] to be implemented on your data type -/// - `CompEx` -- [CompareExchangeOps][crate::array::operations::CompareExchangeOps] -/// - requires [PartialEq][std::cmp::PartialEq], [PartialOrd][std::cmp::PartialOrd] to be implemented on your data type -/// - `CompExEps` -- [CompareExchangeEpsilonOps][crate::array::operations::CompareExchangeEpsilonOps] -/// - requires [PartialEq][std::cmp::PartialEq], [PartialOrd][std::cmp::PartialOrd] to be implemented on your data type -/// - `Shift` -- [ShiftOps][crate::array::operations::ShiftOps] +/// - `CompEx` -- [CompareExchangeOps] +/// - requires [PartialEq], [PartialOrd] to be implemented on your data type +/// - `CompExEps` -- [CompareExchangeEpsilonOps] +/// - requires [PartialEq], [PartialOrd] to be implemented on your data type +/// - `Shift` -- [ShiftOps] /// - requires [ShlAssign][std::ops::ShlAssign], [ShrAssign][std::ops::ShrAssign] to be implemented on you data type /// /// Alternatively, if you plan to derive all the above traits you can simply supply `All` as the single argument to [ArrayOps] pub use lamellar_impl::ArrayOps; -use crate::memregion::RemoteMemoryRegion; +// //#[doc(hidden)] -#[doc(hidden)] +/// The prelude contains all the traits and macros that are required to use the array types pub mod prelude; pub(crate) mod r#unsafe; pub use r#unsafe::{ + local_chunks::{UnsafeLocalChunks, UnsafeLocalChunksMut}, operations::{ multi_val_multi_idx_ops, multi_val_single_idx_ops, single_val_multi_idx_ops, BatchReturnType, @@ -120,23 +121,10 @@ pub use r#unsafe::{ UnsafeArray, UnsafeByteArray, UnsafeByteArrayWeak, }; pub(crate) mod read_only; -pub use read_only::{ - ReadOnlyArray, - /*ReadOnlyArrayOpBuf, ReadOnlyArrayMultiMultiOps, 
ReadOnlyArrayMultiSingleOps,*/ - ReadOnlyByteArray, ReadOnlyByteArrayWeak, -}; - -// pub(crate) mod local_only; -// pub use local_only::LocalOnlyArray; +pub use read_only::{ReadOnlyArray, ReadOnlyByteArray, ReadOnlyByteArrayWeak, ReadOnlyLocalChunks}; pub(crate) mod atomic; -pub use atomic::{ - // operations::{AtomicArrayOp, AtomicArrayOpBuf}, - AtomicArray, - AtomicByteArray, //AtomicOps - AtomicByteArrayWeak, - AtomicLocalData, -}; +pub use atomic::{AtomicArray, AtomicByteArray, AtomicByteArrayWeak, AtomicLocalData}; pub(crate) mod generic_atomic; pub use generic_atomic::{ @@ -150,22 +138,23 @@ pub use native_atomic::{ pub(crate) mod local_lock_atomic; pub use local_lock_atomic::{ - LocalLockArray, LocalLockByteArray, LocalLockByteArrayWeak, LocalLockLocalData, - LocalLockMutLocalData, + LocalLockArray, LocalLockByteArray, LocalLockByteArrayWeak, LocalLockLocalChunks, + LocalLockLocalChunksMut, LocalLockLocalData, LocalLockMutLocalData, LocalLockReadGuard, + LocalLockWriteGuard, }; pub(crate) mod global_lock_atomic; pub use global_lock_atomic::{ GlobalLockArray, GlobalLockByteArray, GlobalLockByteArrayWeak, GlobalLockLocalData, - GlobalLockMutLocalData, + GlobalLockMutLocalData, GlobalLockReadGuard, GlobalLockWriteGuard, }; pub mod iterator; -// #[doc(hidden)] +// //#[doc(hidden)] pub use iterator::distributed_iterator::DistributedIterator; -// #[doc(hidden)] +// //#[doc(hidden)] pub use iterator::local_iterator::LocalIterator; -// #[doc(hidden)] +// //#[doc(hidden)] pub use iterator::one_sided_iterator::OneSidedIterator; pub(crate) mod operations; @@ -291,10 +280,10 @@ pub enum LamellarArrayRdmaOutput { impl LamellarWrite for LamellarArrayRdmaOutput {} -#[doc(hidden)] +/// Trait for types that can be used as output to various LamellarArray RDMA operations. pub trait LamellarWrite {} -#[doc(hidden)] +/// Trait for types that can be used as input to various LamellarArray RDMA operations. 
pub trait LamellarRead {} // impl LamellarRead for T {} @@ -305,7 +294,7 @@ impl LamellarRead for &Vec {} impl LamellarRead for &[T] {} impl TeamFrom<&T> for LamellarArrayRdmaInput { - /// Constructs a single element [OneSidedMemoryRegion][crate::memregion::OneSidedMemoryRegion] and copies `val` into it + /// Constructs a single element [OneSidedMemoryRegion] and copies `val` into it fn team_from(val: &T, team: &Pin>) -> Self { let buf: OneSidedMemoryRegion = team.alloc_one_sided_mem_region(1); unsafe { @@ -316,7 +305,7 @@ impl TeamFrom<&T> for LamellarArrayRdmaInput { } impl TeamFrom for LamellarArrayRdmaInput { - /// Constructs a single element [OneSidedMemoryRegion][crate::memregion::OneSidedMemoryRegion] and copies `val` into it + /// Constructs a single element [OneSidedMemoryRegion] and copies `val` into it fn team_from(val: T, team: &Pin>) -> Self { let buf: OneSidedMemoryRegion = team.alloc_one_sided_mem_region(1); unsafe { @@ -327,7 +316,7 @@ impl TeamFrom for LamellarArrayRdmaInput { } impl TeamFrom> for LamellarArrayRdmaInput { - /// Constructs a [OneSidedMemoryRegion][crate::memregion::OneSidedMemoryRegion] equal in length to `vals` and copies `vals` into it + /// Constructs a [OneSidedMemoryRegion] equal in length to `vals` and copies `vals` into it fn team_from(vals: Vec, team: &Pin>) -> Self { let buf: OneSidedMemoryRegion = team.alloc_one_sided_mem_region(vals.len()); unsafe { @@ -341,7 +330,7 @@ impl TeamFrom> for LamellarArrayRdmaInput { } } impl TeamFrom<&Vec> for LamellarArrayRdmaInput { - /// Constructs a [OneSidedMemoryRegion][crate::memregion::OneSidedMemoryRegion] equal in length to `vals` and copies `vals` into it + /// Constructs a [OneSidedMemoryRegion] equal in length to `vals` and copies `vals` into it fn team_from(vals: &Vec, team: &Pin>) -> Self { let buf: OneSidedMemoryRegion = team.alloc_one_sided_mem_region(vals.len()); unsafe { @@ -355,7 +344,7 @@ impl TeamFrom<&Vec> for LamellarArrayRdmaInput { } } impl TeamFrom<&[T]> for LamellarArrayRdmaInput { - /// Constructs a [OneSidedMemoryRegion][crate::memregion::OneSidedMemoryRegion] equal in length to `vals` and copies `vals` into it + /// Constructs a [OneSidedMemoryRegion] equal in length to `vals` and copies `vals` into it fn team_from(vals: &[T], team: &Pin>) -> Self { let buf: OneSidedMemoryRegion = team.alloc_one_sided_mem_region(vals.len()); unsafe { @@ -535,7 +524,7 @@ pub trait TeamFrom { // #[async_trait] /// Provides the same abstraction as the `From` trait in the standard language, but with a `team` parameter so that lamellar memory regions can be allocated /// and to be used within an async context -pub trait AsyncTeamFrom: TeamFrom { +pub trait AsyncTeamFrom: TeamFrom + Sized { /// Converts to this type from the input type fn team_from(val: T, team: &Pin>) -> impl Future + Send; } @@ -600,15 +589,15 @@ where #[derive(serde::Serialize, serde::Deserialize, Clone)] #[serde(bound = "T: Dist + serde::Serialize + serde::de::DeserializeOwned + 'static")] pub enum LamellarReadArray { - #[doc(hidden)] + /// UnsafeArray(UnsafeArray), - #[doc(hidden)] + /// ReadOnlyArray(ReadOnlyArray), - #[doc(hidden)] + /// AtomicArray(AtomicArray), - #[doc(hidden)] + /// LocalLockArray(LocalLockArray), - #[doc(hidden)] + /// GlobalLockArray(GlobalLockArray), } @@ -617,19 +606,19 @@ pub enum LamellarReadArray { #[derive(serde::Serialize, serde::Deserialize, Clone)] pub enum LamellarByteArray { //we intentially do not include "byte" in the variant name to ease construciton in the proc macros - #[doc(hidden)] + 
//#[doc(hidden)] UnsafeArray(UnsafeByteArray), - #[doc(hidden)] + //#[doc(hidden)] ReadOnlyArray(ReadOnlyByteArray), - #[doc(hidden)] + //#[doc(hidden)] AtomicArray(AtomicByteArray), - #[doc(hidden)] + //#[doc(hidden)] NativeAtomicArray(NativeAtomicByteArray), - #[doc(hidden)] + //#[doc(hidden)] GenericAtomicArray(GenericAtomicByteArray), - #[doc(hidden)] + //#[doc(hidden)] LocalLockArray(LocalLockByteArray), - #[doc(hidden)] + //#[doc(hidden)] GlobalLockArray(GlobalLockByteArray), } @@ -679,13 +668,13 @@ impl crate::active_messaging::DarcSerde for LamellarReadArray #[derive(serde::Serialize, serde::Deserialize, Clone)] #[serde(bound = "T: Dist + serde::Serialize + serde::de::DeserializeOwned")] pub enum LamellarWriteArray { - #[doc(hidden)] + /// UnsafeArray(UnsafeArray), - #[doc(hidden)] + /// AtomicArray(AtomicArray), - #[doc(hidden)] + /// LocalLockArray(LocalLockArray), - #[doc(hidden)] + /// GlobalLockArray(GlobalLockArray), } @@ -832,7 +821,7 @@ pub(crate) mod private { use enum_dispatch::enum_dispatch; use std::pin::Pin; use std::sync::Arc; - #[doc(hidden)] + //#[doc(hidden)] #[enum_dispatch(LamellarReadArray,LamellarWriteArray)] pub trait LamellarArrayPrivate: Clone { // // fn my_pe(&self) -> usize; @@ -845,7 +834,7 @@ pub(crate) mod private { fn as_lamellar_byte_array(&self) -> LamellarByteArray; } - #[doc(hidden)] + //#[doc(hidden)] #[enum_dispatch(LamellarReadArray,LamellarWriteArray)] pub(crate) trait ArrayExecAm { fn team(&self) -> Pin>; @@ -863,19 +852,19 @@ pub(crate) mod private { self.team() .exec_am_pe_tg(pe, am, Some(self.team_counters())) } - fn exec_arc_am_pe(&self, pe: usize, am: LamellarArcAm) -> AmHandle - where - F: AmDist, - { - self.team() - .exec_arc_am_pe(pe, am, Some(self.team_counters())) - } - fn exec_am_all(&self, am: F) -> MultiAmHandle - where - F: RemoteActiveMessage + LamellarAM + AmDist, - { - self.team().exec_am_all_tg(am, Some(self.team_counters())) - } + // fn exec_arc_am_pe(&self, pe: usize, am: LamellarArcAm) -> AmHandle + // where + // F: AmDist, + // { + // self.team() + // .exec_arc_am_pe(pe, am, Some(self.team_counters())) + // } + // fn exec_am_all(&self, am: F) -> MultiAmHandle + // where + // F: RemoteActiveMessage + LamellarAM + AmDist, + // { + // self.team().exec_am_all_tg(am, Some(self.team_counters())) + // } } } @@ -1502,7 +1491,7 @@ pub trait ArrayPrint: LamellarArray { /// This trait exposes a few common reductions implemented by the runtime /// as well as the ability the launch user defined reductions that have been registered with the runtime at compile time /// -/// Please see the documentation for the [register_reduction][lamellar_impl::register_reduction] procedural macro for +/// Please see the documentation for the [register_reduction] procedural macro for /// more details and examples on how to create your own reductions. /// /// Currently these are one sided reductions, meaning the calling PE will initiate the reduction, and launch the appropriate Active Messages @@ -1625,11 +1614,12 @@ pub trait LamellarArrayReduce: LamellarArrayInternalGet where T: Dist + AmDist + 'static, { + /// The Handle type returned by the reduce operation type Handle; #[doc(alias("One-sided", "onesided"))] /// Perform a reduction on the entire distributed array, returning the value to the calling PE. 
/// - /// Please see the documentation for the [register_reduction][lamellar_impl::register_reduction] procedural macro for + /// Please see the documentation for the [register_reduction] procedural macro for /// more details and examples on how to create your own reductions. /// /// # One-sided Operation diff --git a/src/array/atomic.rs b/src/array/atomic.rs index 2890dce1..afa4ae27 100644 --- a/src/array/atomic.rs +++ b/src/array/atomic.rs @@ -36,7 +36,6 @@ use std::ops::{ ShrAssign, SubAssign, }; -// #[doc(hidden)] /// An abstraction of an atomic element either via language supported Atomic integer types or through the use of an accompanying mutex. /// /// This type is returned when iterating over an AtomicArray as well as when accessing local elements through an [AtomicLocalData] handle. @@ -559,7 +558,7 @@ pub enum AtomicByteArray { } impl AtomicByteArray { - #[doc(hidden)] + //#[doc(hidden)] pub fn downgrade(array: &AtomicByteArray) -> AtomicByteArrayWeak { match array { AtomicByteArray::NativeAtomicByteArray(array) => { @@ -601,7 +600,7 @@ pub enum AtomicByteArrayWeak { } impl AtomicByteArrayWeak { - #[doc(hidden)] + //#[doc(hidden)] pub fn upgrade(&self) -> Option { match self { AtomicByteArrayWeak::NativeAtomicByteArrayWeak(array) => { @@ -1168,6 +1167,44 @@ impl From for AtomicArray { } impl AtomicArray { + #[doc(alias("One-sided", "onesided"))] + /// Perform a reduction on the entire distributed array, returning the value to the calling PE. + /// + /// Please see the documentation for the [register_reduction] procedural macro for + /// more details and examples on how to create your own reductions. + /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Reduce` active messages on the other PEs associated with the array. + /// the returned reduction result is only available on the calling PE + /// + /// # Safety + /// One thing to consider is that due to being a one sided reduction, safety is only gauranteed with respect to Atomicity of individual elements, + /// not with respect to the entire global array. This means that while one PE is performing a reduction, other PEs can atomically update their local + /// elements. While this is technically safe with respect to the integrity of an indivdual element (and with respect to the compiler), + /// it may not be your desired behavior. + /// + /// To be clear this behavior is not an artifact of lamellar, but rather the language itself, + /// for example if you have an `Arc>` shared on multiple threads, you could safely update the elements from each thread, + /// but performing a reduction could result in safe but non deterministic results. + /// + /// In Lamellar converting to a [ReadOnlyArray] before the reduction is a straightforward workaround to enusre the data is not changing during the reduction. + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// use rand::Rng; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = AtomicArray::::new(&world,1000000,Distribution::Block); + /// let array_clone = array.clone(); + /// let req = array.local_iter().for_each(move |_| { + /// let index = rand::thread_rng().gen_range(0..array_clone.len()); + /// array_clone.add(index,1); //randomly at one to an element in the array. 
+ /// }); + /// let sum = array.block_on(array.reduce("sum")); // equivalent to calling array.sum() + /// assert_eq!(array.len()*num_pes,sum); + ///``` pub fn reduce(&self, reduction: &str) -> AmHandle> { match self { AtomicArray::NativeAtomicArray(array) => array.reduce(reduction), @@ -1177,12 +1214,84 @@ impl AtomicArray { } impl AtomicArray { + #[doc(alias("One-sided", "onesided"))] + /// Perform a sum reduction on the entire distributed array, returning the value to the calling PE. + /// + /// This equivalent to `reduce("sum")`. + /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Sum` active messages on the other PEs associated with the array. + /// the returned sum reduction result is only available on the calling PE + /// + /// # Safety + /// One thing to consider is that due to being a one sided reduction, safety is only gauranteed with respect to Atomicity of individual elements, + /// not with respect to the entire global array. This means that while one PE is performing a reduction, other PEs can atomically update their local + /// elements. While this is technically safe with respect to the integrity of an indivdual element (and with respect to the compiler), + /// it may not be your desired behavior. + /// + /// To be clear this behavior is not an artifact of lamellar, but rather the language itself, + /// for example if you have an `Arc>` shared on multiple threads, you could safely update the elements from each thread, + /// but performing a reduction could result in safe but non deterministic results. + /// + /// In Lamellar converting to a [ReadOnlyArray] before the reduction is a straightforward workaround to enusre the data is not changing during the reduction. + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// use rand::Rng; + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = AtomicArray::::new(&world,1000000,Distribution::Block); + /// let array_clone = array.clone(); + /// let req = array.local_iter().for_each(move |_| { + /// let index = rand::thread_rng().gen_range(0..array_clone.len()); + /// array_clone.add(index,1); //randomly at one to an element in the array. + /// }); + /// let sum = array.block_on(array.sum()); + /// assert_eq!(array.len()*num_pes,sum); + /// ``` pub fn sum(&self) -> AmHandle> { match self { AtomicArray::NativeAtomicArray(array) => array.sum(), AtomicArray::GenericAtomicArray(array) => array.sum(), } } + + #[doc(alias("One-sided", "onesided"))] + /// Perform a production reduction on the entire distributed array, returning the value to the calling PE. + /// + /// This equivalent to `reduce("prod")`. + /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Prod` active messages on the other PEs associated with the array. + /// the returned prod reduction result is only available on the calling PE + /// + /// # Safety + /// One thing to consider is that due to being a one sided reduction, safety is only gauranteed with respect to Atomicity of individual elements, + /// not with respect to the entire global array. This means that while one PE is performing a reduction, other PEs can atomically update their local + /// elements. While this is technically safe with respect to the integrity of an indivdual element (and with respect to the compiler), + /// it may not be your desired behavior. 
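The safety notes above suggest converting to a `ReadOnlyArray` when a reduction must not race with concurrent updates. A sketch of that workaround; `into_read_only` and `ReadOnlyArray::sum` are assumptions here, modeled on the equivalent conversions and reductions shown for the other array types in this patch:

```rust
use lamellar::array::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();
    let array = AtomicArray::<usize>::new(&world, 1000, Distribution::Block);

    // concurrent atomic updates; wait_all drives any outstanding operations
    for i in 0..array.len() {
        array.add(i, 1);
    }
    array.wait_all();
    array.barrier();

    // Assumption: the conversion gives every PE a read-only view, so the one-sided
    // reduction below cannot overlap with further element updates.
    let read_only = array.into_read_only();
    let sum = read_only.block_on(read_only.sum());
    println!("global sum: {sum:?}");
}
```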
+ /// + /// To be clear this behavior is not an artifact of lamellar, but rather the language itself, + /// for example if you have an `Arc>` shared on multiple threads, you could safely update the elements from each thread, + /// but performing a reduction could result in safe but non deterministic results. + /// + /// In Lamellar converting to a [ReadOnlyArray] before the reduction is a straightforward workaround to enusre the data is not changing during the reduction. + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = AtomicArray::::new(&world,10,Distribution::Block); + /// let req = array.dist_iter().enumerate().for_each(move |(i,elem)| { + /// elem.store(i+1); + /// }); + /// array.wait_all(); + /// array.barrier(); + /// let prod = array.block_on(array.prod()); + /// assert_eq!((1..=array.len()).product::(),prod); + ///``` pub fn prod(&self) -> AmHandle> { match self { AtomicArray::NativeAtomicArray(array) => array.prod(), @@ -1191,12 +1300,74 @@ impl AtomicArray { } } impl AtomicArray { + #[doc(alias("One-sided", "onesided"))] + /// Find the max element in the entire destributed array, returning to the calling PE + /// + /// This equivalent to `reduce("max")`. + /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Max` active messages on the other PEs associated with the array. + /// the returned max reduction result is only available on the calling PE + /// + /// # Safety + /// One thing to consider is that due to being a one sided reduction, safety is only gauranteed with respect to Atomicity of individual elements, + /// not with respect to the entire global array. This means that while one PE is performing a reduction, other PEs can atomically update their local + /// elements. While this is technically safe with respect to the integrity of an indivdual element (and with respect to the compiler), + /// it may not be your desired behavior. + /// + /// To be clear this behavior is not an artifact of lamellar, but rather the language itself, + /// for example if you have an `Arc>` shared on multiple threads, you could safely update the elements from each thread, + /// but performing a reduction could result in safe but non deterministic results. + /// + /// In Lamellar converting to a [ReadOnlyArray] before the reduction is a straightforward workaround to enusre the data is not changing during the reduction. + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = AtomicArray::::new(&world,10,Distribution::Block); + /// let req = array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2)); + /// let max = array.block_on(array.max()); + /// assert_eq!((array.len()-1)*2,max); + ///``` pub fn max(&self) -> AmHandle> { match self { AtomicArray::NativeAtomicArray(array) => array.max(), AtomicArray::GenericAtomicArray(array) => array.max(), } } + #[doc(alias("One-sided", "onesided"))] + /// Find the min element in the entire destributed array, returning to the calling PE + /// + /// This equivalent to `reduce("min")`. + /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Min` active messages on the other PEs associated with the array. 
+ /// the returned min reduction result is only available on the calling PE + /// + /// # Safety + /// One thing to consider is that due to being a one sided reduction, safety is only gauranteed with respect to Atomicity of individual elements, + /// not with respect to the entire global array. This means that while one PE is performing a reduction, other PEs can atomically update their local + /// elements. While this is technically safe with respect to the integrity of an indivdual element (and with respect to the compiler), + /// it may not be your desired behavior. + /// + /// To be clear this behavior is not an artifact of lamellar, but rather the language itself, + /// for example if you have an `Arc>` shared on multiple threads, you could safely update the elements from each thread, + /// but performing a reduction could result in safe but non deterministic results. + /// + /// In Lamellar converting to a [ReadOnlyArray] before the reduction is a straightforward workaround to enusre the data is not changing during the reduction. + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = AtomicArray::::new(&world,10,Distribution::Block); + /// let req = array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2)); + /// let min = array.block_on(array.min()); + /// assert_eq!(0,min); + ///``` pub fn min(&self) -> AmHandle> { match self { AtomicArray::NativeAtomicArray(array) => array.min(), diff --git a/src/array/atomic/rdma.rs b/src/array/atomic/rdma.rs index b4a0ff81..75241f5a 100644 --- a/src/array/atomic/rdma.rs +++ b/src/array/atomic/rdma.rs @@ -4,41 +4,41 @@ use crate::array::LamellarWrite; use crate::array::*; use crate::memregion::Dist; -type GetFn = fn(AtomicByteArray, usize, usize) -> LamellarArcAm; -#[doc(hidden)] -pub struct AtomicArrayGet { - pub id: TypeId, - pub op: GetFn, -} -crate::inventory::collect!(AtomicArrayGet); -lazy_static! { - pub(crate) static ref GET_OPS: HashMap = { - let mut map = HashMap::new(); - for get in crate::inventory::iter:: { - map.insert(get.id.clone(),get.op); - } - map - // map.insert(TypeId::of::(), f64_add::add as AddFn ); - }; -} +// type GetFn = fn(AtomicByteArray, usize, usize) -> LamellarArcAm; +// //#[doc(hidden)] +// pub(crate) struct AtomicArrayGet { +// pub id: TypeId, +// pub op: GetFn, +// } +// crate::inventory::collect!(AtomicArrayGet); +// lazy_static! { +// pub(crate) static ref GET_OPS: HashMap = { +// let mut map = HashMap::new(); +// for get in crate::inventory::iter:: { +// map.insert(get.id.clone(),get.op); +// } +// map +// // map.insert(TypeId::of::(), f64_add::add as AddFn ); +// }; +// } -type PutFn = fn(AtomicByteArray, usize, usize, Vec) -> LamellarArcAm; -#[doc(hidden)] -pub struct AtomicArrayPut { - pub id: TypeId, - pub op: PutFn, -} -crate::inventory::collect!(AtomicArrayPut); -lazy_static! { - pub(crate) static ref PUT_OPS: HashMap = { - let mut map = HashMap::new(); - for put in crate::inventory::iter:: { - map.insert(put.id.clone(),put.op); - } - map - // map.insert(TypeId::of::(), f64_add::add as AddFn ); - }; -} +// type PutFn = fn(AtomicByteArray, usize, usize, Vec) -> LamellarArcAm; +// //#[doc(hidden)] +// pub(crate) struct AtomicArrayPut { +// pub id: TypeId, +// pub op: PutFn, +// } +// crate::inventory::collect!(AtomicArrayPut); +// lazy_static! 
{ +// pub(crate) static ref PUT_OPS: HashMap = { +// let mut map = HashMap::new(); +// for put in crate::inventory::iter:: { +// map.insert(put.id.clone(),put.op); +// } +// map +// // map.insert(TypeId::of::(), f64_add::add as AddFn ); +// }; +// } impl LamellarArrayGet for AtomicArray { unsafe fn get> + LamellarWrite>( diff --git a/src/array/generic_atomic.rs b/src/array/generic_atomic.rs index ed31f5f4..98e2bd3d 100644 --- a/src/array/generic_atomic.rs +++ b/src/array/generic_atomic.rs @@ -273,7 +273,7 @@ pub struct GenericAtomicByteArray { } impl GenericAtomicByteArray { - #[doc(hidden)] + //#[doc(hidden)] pub fn lock_index(&self, index: usize) -> MutexGuard<()> { let index = self .array @@ -283,7 +283,7 @@ impl GenericAtomicByteArray { self.locks[index].lock() } - #[doc(hidden)] + //#[doc(hidden)] pub fn downgrade(array: &GenericAtomicByteArray) -> GenericAtomicByteArrayWeak { GenericAtomicByteArrayWeak { locks: array.locks.clone(), @@ -300,7 +300,7 @@ pub struct GenericAtomicByteArrayWeak { } impl GenericAtomicByteArrayWeak { - #[doc(hidden)] + //#[doc(hidden)] pub fn upgrade(&self) -> Option { Some(GenericAtomicByteArray { locks: self.locks.clone(), @@ -437,7 +437,7 @@ impl GenericAtomicArray { } } } - +#[doc(hidden)] impl GenericAtomicArray { // pub fn wait_all(&self) { // self.array.wait_all(); @@ -457,7 +457,7 @@ impl GenericAtomicArray { // self.array.num_elems_local() // } - #[doc(hidden)] + //#[doc(hidden)] pub fn use_distribution(self, distribution: Distribution) -> Self { GenericAtomicArray { locks: self.locks.clone(), @@ -469,12 +469,12 @@ impl GenericAtomicArray { // self.array.num_pes() // } - // #[doc(hidden)] + // //#[doc(hidden)] // pub fn pe_for_dist_index(&self, index: usize) -> Option { // self.array.pe_for_dist_index(index) // } - // #[doc(hidden)] + // //#[doc(hidden)] // pub fn pe_offset_for_dist_index(&self, pe: usize, index: usize) -> Option { // self.array.pe_offset_for_dist_index(pe, index) // } @@ -487,7 +487,7 @@ impl GenericAtomicArray { // self.array.len() // } - #[doc(hidden)] + //#[doc(hidden)] pub fn local_data(&self) -> GenericAtomicLocalData { GenericAtomicLocalData { array: self.clone(), @@ -496,7 +496,7 @@ impl GenericAtomicArray { } } - #[doc(hidden)] + //#[doc(hidden)] pub fn mut_local_data(&self) -> GenericAtomicLocalData { GenericAtomicLocalData { array: self.clone(), @@ -505,11 +505,11 @@ impl GenericAtomicArray { } } - #[doc(hidden)] + //#[doc(hidden)] pub unsafe fn __local_as_slice(&self) -> &[T] { self.array.local_as_mut_slice() } - #[doc(hidden)] + //#[doc(hidden)] pub unsafe fn __local_as_mut_slice(&self) -> &mut [T] { self.array.local_as_mut_slice() } @@ -521,31 +521,31 @@ impl GenericAtomicArray { // } // } - #[doc(hidden)] + //#[doc(hidden)] pub fn into_unsafe(self) -> UnsafeArray { // println!("generic into_unsafe"); self.array.into() } - #[doc(hidden)] + //#[doc(hidden)] pub fn into_read_only(self) -> ReadOnlyArray { // println!("generic into_read_only"); self.array.into() } - #[doc(hidden)] + //#[doc(hidden)] pub fn into_local_lock(self) -> LocalLockArray { // println!("generic into_local_lock"); self.array.into() } - #[doc(hidden)] + //#[doc(hidden)] pub fn into_global_lock(self) -> GlobalLockArray { // println!("generic into_local_lock"); self.array.into() } - #[doc(hidden)] + //#[doc(hidden)] pub fn lock_index(&self, index: usize) -> MutexGuard<()> { // if let Some(ref locks) = *self.locks { // let start_index = (index * std::mem::size_of::()) / self.orig_t_size; @@ -567,9 +567,9 @@ impl GenericAtomicArray { 
self.locks[index].lock() } - pub fn async_barrier(&self) -> impl std::future::Future + Send + '_ { - self.array.async_barrier() - } + // pub(crate) fn async_barrier(&self) -> impl std::future::Future + Send + '_ { + // self.array.async_barrier() + // } } impl GenericAtomicArray { @@ -825,22 +825,27 @@ impl ArrayPrint for GenericAtomicArray { } impl GenericAtomicArray { + #[doc(hidden)] pub fn reduce(&self, op: &str) -> AmHandle> { self.array.reduce_data(op, self.clone().into()) } } impl GenericAtomicArray { + #[doc(hidden)] pub fn sum(&self) -> AmHandle> { self.reduce("sum") } + #[doc(hidden)] pub fn prod(&self) -> AmHandle> { self.reduce("prod") } } impl GenericAtomicArray { + #[doc(hidden)] pub fn max(&self) -> AmHandle> { self.reduce("max") } + #[doc(hidden)] pub fn min(&self) -> AmHandle> { self.reduce("min") } diff --git a/src/array/generic_atomic/iteration.rs b/src/array/generic_atomic/iteration.rs index ded8bf2d..6cdf4910 100644 --- a/src/array/generic_atomic/iteration.rs +++ b/src/array/generic_atomic/iteration.rs @@ -12,7 +12,7 @@ use crate::memregion::Dist; // RawRwLock, // }; -#[doc(hidden)] +//#[doc(hidden)] #[derive(Clone)] pub struct GenericAtomicDistIter { data: GenericAtomicArray, @@ -42,7 +42,7 @@ impl std::fmt::Debug for GenericAtomicDistIter { } } -#[doc(hidden)] +//#[doc(hidden)] #[derive(Clone)] pub struct GenericAtomicLocalIter { data: GenericAtomicArray, diff --git a/src/array/global_lock_atomic.rs b/src/array/global_lock_atomic.rs index 9486625e..ac05b10a 100644 --- a/src/array/global_lock_atomic.rs +++ b/src/array/global_lock_atomic.rs @@ -251,6 +251,7 @@ impl<'a, T: Dist> IntoIterator for &'a GlobalLockLocalData { } } +/// Captures a read lock on the array, allowing immutable access to the underlying data #[derive(Clone)] pub struct GlobalLockReadGuard { pub(crate) array: GlobalLockArray, @@ -258,6 +259,7 @@ pub struct GlobalLockReadGuard { } impl GlobalLockReadGuard { + /// Access the underlying local data through the read lock pub fn local_data(&self) -> GlobalLockLocalData { GlobalLockLocalData { array: self.array.clone(), @@ -269,6 +271,7 @@ impl GlobalLockReadGuard { } } +/// Captures a write lock on the array, allowing mutable access to the underlying data pub struct GlobalLockWriteGuard { pub(crate) array: GlobalLockArray, lock_guard: GlobalRwDarcWriteGuard<()>, @@ -284,6 +287,7 @@ impl From> for GlobalLockWriteGuard { } impl GlobalLockWriteGuard { + /// Access the underlying local data through the write lock pub fn local_data(self) -> GlobalLockMutLocalData { GlobalLockMutLocalData { array: self.array.clone(), @@ -345,6 +349,27 @@ impl GlobalLockArray { } } + #[doc(alias("One-sided", "onesided"))] + /// Return a global read lock guard on the calling PE + /// + /// this function will block the thread until the lock is acquired + /// Calling within an asynchronous block may lead to deadlock, use [read_lock](self::GlobalLockArray::read_lock) instead. 
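A short sketch of using the new guard types above from an async task: the write guard consumes itself to hand out `GlobalLockMutLocalData`, while the read guard's `local_data` borrows. Indexed assignment on the mutable handle and summing the read handle through its `IntoIterator` impl are assumptions based on the hunks above:

```rust
use lamellar::array::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();
    let my_pe = world.my_pe();
    let array = GlobalLockArray::<usize>::new(&world, 100, Distribution::Block);
    let array_clone = array.clone();
    world.block_on(async move {
        {
            // write_lock().await suspends the task, not the worker thread
            let guard = array_clone.write_lock().await;
            let mut local = guard.local_data(); // consumes the guard
            local[0] = my_pe; // assumption: indexed assignment on the mutable handle
        } // the write lock travels with `local` and is released when it drops

        let guard = array_clone.read_lock().await;
        let local = guard.local_data();
        let local_sum: usize = (&local).into_iter().sum();
        println!("PE{my_pe} local sum: {local_sum}");
    });
}
```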
+ /// + /// # One-sided Operation + /// Only explictly requires the calling PE, although the global lock may be managed by other PEs + /// + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// + /// let read_lock = array.blocking_read_lock(); + /// //do interesting work + /// + ///``` pub fn blocking_read_lock(&self) -> GlobalLockReadGuard { let self_clone: GlobalLockArray = self.clone(); self.block_on(async move { @@ -355,6 +380,26 @@ impl GlobalLockArray { }) } + #[doc(alias("One-sided", "onesided"))] + /// Return a global read lock guard on the calling PE + /// + /// this function will block the calling task until the lock is acquired (but not the calling thread) + /// + /// # One-sided Operation + /// Only explictly requires the calling PE, although the global lock may be managed by other PEs + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// + /// world.block_on(async move { + /// let read_lock = array.read_lock().await; + /// //do interesting work + /// }); + ///``` pub async fn read_lock(&self) -> GlobalLockReadGuard { GlobalLockReadGuard { array: self.clone(), @@ -362,6 +407,27 @@ impl GlobalLockArray { } } + #[doc(alias("One-sided", "onesided"))] + /// Return a global write lock guard on the calling PE + /// + /// this function will block the thread until the lock is acquired + /// Calling within an asynchronous block may lead to deadlock, use [write_lock](self::GlobalLockArray::write_lock) instead. 
+ /// + /// # One-sided Operation + /// Only explictly requires the calling PE, although the global lock may be managed by other PEs + /// + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// + /// let write_lock = array.blocking_write_lock(); + /// //do interesting work + /// + ///``` pub fn blocking_write_lock(&self) -> GlobalLockWriteGuard { let self_clone: GlobalLockArray = self.clone(); self.block_on(async move { @@ -372,6 +438,26 @@ impl GlobalLockArray { }) } + #[doc(alias("One-sided", "onesided"))] + /// Return a global write lock guard on the calling PE + /// + /// this function will block the calling task until the lock is acquired (but not the calling thread) + /// + /// # One-sided Operation + /// Only explictly requires the calling PE, although the global lock may be managed by other PEs + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// + /// world.block_on(async move { + /// let write_lock = array.write_lock().await; + /// //do interesting work + /// }); + ///``` pub async fn write_lock(&self) -> GlobalLockWriteGuard { GlobalLockWriteGuard { array: self.clone(), @@ -728,9 +814,9 @@ impl GlobalLockArray { self.array.into() } - pub fn async_barrier(&self) -> impl std::future::Future + Send + '_ { - self.array.async_barrier() - } + // pub(crate) fn async_barrier(&self) -> impl std::future::Future + Send + '_ { + // self.array.async_barrier() + // } } impl GlobalLockArray { @@ -1026,7 +1112,7 @@ impl ArrayPrint for GlobalLockArray { } } -#[doc(hidden)] +//#[doc(hidden)] #[pin_project] pub struct GlobalLockArrayReduceHandle { req: AmHandle>, @@ -1057,6 +1143,31 @@ impl Future for GlobalLockArrayReduceHandle { } impl GlobalLockReadGuard { + #[doc(alias("One-sided", "onesided"))] + /// Perform a reduction on the entire distributed array, returning the value to the calling PE. + /// + /// Please see the documentation for the [register_reduction] procedural macro for + /// more details and examples on how to create your own reductions. + /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Reduce` active messages on the other PEs associated with the array. + /// the returned reduction result is only available on the calling PE + /// + /// # Safety + /// the global read lock ensures atomicity of the entire array, i.e. 
individual elements can not being modified before the call completes + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// use rand::Rng; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = GlobalLockArray::::new(&world,10,Distribution::Block); + /// array.block_on(array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2))); + /// let read_guard = array.blocking_read_lock(); + /// let prod = array.block_on(read_guard.reduce("prod")); + ///``` pub fn reduce(self, op: &str) -> GlobalLockArrayReduceHandle { GlobalLockArrayReduceHandle { req: self.array.array.reduce_data(op, self.array.clone().into()), @@ -1065,17 +1176,111 @@ impl GlobalLockReadGuard { } } impl GlobalLockReadGuard { + #[doc(alias("One-sided", "onesided"))] + /// Perform a sum reduction on the entire distributed array, returning the value to the calling PE. + /// + /// This equivalent to `reduce("sum")`. + /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Sum` active messages on the other PEs associated with the array. + /// the returned sum reduction result is only available on the calling PE + /// + /// # Safety + /// the global read lock ensures atomicity of the entire array, i.e. individual elements can not being modified before the call completes + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// use rand::Rng; + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = GlobalLockArray::::new(&world,10,Distribution::Block); + /// array.block_on(array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2))); + /// let read_guard = array.blocking_read_lock(); + /// let sum = array.block_on(read_guard.sum()); + /// ``` pub fn sum(self) -> GlobalLockArrayReduceHandle { self.reduce("sum") } + + #[doc(alias("One-sided", "onesided"))] + /// Perform a production reduction on the entire distributed array, returning the value to the calling PE. + /// + /// This equivalent to `reduce("prod")`. + /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Prod` active messages on the other PEs associated with the array. + /// the returned prod reduction result is only available on the calling PE + /// + /// # Safety + /// the global read lock ensures atomicity of the entire array, i.e. individual elements can not being modified before the call completes + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = GlobalLockArray::::new(&world,10,Distribution::Block); + /// array.block_on(array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2))); + /// let read_guard = array.blocking_read_lock(); + /// let prod = array.block_on(read_guard.prod()); + /// assert_eq!((1..=array.len()).product::(),prod); + ///``` pub fn prod(self) -> GlobalLockArrayReduceHandle { self.reduce("prod") } } impl GlobalLockReadGuard { + #[doc(alias("One-sided", "onesided"))] + /// Find the max element in the entire destributed array, returning to the calling PE + /// + /// This equivalent to `reduce("max")`. + /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Max` active messages on the other PEs associated with the array. 
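Since `GlobalLockArrayReduceHandle` is itself a future that keeps the read guard alive until the reduction finishes, the whole sequence can also run inside an async task rather than through `block_on`; the result is printed with `{:?}` because the handle's output type is not spelled out in this hunk:

```rust
use lamellar::array::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();
    let array = GlobalLockArray::<usize>::new(&world, 10, Distribution::Block);
    let array_clone = array.clone();
    world.block_on(async move {
        // ... fill the array ...
        // acquire the global read lock without blocking the worker thread, then
        // await the reduction handle; the handle holds the guard until it resolves
        let guard = array_clone.read_lock().await;
        let sum = guard.sum().await;
        println!("sum: {sum:?}");
    });
}
```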
+ /// the returned max reduction result is only available on the calling PE + /// + /// # Safety + /// the global read lock ensures atomicity of the entire array, i.e. individual elements can not being modified before the call completes + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = GlobalLockArray::::new(&world,10,Distribution::Block); + /// array.block_on(array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2))); + /// let read_guard = array.blocking_read_lock(); + /// let max = array.block_on(read_guard.max()); + /// assert_eq!((array.len()-1)*2,max); + ///``` pub fn max(self) -> GlobalLockArrayReduceHandle { self.reduce("max") } + + #[doc(alias("One-sided", "onesided"))] + /// Find the min element in the entire destributed array, returning to the calling PE + /// + /// This equivalent to `reduce("min")`. + /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Min` active messages on the other PEs associated with the array. + /// the returned min reduction result is only available on the calling PE + /// + /// # Safety + /// the global read lock ensures atomicity of the entire array, i.e. individual elements can not being modified before the call completes + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = GlobalLockArray::::new(&world,10,Distribution::Block); + /// array.block_on(array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2))); + /// let read_guard = array.blocking_read_lock(); + /// let min = array.block_on(read_guard.min()); + /// assert_eq!(0,min); + ///``` pub fn min(self) -> GlobalLockArrayReduceHandle { self.reduce("min") } diff --git a/src/array/global_lock_atomic/iteration.rs b/src/array/global_lock_atomic/iteration.rs index 81354f24..a5f642ff 100644 --- a/src/array/global_lock_atomic/iteration.rs +++ b/src/array/global_lock_atomic/iteration.rs @@ -10,7 +10,7 @@ use crate::array::*; use crate::darc::global_rw_darc::GlobalRwDarcReadGuard; use crate::memregion::Dist; -#[doc(hidden)] +//#[doc(hidden)] #[derive(Clone)] pub struct GlobalLockDistIter { data: GlobalLockArray, @@ -44,7 +44,7 @@ impl std::fmt::Debug for GlobalLockDistIter { } } -#[doc(hidden)] +//#[doc(hidden)] #[derive(Clone)] pub struct GlobalLockLocalIter { data: GlobalLockArray, diff --git a/src/array/handle.rs b/src/array/handle.rs index 1d912fc9..b2c24ecd 100644 --- a/src/array/handle.rs +++ b/src/array/handle.rs @@ -13,6 +13,7 @@ use crate::{ Dist, OneSidedMemoryRegion, RegisteredMemoryRegion, }; +/// a task handle for an array rdma (put/get) operation pub struct ArrayRdmaHandle { pub(crate) reqs: VecDeque>, } @@ -50,6 +51,7 @@ impl Future for ArrayRdmaHandle { } } +/// a task handle for an array rdma 'at' operation #[pin_project] pub struct ArrayRdmaAtHandle { pub(crate) req: Option>, diff --git a/src/array/iterator/distributed_iterator.rs b/src/array/iterator/distributed_iterator.rs index 3f06b7f2..d1ce490b 100644 --- a/src/array/iterator/distributed_iterator.rs +++ b/src/array/iterator/distributed_iterator.rs @@ -52,7 +52,7 @@ use std::marker::PhantomData; use std::pin::Pin; use std::sync::Arc; -// #[doc(hidden)] +// //#[doc(hidden)] // pub struct DistIterForEachHandle { // pub(crate) reqs: Vec>>, // } @@ -63,7 +63,7 @@ use std::sync::Arc; // } // } -// #[doc(hidden)] +// 
//#[doc(hidden)] // #[async_trait] // impl IterRequest for DistIterForEachHandle { // type Output = (); @@ -79,7 +79,7 @@ use std::sync::Arc; // } // } -// #[doc(hidden)] +// //#[doc(hidden)] // pub struct DistIterCollectHandle> + SyncSend> { // pub(crate) reqs: Vec>>>, // pub(crate) distribution: Distribution, @@ -251,16 +251,16 @@ pub trait DistIteratorLauncher { I: DistributedIterator + 'static, I::Item: Dist + ArrayOps + std::iter::Sum; - #[doc(hidden)] + //#[doc(hidden)] fn global_index_from_local(&self, index: usize, chunk_size: usize) -> Option; - #[doc(hidden)] + //#[doc(hidden)] fn subarray_index_from_local(&self, index: usize, chunk_size: usize) -> Option; - // #[doc(hidden)] + // //#[doc(hidden)] // fn subarray_pe_and_offset_for_global_index(&self, index: usize, chunk_size: usize) -> Option<(usize,usize)>; - #[doc(hidden)] + //#[doc(hidden)] fn team(&self) -> Pin>; } diff --git a/src/array/iterator/distributed_iterator/consumer/collect.rs b/src/array/iterator/distributed_iterator/consumer/collect.rs index 156baa60..ece9e23e 100644 --- a/src/array/iterator/distributed_iterator/consumer/collect.rs +++ b/src/array/iterator/distributed_iterator/consumer/collect.rs @@ -18,7 +18,7 @@ use std::sync::Arc; use std::task::{Context, Poll, Waker}; #[derive(Clone, Debug)] -pub struct Collect { +pub(crate) struct Collect { pub(crate) iter: Monotonic, pub(crate) distribution: Distribution, pub(crate) _phantom: PhantomData, @@ -78,7 +78,7 @@ where } #[derive(Debug)] -pub struct CollectAsync { +pub(crate) struct CollectAsync { pub(crate) iter: Monotonic, pub(crate) distribution: Distribution, pub(crate) _phantom: PhantomData<(A, B)>, @@ -154,7 +154,7 @@ where } } -#[doc(hidden)] +//#[doc(hidden)] #[pin_project] pub struct DistIterCollectHandle< T: Dist + ArrayOps, diff --git a/src/array/iterator/distributed_iterator/consumer/count.rs b/src/array/iterator/distributed_iterator/consumer/count.rs index ed426ffa..021df215 100644 --- a/src/array/iterator/distributed_iterator/consumer/count.rs +++ b/src/array/iterator/distributed_iterator/consumer/count.rs @@ -21,7 +21,7 @@ use std::sync::{ use std::task::{Context, Poll, Waker}; #[derive(Clone, Debug)] -pub struct Count { +pub(crate) struct Count { pub(crate) iter: I, } @@ -71,7 +71,7 @@ where } } -#[doc(hidden)] +//#[doc(hidden)] #[pin_project] pub struct DistIterCountHandle { pub(crate) reqs: VecDeque>, @@ -155,7 +155,7 @@ impl Future for DistIterCountHandle { } } -#[doc(hidden)] +//#[doc(hidden)] #[async_trait] impl LamellarRequest for DistIterCountHandle { fn blocking_wait(mut self) -> Self::Output { diff --git a/src/array/iterator/distributed_iterator/consumer/for_each.rs b/src/array/iterator/distributed_iterator/consumer/for_each.rs index 5de9679c..dd202970 100644 --- a/src/array/iterator/distributed_iterator/consumer/for_each.rs +++ b/src/array/iterator/distributed_iterator/consumer/for_each.rs @@ -13,7 +13,7 @@ use std::sync::Arc; use std::task::{Context, Poll, Waker}; #[derive(Clone, Debug)] -pub struct ForEach +pub(crate) struct ForEach where I: DistributedIterator + 'static, F: Fn(I::Item) + SyncSend + Clone + 'static, @@ -73,7 +73,7 @@ where } #[derive(Debug)] -pub struct ForEachAsync +pub(crate) struct ForEachAsync where I: DistributedIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, @@ -154,7 +154,7 @@ where } } -#[doc(hidden)] +//#[doc(hidden)] pub struct DistIterForEachHandle { pub(crate) reqs: VecDeque>, } @@ -172,7 +172,7 @@ impl Future for DistIterForEachHandle { } } -#[doc(hidden)] +//#[doc(hidden)] impl 
LamellarRequest for DistIterForEachHandle { fn blocking_wait(mut self) -> Self::Output { for req in self.reqs.drain(..) { diff --git a/src/array/iterator/distributed_iterator/consumer/reduce.rs b/src/array/iterator/distributed_iterator/consumer/reduce.rs index 348f31f4..7715c79f 100644 --- a/src/array/iterator/distributed_iterator/consumer/reduce.rs +++ b/src/array/iterator/distributed_iterator/consumer/reduce.rs @@ -17,7 +17,7 @@ use std::sync::Arc; use std::task::{Context, Poll, Waker}; #[derive(Clone, Debug)] -pub struct Reduce { +pub(crate) struct Reduce { pub(crate) iter: I, pub(crate) op: F, } @@ -74,7 +74,7 @@ where } } -#[doc(hidden)] +//#[doc(hidden)] #[pin_project] pub struct DistIterReduceHandle { pub(crate) reqs: VecDeque>>, @@ -182,7 +182,7 @@ where } } -#[doc(hidden)] +//#[doc(hidden)] impl LamellarRequest for DistIterReduceHandle where T: Dist + ArrayOps, diff --git a/src/array/iterator/distributed_iterator/consumer/sum.rs b/src/array/iterator/distributed_iterator/consumer/sum.rs index f6822835..affe38e2 100644 --- a/src/array/iterator/distributed_iterator/consumer/sum.rs +++ b/src/array/iterator/distributed_iterator/consumer/sum.rs @@ -15,7 +15,7 @@ use std::sync::Arc; use std::task::{Context, Poll, Waker}; #[derive(Clone, Debug)] -pub struct Sum { +pub(crate) struct Sum { pub(crate) iter: I, } @@ -66,7 +66,7 @@ where } } -#[doc(hidden)] +//#[doc(hidden)] #[pin_project] pub struct DistIterSumHandle { pub(crate) reqs: VecDeque>, @@ -158,7 +158,7 @@ where } } } -#[doc(hidden)] +//#[doc(hidden)] impl LamellarRequest for DistIterSumHandle where T: Dist + ArrayOps + std::iter::Sum, diff --git a/src/array/iterator/local_iterator.rs b/src/array/iterator/local_iterator.rs index 9c51120b..cf389e92 100644 --- a/src/array/iterator/local_iterator.rs +++ b/src/array/iterator/local_iterator.rs @@ -134,13 +134,13 @@ pub trait LocalIteratorLauncher { I: LocalIterator + 'static, I::Item: SyncSend + std::iter::Sum; - #[doc(hidden)] + //#[doc(hidden)] fn local_global_index_from_local(&self, index: usize, chunk_size: usize) -> Option; - #[doc(hidden)] + //#[doc(hidden)] fn local_subarray_index_from_local(&self, index: usize, chunk_size: usize) -> Option; - #[doc(hidden)] + //#[doc(hidden)] fn team(&self) -> Pin>; } @@ -699,6 +699,13 @@ pub trait IndexedLocalIterator: LocalIterator + SyncSend + IterClone + 'static { /// /// Returns an iterator that itself returns [Iterator]s over the chunked slices of the array. 
/// If the number of elements is not evenly divisible by `size`, the last chunk may be shorter than `size` + /// + /// # Note + /// If calling this on a LocalLockArray it may be possible to call [blocking_read_local_chunks](crate::array::LocalLockArray::blocking_read_local_chunks), [read_local_chunks](crate::array::LocalLockArray::read_local_chunks) + /// [blocking_write_local_chunks](crate::array::LocalLockArray::blocking_write_local_chunks), or [write_local_chunks](crate::array::LocalLockArray::blocking_write_local_chunks) for better performance + /// + /// If calling this on an UnsafeArray it may be possible to call [local_chunks](crate::array::UnsafeArray::local_chunks) or [local_chunks_mut](crate::array::UnsafeArray::local_chunks_mut) + /// /// # Examples ///``` /// use lamellar::array::prelude::*; diff --git a/src/array/iterator/local_iterator/consumer/collect.rs b/src/array/iterator/local_iterator/consumer/collect.rs index ae96c6e6..221747d2 100644 --- a/src/array/iterator/local_iterator/consumer/collect.rs +++ b/src/array/iterator/local_iterator/consumer/collect.rs @@ -18,7 +18,7 @@ use std::sync::Arc; use std::task::{Context, Poll, Waker}; #[derive(Clone, Debug)] -pub struct Collect { +pub(crate) struct Collect { pub(crate) iter: Monotonic, pub(crate) distribution: Distribution, pub(crate) _phantom: PhantomData, @@ -78,7 +78,7 @@ where } } -#[doc(hidden)] +//#[doc(hidden)] #[pin_project] pub struct LocalIterCollectHandle< T: Dist + ArrayOps, diff --git a/src/array/iterator/local_iterator/consumer/count.rs b/src/array/iterator/local_iterator/consumer/count.rs index 9482dee2..a1f8d191 100644 --- a/src/array/iterator/local_iterator/consumer/count.rs +++ b/src/array/iterator/local_iterator/consumer/count.rs @@ -13,7 +13,7 @@ use std::sync::Arc; use std::task::{Context, Poll, Waker}; #[derive(Clone, Debug)] -pub struct Count { +pub(crate) struct Count { pub(crate) iter: I, } @@ -62,7 +62,7 @@ where } } -#[doc(hidden)] +//#[doc(hidden)] #[pin_project] pub struct LocalIterCountHandle { pub(crate) reqs: VecDeque>, @@ -92,7 +92,7 @@ impl Future for LocalIterCountHandle { } } -#[doc(hidden)] +//#[doc(hidden)] impl LamellarRequest for LocalIterCountHandle { fn blocking_wait(mut self) -> Self::Output { self.reqs diff --git a/src/array/iterator/local_iterator/consumer/for_each.rs b/src/array/iterator/local_iterator/consumer/for_each.rs index 7abdc16c..013bfab2 100644 --- a/src/array/iterator/local_iterator/consumer/for_each.rs +++ b/src/array/iterator/local_iterator/consumer/for_each.rs @@ -13,7 +13,7 @@ use std::sync::Arc; use std::task::{Context, Poll, Waker}; #[derive(Clone, Debug)] -pub struct ForEach +pub(crate) struct ForEach where I: LocalIterator + 'static, F: Fn(I::Item) + SyncSend + Clone + 'static, @@ -76,7 +76,7 @@ where } #[derive(Debug)] -pub struct ForEachAsync +pub(crate) struct ForEachAsync where I: LocalIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, @@ -154,7 +154,7 @@ where } } -#[doc(hidden)] +//#[doc(hidden)] pub struct LocalIterForEachHandle { pub(crate) reqs: VecDeque>, } @@ -172,7 +172,7 @@ impl Future for LocalIterForEachHandle { } } -#[doc(hidden)] +//#[doc(hidden)] impl LamellarRequest for LocalIterForEachHandle { fn blocking_wait(mut self) -> Self::Output { for req in self.reqs.drain(..) 
{ diff --git a/src/array/iterator/local_iterator/consumer/reduce.rs b/src/array/iterator/local_iterator/consumer/reduce.rs index a10a5113..305921bf 100644 --- a/src/array/iterator/local_iterator/consumer/reduce.rs +++ b/src/array/iterator/local_iterator/consumer/reduce.rs @@ -14,7 +14,7 @@ use std::sync::Arc; use std::task::{Context, Poll, Waker}; #[derive(Clone, Debug)] -pub struct Reduce { +pub(crate) struct Reduce { pub(crate) iter: I, pub(crate) op: F, } @@ -70,7 +70,7 @@ where } } -#[doc(hidden)] +//#[doc(hidden)] #[pin_project] pub struct LocalIterReduceHandle { pub(crate) reqs: VecDeque>>, @@ -112,7 +112,7 @@ where } } -#[doc(hidden)] +//#[doc(hidden)] impl LamellarRequest for LocalIterReduceHandle where T: SyncSend + Copy + 'static, diff --git a/src/array/iterator/local_iterator/consumer/sum.rs b/src/array/iterator/local_iterator/consumer/sum.rs index 4908e2db..d1ca2bbe 100644 --- a/src/array/iterator/local_iterator/consumer/sum.rs +++ b/src/array/iterator/local_iterator/consumer/sum.rs @@ -64,7 +64,7 @@ where } } -#[doc(hidden)] +//#[doc(hidden)] #[pin_project] pub struct LocalIterSumHandle { pub(crate) reqs: VecDeque>, @@ -105,7 +105,7 @@ where } } -#[doc(hidden)] +//#[doc(hidden)] impl LamellarRequest for LocalIterSumHandle where T: SyncSend + std::iter::Sum + for<'a> std::iter::Sum<&'a T> + 'static, diff --git a/src/array/iterator/mod.rs b/src/array/iterator/mod.rs index 9ee9a4b2..57d59234 100644 --- a/src/array/iterator/mod.rs +++ b/src/array/iterator/mod.rs @@ -9,7 +9,7 @@ pub mod consumer; use crate::memregion::Dist; -// #[doc(hidden)] +// //#[doc(hidden)] // #[async_trait] // pub trait IterRequest { // type Output; @@ -53,15 +53,15 @@ pub enum Schedule { /// /// This is only implemented for Safe Array types, [UnsafeArray][crate::array::UnsafeArray] directly provides unsafe versions of the same functions pub trait LamellarArrayIterators { - /// The [DistributedIterator][crate::array::DistributedIterator] type + /// The [DistributedIterator] type type DistIter: DistributedIterator; - /// The [LocalIterator][crate::array::LocalIterator] type + /// The [LocalIterator] type type LocalIter: LocalIterator; - /// The [OneSidedIterator][crate::array::OneSidedIterator] type + /// The [OneSidedIterator] type type OnesidedIter: OneSidedIterator; #[doc(alias = "Collective")] - /// Create an immutable [DistributedIterator][crate::array::DistributedIterator] for this array + /// Create an immutable [DistributedIterator] for this array /// /// # Collective Operation /// Requires all PEs associated with the array to enter the call otherwise deadlock will occur (i.e. barriers are being called internally) @@ -81,7 +81,7 @@ pub trait LamellarArrayIterators { fn dist_iter(&self) -> Self::DistIter; #[doc(alias("One-sided", "onesided"))] - /// Create an immutable [LocalIterator][crate::array::LocalIterator] for this array + /// Create an immutable [LocalIterator] for this array /// /// # One-sided Operation /// The iteration is launched and local to only the calling PE. @@ -101,7 +101,7 @@ pub trait LamellarArrayIterators { fn local_iter(&self) -> Self::LocalIter; #[doc(alias("One-sided", "onesided"))] - /// Create an immutable [OneSidedIterator][crate::array::OneSidedIterator] for this array + /// Create an immutable [OneSidedIterator] for this array /// /// # One-sided Operation /// The iteration is launched and local to only the calling PE. 
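[Editor's note] The trait docs above distinguish collective distributed iterators from one-sided local and one-sided serial iterators. The following sketch is not part of the patch; it contrasts the three, modeled on the doc-test patterns used elsewhere in this patch, and the element type and array length are illustrative assumptions.
```rust
use lamellar::array::prelude::*;

let world = LamellarWorldBuilder::new().build();
let my_pe = world.my_pe();
let array = ReadOnlyArray::<usize>::new(&world, 100, Distribution::Block);

// Collective: every PE must participate (barriers are called internally).
array.dist_iter().for_each(move |elem| println!("PE{my_pe} dist sees {elem}"));
array.wait_all();
array.barrier();

// One-sided: iterate only over the data local to the calling PE.
array.local_iter().for_each(move |elem| println!("PE{my_pe} local sees {elem}"));
array.wait_all();

// One-sided: serially iterate over the entire distributed array from a single PE.
if my_pe == 0 {
    let sum: usize = array.onesided_iter().into_iter().map(|elem| *elem).sum();
    println!("sum = {sum}");
}
```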
@@ -123,7 +123,7 @@ pub trait LamellarArrayIterators { fn onesided_iter(&self) -> Self::OnesidedIter; #[doc(alias("One-sided", "onesided"))] - /// Create an immutable [OneSidedIterator][crate::array::OneSidedIterator] for this array + /// Create an immutable [OneSidedIterator] for this array /// which will transfer and buffer `buf_size` elements at a time (to more efficient utilize the underlying lamellae network) /// /// The buffering is transparent to the user. @@ -154,13 +154,13 @@ pub trait LamellarArrayIterators { /// /// This is only implemented for Safe Array types, [UnsafeArray][crate::array::UnsafeArray] directly provides unsafe versions of the same functions pub trait LamellarArrayMutIterators { - /// The [DistributedIterator][crate::array::DistributedIterator] type + /// The [DistributedIterator] type type DistIter: DistributedIterator; - /// The [LocalIterator][crate::array::LocalIterator] type + /// The [LocalIterator]type type LocalIter: LocalIterator; #[doc(alias = "Collective")] - /// Create a mutable [DistributedIterator][crate::array::DistributedIterator] for this array + /// Create a mutable [DistributedIterator] for this array /// /// # Collective Operation /// Requires all PEs associated with the array to enter the call otherwise deadlock will occur (i.e. barriers are being called internally) @@ -180,7 +180,7 @@ pub trait LamellarArrayMutIterators { fn dist_iter_mut(&self) -> Self::DistIter; #[doc(alias("One-sided", "onesided"))] - /// Create a mutable [LocalIterator][crate::array::LocalIterator] for this array + /// Create a mutable [LocalIterator] for this array /// /// # One-sided Operation /// The iteration is launched and local to only the calling PE. diff --git a/src/array/iterator/one_sided_iterator.rs b/src/array/iterator/one_sided_iterator.rs index bb43c1cf..633f4949 100644 --- a/src/array/iterator/one_sided_iterator.rs +++ b/src/array/iterator/one_sided_iterator.rs @@ -243,7 +243,7 @@ pub trait OneSidedIterator: private::OneSidedIteratorInner { // Buffered::new(self, buf_size) // } - /// Convert self one-sided iterator into a standard Rust Iterator, enabling one to use any of the functions available on `Iterator`s + /// Convert a one-sided iterator into a standard Rust [Iterator], enabling one to use any of the functions available on `Iterator`s /// /// # Examples ///``` @@ -279,6 +279,28 @@ pub trait OneSidedIterator: private::OneSidedIteratorInner { OneSidedIteratorIter { iter: self } } + /// Convert a one-sided iterator into a standard Rust [Stream] for iteration in async contexts, enabling one to use any of the functions available on `Stream`s + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = LocalLockArray::::new(&world,8,Distribution::Block); + /// let my_pe = world.my_pe(); + /// array.dist_iter_mut().for_each(move|e| *e = my_pe); //initialize array using a distributed iterator + /// array.wait_all(); + /// world.block_on (async move { + /// if my_pe == 0 { + /// let sum = array.onesided_iter().into_stream().take(4).map(|elem| *elem as f64).sum::().await; + /// println!("Sum: {sum}") + /// } + /// }); + /// ``` + /// Output on a 4 PE execution + ///```text + /// Sum: 2.0 + ///``` fn into_stream(mut self) -> OneSidedStream where Self: Sized + Send, @@ -289,9 +311,9 @@ pub trait OneSidedIterator: private::OneSidedIteratorInner { } } -/// An immutable standard Rust Iterator backed by a [OneSidedIterator](crate::array::iterator::one_sided_iterator). 
+/// An immutable standard Rust [Iterator] backed by a [OneSidedIterator](crate::array::iterator::one_sided_iterator). /// -/// This object iterates over data serially on a single PE ; compare with [distributed iterators](crate::array::iterator::distributed_iterator), which iterate over data in on all PEs associate with the array. +/// This object iterates over data serially on a single PE ; compare with [distributed iterators](crate::array::iterator::distributed_iterator), which iterate over data on all PEs associate with the array. /// /// This struct is created by calling [into_iter][OneSidedIterator::into_iter] a OneSidedIterator /// @@ -321,6 +343,26 @@ where } } +/// An immutable standard Rust [Stream] backed by a [OneSidedIterator](crate::array::iterator::one_sided_iterator) for iteration in async contexts. +/// +/// This object iterates over data serially on a single PE ; compare with [distributed iterators](crate::array::iterator::distributed_iterator), which iterate over data on all PEs associate with the array. +/// +/// This struct is created by calling [into_stream][OneSidedIterator::into_iter] a OneSidedIterator +/// +/// # Examples +///``` +/// use lamellar::array::prelude::*; +/// use futures::stream::StreamExt; +/// +/// let world = LamellarWorldBuilder::new().build(); +/// let array = AtomicArray::::new(&world,100,Distribution::Block); +/// world.block_on(async move { +/// let stream = array.onesided_iter().into_stream(); +/// while let Some(e) = stream.next().await { +/// println!("{e}"); +/// } +/// }); +///``` #[pin_project] pub struct OneSidedStream { #[pin] diff --git a/src/array/local_lock_atomic.rs b/src/array/local_lock_atomic.rs index 89210db3..cf127730 100644 --- a/src/array/local_lock_atomic.rs +++ b/src/array/local_lock_atomic.rs @@ -1,5 +1,6 @@ mod iteration; -mod local_chunks; +pub(crate) mod local_chunks; +pub use local_chunks::{LocalLockLocalChunks, LocalLockLocalChunksMut}; pub(crate) mod operations; mod rdma; use crate::array::private::LamellarArrayPrivate; @@ -224,6 +225,7 @@ impl Deref for LocalLockLocalData { } } +/// Captures a read lock on the array, allowing immutable access to the underlying data #[derive(Clone)] pub struct LocalLockReadGuard { pub(crate) array: LocalLockArray, @@ -231,6 +233,7 @@ pub struct LocalLockReadGuard { } impl LocalLockReadGuard { + /// Access the underlying local data immutably through the read lock pub fn local_data(&self) -> LocalLockLocalData { LocalLockLocalData { array: self.array.clone(), @@ -242,6 +245,7 @@ impl LocalLockReadGuard { } } +/// Captures a write lock on the array, allowing mutable access to the underlying data pub struct LocalLockWriteGuard { pub(crate) array: LocalLockArray, lock_guard: RwLockWriteGuardArc<()>, @@ -257,6 +261,7 @@ impl From> for LocalLockWriteGuard { } impl LocalLockWriteGuard { + /// Access the underlying local data mutably through the write lock pub fn local_data(self) -> LocalLockMutLocalData { LocalLockMutLocalData { array: self.array.clone(), @@ -319,6 +324,26 @@ impl LocalLockArray { } } + #[doc(alias("One-sided", "onesided"))] + /// Return the calling PE's local read lock guard + /// + /// this function will block the thread until the lock is acquired + /// Calling within an asynchronous block may lead to deadlock, use [read_lock](self::LocalLockArray::read_lock) instead. 
+ /// + /// # One-sided Operation + /// Only explictly requires the calling PE + /// + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// let array:LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// + /// let read_lock = array.blocking_read_lock(); + /// //do interesting work + /// pub fn blocking_read_lock(&self) -> LocalLockReadGuard { let self_clone: LocalLockArray = self.clone(); self.block_on(async move { @@ -329,6 +354,26 @@ impl LocalLockArray { }) } + #[doc(alias("One-sided", "onesided"))] + /// Return the calling PE's local read lock + /// + /// this function will block the calling task until the lock is acquired (but not the calling thread) + /// + /// # One-sided Operation + /// Only explictly requires the calling PE + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// + /// world.block_on(async move { + /// let read_lock = array.read_lock().await; + /// //do interesting work + /// }); + ///``` pub async fn read_lock(&self) -> LocalLockReadGuard { LocalLockReadGuard { array: self.clone(), @@ -336,6 +381,26 @@ impl LocalLockArray { } } + #[doc(alias("One-sided", "onesided"))] + /// Return the calling PE's local write lock guard + /// + /// this function will block the thread until the lock is acquired + /// Calling within an asynchronous block may lead to deadlock, use [write_lock](self::LocalLockArray::write_lock) instead. + /// + /// # One-sided Operation + /// Only explictly requires the calling PE + /// + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// let array:LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// + /// let write_lock = array.blocking_write_lock(); + /// //do interesting work + /// pub fn blocking_write_lock(&self) -> LocalLockWriteGuard { let self_clone: LocalLockArray = self.clone(); self.block_on(async move { @@ -346,6 +411,26 @@ impl LocalLockArray { }) } + #[doc(alias("One-sided", "onesided"))] + /// Return the calling PE's local write lock + /// + /// this function will block the calling task until the lock is acquired (but not the calling thread) + /// + /// # One-sided Operation + /// Only explictly requires the calling PE + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// + /// world.block_on(async move { + /// let write_lock = array.write_lock().await; + /// //do interesting work + /// }); + ///``` pub async fn write_lock(&self) -> LocalLockWriteGuard { LocalLockWriteGuard { array: self.clone(), @@ -629,9 +714,9 @@ impl LocalLockArray { self.array.into() } - pub fn async_barrier(&self) -> impl std::future::Future + Send + '_ { - self.array.async_barrier() - } + // pub(crate) fn async_barrier(&self) -> impl std::future::Future + Send + '_ { + // self.array.async_barrier() + // } } impl LocalLockArray { @@ -924,7 +1009,7 @@ impl ArrayPrint for LocalLockArray { } } -#[doc(hidden)] +//#[doc(hidden)] #[pin_project] pub struct LocalLockArrayReduceHandle { req: AmHandle>, @@ -955,6 +1040,35 
@@ impl Future for LocalLockArrayReduceHandle { } impl LocalLockReadGuard { + #[doc(alias("One-sided", "onesided"))] + /// Perform a reduction on the entire distributed array, returning the value to the calling PE. + /// + /// Please see the documentation for the [register_reduction] procedural macro for + /// more details and examples on how to create your own reductions. + /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Reduce` active messages on the other PEs associated with the array. + /// the returned reduction result is only available on the calling PE + /// + /// # Safety + /// the local read lock ensures atomicity of only the local portion of the array, i.e. elements on a PE won't change while the operation is being executed on that PE + /// Atomicity of data on remote PEs is only guaranteed while the remote operation is executing on the remote PE (once it has captured that PE's local lock). + /// Remote data can change before and after the overall operation has completed. + /// + /// Converting to a [ReadOnlyArray] or [GlobalLockArray] before the reduction is a straightforward workaround to ensure the data is not changing during the reduction. + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// use rand::Rng; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = LocalLockArray::<usize>::new(&world,10,Distribution::Block); + /// array.block_on(array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2))); + /// let read_guard = array.blocking_read_lock(); + /// let prod = array.block_on(read_guard.reduce("prod")); + ///``` pub fn reduce(self, op: &str) -> LocalLockArrayReduceHandle { LocalLockArrayReduceHandle { req: self.array.array.reduce_data(op, self.array.clone().into()), @@ -963,17 +1077,117 @@ impl LocalLockReadGuard { } } impl LocalLockReadGuard { + #[doc(alias("One-sided", "onesided"))] + /// Perform a sum reduction on the entire distributed array, returning the value to the calling PE. + /// + /// This is equivalent to `reduce("sum")`. + /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Sum` active messages on the other PEs associated with the array. + /// the returned sum reduction result is only available on the calling PE + /// + /// # Safety + /// the local read lock ensures atomicity of only the local portion of the array, i.e. elements on a PE won't change while the operation is being executed on that PE + /// Atomicity of data on remote PEs is only guaranteed while the remote operation is executing on the remote PE (once it has captured that PE's local lock). + /// Remote data can change before and after the overall operation has completed. + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// use rand::Rng; + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = LocalLockArray::<usize>::new(&world,10,Distribution::Block); + /// array.block_on(array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2))); + /// let read_guard = array.blocking_read_lock(); + /// let sum = array.block_on(read_guard.sum()); + /// ``` pub fn sum(self) -> LocalLockArrayReduceHandle { self.reduce("sum") } + #[doc(alias("One-sided", "onesided"))] + /// Perform a product reduction on the entire distributed array, returning the value to the calling PE. + /// + /// This is equivalent to `reduce("prod")`.
+ /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Prod` active messages on the other PEs associated with the array. + /// the returned prod reduction result is only available on the calling PE + /// + /// # Safety + /// the local read lock ensures atomicity of only the local portion of the array, i.e. elements on a PE won't change while the operation is being executed on that PE + /// Atomicity of data on remote PEs is only guaranteed while the remote operation is executing on the remote PE (once it has captured that PE's local lock). + /// Remote data can change before and after the overall operation has completed. + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = LocalLockArray::<usize>::new(&world,10,Distribution::Block); + /// array.block_on(array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2))); + /// let read_guard = array.blocking_read_lock(); + /// let prod = array.block_on(read_guard.prod()); + /// assert_eq!((1..=array.len()).product::<usize>(),prod); + ///``` pub fn prod(self) -> LocalLockArrayReduceHandle { self.reduce("prod") } } impl LocalLockReadGuard { + #[doc(alias("One-sided", "onesided"))] + /// Find the max element in the entire distributed array, returning it to the calling PE + /// + /// This is equivalent to `reduce("max")`. + /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Max` active messages on the other PEs associated with the array. + /// the returned max reduction result is only available on the calling PE + /// + /// # Safety + /// the local read lock ensures atomicity of only the local portion of the array, i.e. elements on a PE won't change while the operation is being executed on that PE + /// Atomicity of data on remote PEs is only guaranteed while the remote operation is executing on the remote PE (once it has captured that PE's local lock). + /// Remote data can change before and after the overall operation has completed. + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = LocalLockArray::<usize>::new(&world,10,Distribution::Block); + /// array.block_on(array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2))); + /// let read_guard = array.blocking_read_lock(); + /// let max = array.block_on(read_guard.max()); + /// assert_eq!((array.len()-1)*2,max); + ///``` pub fn max(self) -> LocalLockArrayReduceHandle { self.reduce("max") } + #[doc(alias("One-sided", "onesided"))] + /// Find the min element in the entire distributed array, returning it to the calling PE + /// + /// This is equivalent to `reduce("min")`. + /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Min` active messages on the other PEs associated with the array. + /// the returned min reduction result is only available on the calling PE + /// + /// # Safety + /// the local read lock ensures atomicity of only the local portion of the array, i.e. elements on a PE won't change while the operation is being executed on that PE + /// Atomicity of data on remote PEs is only guaranteed while the remote operation is executing on the remote PE (once it has captured that PE's local lock). + /// Remote data can change before and after the overall operation has completed.
+ /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = LocalLockArray::<usize>::new(&world,10,Distribution::Block); + /// array.block_on(array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2))); + /// let read_guard = array.blocking_read_lock(); + /// let min = array.block_on(read_guard.min()); + /// assert_eq!(0,min); + ///``` pub fn min(self) -> LocalLockArrayReduceHandle { self.reduce("min") } diff --git a/src/array/local_lock_atomic/iteration.rs b/src/array/local_lock_atomic/iteration.rs index 8cf460e9..23b249e7 100644 --- a/src/array/local_lock_atomic/iteration.rs +++ b/src/array/local_lock_atomic/iteration.rs @@ -14,7 +14,7 @@ use crate::memregion::Dist; // }; use async_lock::{RwLockReadGuardArc, RwLockWriteGuardArc}; -#[doc(hidden)] +//#[doc(hidden)] #[derive(Clone)] pub struct LocalLockDistIter<'a, T: Dist> { data: LocalLockArray, @@ -48,7 +48,7 @@ impl<'a, T: Dist> std::fmt::Debug for LocalLockDistIter<'a, T> { } } -#[doc(hidden)] +//#[doc(hidden)] #[derive(Clone)] pub struct LocalLockLocalIter<'a, T: Dist> { data: LocalLockArray, diff --git a/src/array/local_lock_atomic/local_chunks.rs b/src/array/local_lock_atomic/local_chunks.rs index 3a521028..5d4328e5 100644 --- a/src/array/local_lock_atomic/local_chunks.rs +++ b/src/array/local_lock_atomic/local_chunks.rs @@ -6,6 +6,8 @@ use crate::memregion::Dist; use std::sync::Arc; +/// An iterator over immutable (nonoverlapping) local chunks (of size chunk_size) of a [LocalLockArray] +/// This struct is created by calling [LocalLockArray::read_local_chunks] or [LocalLockArray::blocking_read_local_chunks] #[derive(Clone)] pub struct LocalLockLocalChunks { chunk_size: usize, @@ -29,6 +31,8 @@ impl IterClone for LocalLockLocalChunks { } } +/// An iterator over mutable (nonoverlapping) local chunks (of size chunk_size) of a [LocalLockArray] +/// This struct is created by calling [LocalLockArray::write_local_chunks] or [LocalLockArray::blocking_write_local_chunks] pub struct LocalLockLocalChunksMut { // data: &'a mut [T], chunk_size: usize, @@ -211,6 +215,24 @@ impl IndexedLocalIterator for LocalLockLocalChunksMut { } impl LocalLockArray { + /// immutably iterate over fixed sized chunks(slices) of the local data of this array. + /// the returned iterator is a lamellar [LocalIterator] and also captures a read lock on the local data. + /// This call will block the calling task until a read lock is acquired. + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array: LocalLockArray<usize> = LocalLockArray::new(&world,40,Distribution::Block); + /// let my_pe = world.my_pe(); + /// world.block_on(async move { + /// array.read_local_chunks(5).await.enumerate().for_each(move|(i,chunk)| { + /// println!("PE: {my_pe} i: {i} chunk: {chunk:?}"); + /// }); + /// array.await_all().await; + /// }); + /// ``` pub async fn read_local_chunks(&self, chunk_size: usize) -> LocalLockLocalChunks { let lock = Arc::new(self.lock.read().await); LocalLockLocalChunks { @@ -223,6 +245,25 @@ impl LocalLockArray { } } + /// immutably iterate over fixed sized chunks(slices) of the local data of this array. + /// the returned iterator is a lamellar [LocalIterator] and also captures a read lock on the local data. + /// This call will block the calling thread until a read lock is acquired.
+ /// Calling within an asynchronous block may lead to deadlock, use [read_lock](self::LocalLockArray::read_local_chunks) instead. + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array: LocalLockArray = LocalLockArray::new(&world,40,Distribution::Block); + /// let my_pe = world.my_pe(); + /// + /// array.blocking_read_local_chunks(5).enumerate().for_each(move|(i,chunk)| { + /// println!("PE: {my_pe} i: {i} chunk: {chunk:?}"); + /// }); + /// array.wait_all(); + /// + /// ``` pub fn blocking_read_local_chunks(&self, chunk_size: usize) -> LocalLockLocalChunks { let lock = Arc::new(self.array.block_on(self.lock.read())); LocalLockLocalChunks { @@ -235,6 +276,24 @@ impl LocalLockArray { } } + /// mutably iterate over fixed sized chunks(slices) of the local data of this array. + /// the returned iterator is a lamellar [LocalIterator] and also captures the write lock on the local data. + /// This call will block the calling task until the write lock is acquired. + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array: LocalLockArray = LocalLockArray::new(&world,40,Distribution::Block); + /// let my_pe = world.my_pe(); + /// world.block_on(async move { + /// array.write_local_chunks(5).await.enumerate().for_each(move|(i,chunk)| { + /// println!("PE: {my_pe} i: {i} chunk: {chunk:?}"); + /// }); + /// array.await_all().await; + /// }); + /// ``` pub async fn write_local_chunks(&self, chunk_size: usize) -> LocalLockLocalChunksMut { let lock = Arc::new(self.lock.write().await); LocalLockLocalChunksMut { @@ -247,6 +306,25 @@ impl LocalLockArray { } } + /// mutably iterate over fixed sized chunks(slices) of the local data of this array. + /// the returned iterator is a lamellar [LocalIterator] and also captures the write lock on the local data. + /// This call will block the calling thread until the write lock is acquired. + /// Calling within an asynchronous block may lead to deadlock, use [write_lock](self::LocalLockArray::write_local_chunks) instead. + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array: LocalLockArray = LocalLockArray::new(&world,40,Distribution::Block); + /// let my_pe = world.my_pe(); + /// + /// array.blocking_write_local_chunks(5).enumerate().for_each(move|(i,chunk)| { + /// println!("PE: {my_pe} i: {i} chunk: {chunk:?}"); + /// }); + /// array.wait_all(); + /// + /// ``` pub fn blocking_write_local_chunks(&self, chunk_size: usize) -> LocalLockLocalChunksMut { let lock = Arc::new(self.array.block_on(self.lock.write())); LocalLockLocalChunksMut { diff --git a/src/array/native_atomic.rs b/src/array/native_atomic.rs index 1b971f85..065fa127 100644 --- a/src/array/native_atomic.rs +++ b/src/array/native_atomic.rs @@ -15,36 +15,10 @@ use std::ops::{ AddAssign, BitAndAssign, BitOrAssign, BitXorAssign, DivAssign, MulAssign, RemAssign, ShlAssign, ShrAssign, SubAssign, }; -// use std::ops::{Deref, DerefMut}; - -#[doc(hidden)] -pub trait NativeAtomic {} - -#[doc(hidden)] -pub trait AsNativeAtomic { - type Atomic; - fn as_native_atomic(&self) -> &Self::Atomic; -} macro_rules! impl_atomic_ops{ { $A:ty, $B:ty , $C:ident} => { - impl NativeAtomic for $A{} - impl AsNativeAtomic for $A { - // there is an equivalent call in nightly rust - // Self::Atomic::from_mut()... 
we will switch to that once stablized; - type Atomic = $B; - fn as_native_atomic(&self) -> &Self::Atomic{ - use std::mem::align_of; - let [] = [(); align_of::<$B>() - align_of::<$A>()]; - // SAFETY: - // - the mutable reference guarantees unique ownership. - // - the alignment of `$int_type` and `Self` is the - // same, as promised by $cfg_align and verified above. - unsafe { &*(self as *const $A as *mut $A as *mut Self::Atomic) } - } - } - #[doc(hidden)] - pub struct $C<'a>(pub &'a $B); + pub(crate) struct $C<'a>(pub(crate) &'a $B); impl AddAssign<$A> for $C<'_>{ fn add_assign(&mut self, val: $A) { self.0.fetch_add(val,Ordering::SeqCst); @@ -637,7 +611,7 @@ macro_rules! impl_compare_exchange_eps { }; } -#[doc(hidden)] +//#[doc(hidden)] pub struct NativeAtomicElement { array: NativeAtomicArray, local_index: usize, @@ -990,9 +964,9 @@ impl NativeAtomicArray { self.array.into() } - pub fn async_barrier(&self) -> impl std::future::Future + Send + '_ { - self.array.async_barrier() - } + // pub(crate) fn async_barrier(&self) -> impl std::future::Future + Send + '_ { + // self.array.async_barrier() + // } } impl TeamFrom<(Vec, Distribution)> for NativeAtomicArray { @@ -1012,7 +986,7 @@ impl AsyncTeamFrom<(Vec, Distribution)> for NativeAtomicA } } -#[doc(hidden)] +//#[doc(hidden)] impl From> for NativeAtomicArray { fn from(array: UnsafeArray) -> Self { // println!("native from unsafe"); @@ -1025,7 +999,7 @@ impl From> for NativeAtomicArray { } } -#[doc(hidden)] +//#[doc(hidden)] #[async_trait] impl AsyncFrom> for NativeAtomicArray { async fn async_from(array: UnsafeArray) -> Self { @@ -1041,7 +1015,7 @@ impl AsyncFrom> for NativeAtomicArray { } } -#[doc(hidden)] +//#[doc(hidden)] impl From> for NativeAtomicByteArray { fn from(array: NativeAtomicArray) -> Self { NativeAtomicByteArray { @@ -1051,7 +1025,7 @@ impl From> for NativeAtomicByteArray { } } -#[doc(hidden)] +//#[doc(hidden)] impl From> for LamellarByteArray { fn from(array: NativeAtomicArray) -> Self { LamellarByteArray::NativeAtomicArray(NativeAtomicByteArray { @@ -1061,7 +1035,7 @@ impl From> for LamellarByteArray { } } -#[doc(hidden)] +//#[doc(hidden)] impl From for NativeAtomicArray { fn from(array: LamellarByteArray) -> Self { if let LamellarByteArray::NativeAtomicArray(array) = array { @@ -1072,7 +1046,7 @@ impl From for NativeAtomicArray { } } -#[doc(hidden)] +//#[doc(hidden)] impl From> for AtomicByteArray { fn from(array: NativeAtomicArray) -> Self { AtomicByteArray::NativeAtomicByteArray(NativeAtomicByteArray { @@ -1082,7 +1056,7 @@ impl From> for AtomicByteArray { } } -#[doc(hidden)] +//#[doc(hidden)] impl From for NativeAtomicArray { fn from(array: NativeAtomicByteArray) -> Self { NativeAtomicArray { @@ -1092,7 +1066,7 @@ impl From for NativeAtomicArray { } } -#[doc(hidden)] +//#[doc(hidden)] impl From for AtomicArray { fn from(array: NativeAtomicByteArray) -> Self { NativeAtomicArray { @@ -1103,7 +1077,7 @@ impl From for AtomicArray { } } -// #[doc(hidden)] +// //#[doc(hidden)] impl private::ArrayExecAm for NativeAtomicArray { fn team(&self) -> Pin> { self.array.team_rt().clone() @@ -1113,7 +1087,7 @@ impl private::ArrayExecAm for NativeAtomicArray { } } -#[doc(hidden)] +//#[doc(hidden)] impl private::LamellarArrayPrivate for NativeAtomicArray { fn inner_array(&self) -> &UnsafeArray { &self.array @@ -1138,7 +1112,7 @@ impl private::LamellarArrayPrivate for NativeAtomicArray { } } -#[doc(hidden)] +//#[doc(hidden)] impl LamellarArray for NativeAtomicArray { fn team_rt(&self) -> Pin> { self.array.team_rt().clone() @@ -1197,13 
+1171,13 @@ impl LamellarEnv for NativeAtomicArray { } } -#[doc(hidden)] +//#[doc(hidden)] impl LamellarWrite for NativeAtomicArray {} -#[doc(hidden)] +//#[doc(hidden)] impl LamellarRead for NativeAtomicArray {} -#[doc(hidden)] +//#[doc(hidden)] impl SubArray for NativeAtomicArray { type Array = NativeAtomicArray; fn sub_array>(&self, range: R) -> Self::Array { @@ -1217,14 +1191,15 @@ impl SubArray for NativeAtomicArray { } } -#[doc(hidden)] +//#[doc(hidden)] impl NativeAtomicArray { + #[doc(hidden)] pub fn print(&self) { self.array.print(); } } -#[doc(hidden)] +//#[doc(hidden)] impl ArrayPrint for NativeAtomicArray { fn print(&self) { self.array.print() @@ -1232,29 +1207,34 @@ impl ArrayPrint for NativeAtomicArray { } impl NativeAtomicArray { + #[doc(hidden)] pub fn reduce(&self, op: &str) -> AmHandle> { self.array.reduce_data(op, self.clone().into()) } } impl NativeAtomicArray { + #[doc(hidden)] pub fn sum(&self) -> AmHandle> { self.reduce("sum") } + #[doc(hidden)] pub fn prod(&self) -> AmHandle> { self.reduce("prod") } } impl NativeAtomicArray { + #[doc(hidden)] pub fn max(&self) -> AmHandle> { self.reduce("max") } + #[doc(hidden)] pub fn min(&self) -> AmHandle> { self.reduce("min") } } //for use within RDMA active messages to atomically read/write values -#[doc(hidden)] +//#[doc(hidden)] #[derive(serde::Serialize, serde::Deserialize, Clone, Copy, Debug)] pub enum NativeAtomicType { I8, @@ -1269,7 +1249,7 @@ pub enum NativeAtomicType { Usize, } -#[doc(hidden)] +//#[doc(hidden)] impl NativeAtomicType { fn from() -> NativeAtomicType { let t = TypeId::of::(); diff --git a/src/array/native_atomic/iteration.rs b/src/array/native_atomic/iteration.rs index 7bb9d182..d76bab0d 100644 --- a/src/array/native_atomic/iteration.rs +++ b/src/array/native_atomic/iteration.rs @@ -12,7 +12,7 @@ use crate::memregion::Dist; // RawRwLock, // }; -#[doc(hidden)] +//#[doc(hidden)] #[derive(Clone)] pub struct NativeAtomicDistIter { data: NativeAtomicArray, @@ -42,7 +42,7 @@ impl std::fmt::Debug for NativeAtomicDistIter { } } -#[doc(hidden)] +//#[doc(hidden)] #[derive(Clone)] pub struct NativeAtomicLocalIter { data: NativeAtomicArray, diff --git a/src/array/operations.rs b/src/array/operations.rs index 37a1cc35..202aaa10 100644 --- a/src/array/operations.rs +++ b/src/array/operations.rs @@ -12,7 +12,9 @@ use crate::config; // use crate::LamellarTeamRT; pub(crate) mod handle; -pub use handle::{ArrayBatchOpHandle, ArrayFetchBatchOpHandle, ArrayResultBatchOpHandle}; +pub use handle::{ + ArrayBatchOpHandle, ArrayFetchBatchOpHandle, ArrayOpHandle, ArrayResultBatchOpHandle, +}; pub(crate) mod access; pub use access::{AccessOps, LocalAtomicOps}; pub(crate) mod arithmetic; @@ -37,9 +39,6 @@ pub use shift::{ElementShiftOps, LocalShiftOps, ShiftOps}; // use std::sync::Arc; use std::u8; -#[doc(hidden)] -pub static OPS_BUFFER_SIZE: usize = 10_000_000; - /// A marker trait for types that can be used as an array /// Users should not implement this directly, rather they should use the [trait@ArrayOps] derive macro /// by passing it as an argument to the [macro@crate::active_messaging::AmData] attribute macro to automatically derive this trait. 
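[Editor's note] A minimal sketch, not part of the patch, of the usage the ArrayOps doc comment above describes: deriving the trait by passing `ArrayOps` as an argument to the `AmData` attribute macro. The struct, its fields, and the exact list of additional derives are illustrative assumptions.
```rust
use lamellar::array::prelude::*;

// ArrayOps is derived by listing it in the AmData attribute, per the doc comment above.
#[lamellar::AmData(ArrayOps, Default, PartialEq, Copy, Clone)]
struct Point {
    x: f32,
    y: f32,
}

let world = LamellarWorldBuilder::new().build();
// With ArrayOps derived, Point can be used as the element type of a Lamellar array.
let array = UnsafeArray::<Point>::new(&world, 64, Distribution::Block);
```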
@@ -454,7 +453,7 @@ impl<'a, T: Dist> OpInput<'a, T> for &'a [T] { let len = self.len(); let mut iters = vec![]; if len == 0 { - return (iters,len) + return (iters, len); } let num = if len < 1000 { 1 @@ -600,7 +599,7 @@ impl<'a, T: Dist> OpInput<'a, T> for Vec { // println!("vec as op input"); let len = self.len(); if len == 0 { - return (vec![], len) + return (vec![], len); } let num = if len < 1000 { 1 diff --git a/src/array/operations/bitwise.rs b/src/array/operations/bitwise.rs index 7e78d679..8bae2eda 100644 --- a/src/array/operations/bitwise.rs +++ b/src/array/operations/bitwise.rs @@ -13,7 +13,7 @@ pub trait ElementBitWiseOps: { } -// #[doc(hidden)] +// //#[doc(hidden)] // impl ElementBitWiseOps for T where // T: std::ops::BitAndAssign + std::ops::BitOrAssign + std::ops::BitXorAssign + Dist //+ AmDist // { diff --git a/src/array/operations/handle.rs b/src/array/operations/handle.rs index 855cbe89..27f01e3d 100644 --- a/src/array/operations/handle.rs +++ b/src/array/operations/handle.rs @@ -13,11 +13,13 @@ use std::{ use pin_project::pin_project; +/// a task handle for a batched array operation that doesnt return any values pub struct ArrayBatchOpHandle { pub(crate) _array: LamellarByteArray, //prevents prematurely performing a local drop pub(crate) reqs: VecDeque<(AmHandle<()>, Vec)>, } +/// a task handle for a single array operation that doesnt return any values pub type ArrayOpHandle = ArrayBatchOpHandle; impl LamellarRequest for ArrayBatchOpHandle { @@ -53,13 +55,14 @@ impl Future for ArrayBatchOpHandle { } } +/// a task handle for a single array operation that returns a value pub struct ArrayFetchOpHandle { pub(crate) _array: LamellarByteArray, //prevents prematurely performing a local drop pub(crate) req: AmHandle>, } impl LamellarRequest for ArrayFetchOpHandle { - fn blocking_wait(mut self) -> Self::Output { + fn blocking_wait(self) -> Self::Output { self.req .blocking_wait() .pop() @@ -83,6 +86,7 @@ impl Future for ArrayFetchOpHandle { } } +/// a task handle for a batched array operation that return values #[pin_project] pub struct ArrayFetchBatchOpHandle { pub(crate) _array: LamellarByteArray, //prevents prematurely performing a local drop @@ -168,6 +172,7 @@ impl Future for ArrayFetchBatchOpHandle { } } +/// a task handle for a single array operation that returns a result pub struct ArrayResultOpHandle { pub(crate) _array: LamellarByteArray, //prevents prematurely performing a local drop pub(crate) req: AmHandle>>, @@ -198,6 +203,7 @@ impl Future for ArrayResultOpHandle { } } +/// a task handle for a batched array operation that returns results #[pin_project] pub struct ArrayResultBatchOpHandle { pub(crate) _array: LamellarByteArray, //prevents prematurely performing a local drop diff --git a/src/array/operations/shift.rs b/src/array/operations/shift.rs index be61a1a0..d60673c7 100644 --- a/src/array/operations/shift.rs +++ b/src/array/operations/shift.rs @@ -8,7 +8,7 @@ use super::handle::{ /// - Right ```>>``` pub trait ElementShiftOps: std::ops::ShlAssign + std::ops::ShrAssign + Dist + Sized {} -// #[doc(hidden)] +// //#[doc(hidden)] // impl ElementShiftOps for T where T: std::ops::ShlAssign + std::ops::ShrAssign + Dist //+ AmDist,,, // { // } diff --git a/src/array/prelude.rs b/src/array/prelude.rs index 53e281a5..13075df6 100644 --- a/src/array/prelude.rs +++ b/src/array/prelude.rs @@ -5,7 +5,7 @@ pub use crate::array::local_lock_atomic::LocalLockArray; pub use crate::array::native_atomic::NativeAtomicArray; pub use crate::array::r#unsafe::UnsafeArray; pub use 
crate::array::read_only::ReadOnlyArray; -#[doc(hidden)] +//#[doc(hidden)] pub use crate::array::{ register_reduction, ArrayOps, @@ -36,11 +36,11 @@ pub use crate::array::operations::{ }; // pub use crate::array::operations::*; -#[doc(hidden)] +//#[doc(hidden)] pub use crate::active_messaging::ActiveMessaging; pub use crate::lamellar_arch::*; pub use crate::lamellar_team::LamellarTeam; -#[doc(hidden)] +//#[doc(hidden)] pub use crate::lamellar_team::LamellarTeamRT; pub use crate::lamellar_world::LamellarWorld; pub use crate::lamellar_world::LamellarWorldBuilder; diff --git a/src/array/read_only.rs b/src/array/read_only.rs index 81659d0d..9c58a99f 100644 --- a/src/array/read_only.rs +++ b/src/array/read_only.rs @@ -1,5 +1,6 @@ mod iteration; -mod local_chunks; +pub(crate) mod local_chunks; +pub use local_chunks::ReadOnlyLocalChunks; mod rdma; use crate::array::private::LamellarArrayPrivate; use crate::array::*; @@ -40,19 +41,19 @@ use std::sync::Arc; // } -// #[doc(hidden)] +// //#[doc(hidden)] // pub struct ReadOnlyArrayOpBuf { // pub id: TypeId, // pub op: BufFn, // } -// #[doc(hidden)] +// //#[doc(hidden)] // pub struct ReadOnlyArrayMultiMultiOps { // pub id: TypeId, // pub op: MultiMultiFn, // } -// #[doc(hidden)] +// //#[doc(hidden)] // pub struct ReadOnlyArrayMultiSingleOps { // pub id: TypeId, // pub op: MultiSingleFn, @@ -335,9 +336,9 @@ impl ReadOnlyArray { self.array.into() } - pub fn async_barrier(&self) -> impl std::future::Future + Send + '_ { - self.array.async_barrier() - } + // pub(crate) fn async_barrier(&self) -> impl std::future::Future + Send + '_ { + // self.array.async_barrier() + // } } impl ReadOnlyArray { @@ -489,22 +490,139 @@ impl From for ReadOnlyArray { } impl ReadOnlyArray { + #[doc(alias("One-sided", "onesided"))] + /// Perform a reduction on the entire distributed array, returning the value to the calling PE. + /// + /// Please see the documentation for the [register_reduction] procedural macro for + /// more details and examples on how to create your own reductions. + /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Reduce` active messages on the other PEs associated with the array. + /// the returned reduction result is only available on the calling PE + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// use rand::Rng; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = AtomicArray::::new(&world,1000000,Distribution::Block); + /// let array_clone = array.clone(); + /// let req = array.local_iter().for_each(move |_| { + /// let index = rand::thread_rng().gen_range(0..array_clone.len()); + /// array_clone.add(index,1); //randomly at one to an element in the array. + /// }); + /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE + /// let sum = array.block_on(array.reduce("sum")); // equivalent to calling array.sum() + /// assert_eq!(array.len()*num_pes,sum); + ///``` pub fn reduce(&self, op: &str) -> AmHandle> { self.array.reduce_data(op, self.clone().into()) } } impl ReadOnlyArray { + #[doc(alias("One-sided", "onesided"))] + /// Perform a sum reduction on the entire distributed array, returning the value to the calling PE. + /// + /// This equivalent to `reduce("sum")`. + /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Sum` active messages on the other PEs associated with the array. 
+ /// the returned sum reduction result is only available on the calling PE + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// use rand::Rng; + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = AtomicArray::::new(&world,1000000,Distribution::Block); + /// let array_clone = array.clone(); + /// let req = array.local_iter().for_each(move |_| { + /// let index = rand::thread_rng().gen_range(0..array_clone.len()); + /// array_clone.add(index,1); //randomly at one to an element in the array. + /// }); + /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE + /// let sum = array.block_on(array.sum()); + /// assert_eq!(array.len()*num_pes,sum); + /// ``` pub fn sum(&self) -> AmHandle> { self.reduce("sum") } + + #[doc(alias("One-sided", "onesided"))] + /// Perform a production reduction on the entire distributed array, returning the value to the calling PE. + /// + /// This equivalent to `reduce("prod")`. + /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Prod` active messages on the other PEs associated with the array. + /// the returned prod reduction result is only available on the calling PE + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = AtomicArray::::new(&world,10,Distribution::Block); + /// let req = array.dist_iter().enumerate().for_each(move |(i,elem)| { + /// elem.store(i+1); + /// }); + /// array.wait_all(); + /// array.barrier(); + /// let prod = array.block_on(array.prod()); + /// assert_eq!((1..=array.len()).product::(),prod); + ///``` pub fn prod(&self) -> AmHandle> { self.reduce("prod") } } impl ReadOnlyArray { + #[doc(alias("One-sided", "onesided"))] + /// Find the max element in the entire destributed array, returning to the calling PE + /// + /// This equivalent to `reduce("max")`. + /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Max` active messages on the other PEs associated with the array. + /// the returned max reduction result is only available on the calling PE + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = AtomicArray::::new(&world,10,Distribution::Block); + /// let req = array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2)); + /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE + /// let max = array.block_on(array.max()); + /// assert_eq!((array.len()-1)*2,max); + ///``` pub fn max(&self) -> AmHandle> { self.reduce("max") } + + #[doc(alias("One-sided", "onesided"))] + /// Find the min element in the entire destributed array, returning to the calling PE + /// + /// This equivalent to `reduce("min")`. + /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Min` active messages on the other PEs associated with the array. 
+ /// the returned min reduction result is only available on the calling PE + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = AtomicArray::::new(&world,10,Distribution::Block); + /// let req = array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2)); + /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE + /// let min = array.block_on(array.min()); + /// assert_eq!(0,min); + ///``` pub fn min(&self) -> AmHandle> { self.reduce("min") } diff --git a/src/array/read_only/local_chunks.rs b/src/array/read_only/local_chunks.rs index a1cb7444..3f3b098a 100644 --- a/src/array/read_only/local_chunks.rs +++ b/src/array/read_only/local_chunks.rs @@ -4,6 +4,8 @@ use crate::array::read_only::*; use crate::array::LamellarArray; use crate::memregion::Dist; +/// An iterator over immutable (nonoverlapping) local chunks (of size chunk_size) of an [ReadOnlyArray] +/// This struct is created by calling [ReadOnlyArray::local_chunks] #[derive(Clone)] pub struct ReadOnlyLocalChunks { chunk_size: usize, @@ -87,6 +89,23 @@ impl IndexedLocalIterator for ReadOnlyLocalChunks { } impl ReadOnlyArray { + /// immutably iterate over fixed sized chunks(slices) of the local data of this array. + /// the returned iterator is a lamellar [LocalIterator] + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,40,Distribution::Block); + /// let my_pe = world.my_pe(); + /// + /// array.local_chunks(5).enumerate().for_each(move|(i,chunk)| { + /// println!("PE: {my_pe} i: {i} chunk: {chunk:?}"); + /// }); + /// array.wait_all(); + /// + /// ``` pub fn local_chunks(&self, chunk_size: usize) -> ReadOnlyLocalChunks { ReadOnlyLocalChunks { chunk_size, diff --git a/src/array/unsafe.rs b/src/array/unsafe.rs index bb182938..846d7d09 100644 --- a/src/array/unsafe.rs +++ b/src/array/unsafe.rs @@ -1,6 +1,7 @@ mod iteration; -mod local_chunks; +pub(crate) mod local_chunks; +// pub use local_chunks::{}; pub(crate) mod operations; mod rdma; @@ -166,7 +167,7 @@ impl UnsafeArray { // AllocationType::Global, // ); // println!("new array {:?}",rmr_t.as_ptr()); - + unsafe { // for elem in rmr_t.as_mut_slice().expect("data should exist on pe") { // *elem = std::mem::zeroed(); @@ -176,11 +177,17 @@ impl UnsafeArray { // case one of the intermediate drops does a panic. // slice.iter_mut().for_each(write_zeroes); panic!("need drop not yet supported"); - } else { + } else { // Otherwise we can be really fast and just fill everthing with zeros. 
- let len = std::mem::size_of_val::<[T]>(rmr_t.as_mut_slice().expect("data should exist on pe")); - unsafe { std::ptr::write_bytes(rmr_t.as_mut_ptr().expect("data should exist on pe") as *mut u8, 0u8, len) } - } + let len = std::mem::size_of_val::<[T]>( + rmr_t.as_mut_slice().expect("data should exist on pe"), + ); + std::ptr::write_bytes( + rmr_t.as_mut_ptr().expect("data should exist on pe") as *mut u8, + 0u8, + len, + ) + } } let rmr = unsafe { rmr_t.to_base::() }; // println!("new array u8 {:?}",rmr.as_ptr()); @@ -258,9 +265,9 @@ impl UnsafeArray { // team.lamellae.clone(), // AllocationType::Global, // ); - + unsafe { - // for elem in rmr_t.as_mut_slice().expect("data should exist on pe") { + // for elem in rmr_t.as_mut_slice().expect("data should exist on pe") { // *elem = std::mem::zeroed(); // } if std::mem::needs_drop::() { @@ -268,11 +275,17 @@ impl UnsafeArray { // case one of the intermediate drops does a panic. // slice.iter_mut().for_each(write_zeroes); panic!("need drop not yet supported"); - } else { + } else { // Otherwise we can be really fast and just fill everthing with zeros. - let len = std::mem::size_of_val::<[T]>(rmr_t.as_mut_slice().expect("data should exist on pe")); - unsafe { std::ptr::write_bytes(rmr_t.as_mut_ptr().expect("data should exist on pe") as *mut u8, 0u8, len) } - } + let len = std::mem::size_of_val::<[T]>( + rmr_t.as_mut_slice().expect("data should exist on pe"), + ); + std::ptr::write_bytes( + rmr_t.as_mut_ptr().expect("data should exist on pe") as *mut u8, + 0u8, + len, + ) + } } let rmr = unsafe { rmr_t.to_base::() }; @@ -685,7 +698,7 @@ impl UnsafeArray { self.inner.data.team.tasking_barrier(); } - pub fn async_barrier(&self) -> impl std::future::Future + Send + '_ { + pub(crate) fn async_barrier(&self) -> impl std::future::Future + Send + '_ { self.inner.data.team.async_barrier() } } @@ -1353,7 +1366,7 @@ impl UnsafeArray { } impl UnsafeArrayInnerWeak { - pub fn upgrade(&self) -> Option { + pub(crate) fn upgrade(&self) -> Option { if let Some(data) = self.data.upgrade() { Some(UnsafeArrayInner { data: data, @@ -1455,7 +1468,7 @@ impl UnsafeArrayInner { //index relative to subarray, return offset relative to subarray // //#[tracing::instrument(skip_all)] - pub fn pe_full_offset_for_dist_index(&self, pe: usize, index: usize) -> Option { + pub(crate) fn pe_full_offset_for_dist_index(&self, pe: usize, index: usize) -> Option { let mut global_index = self.offset + index; match self.distribution { @@ -1486,7 +1499,7 @@ impl UnsafeArrayInner { } //index relative to subarray, return offset relative to subarray - pub fn pe_sub_offset_for_dist_index(&self, pe: usize, index: usize) -> Option { + pub(crate) fn pe_sub_offset_for_dist_index(&self, pe: usize, index: usize) -> Option { let offset = self.pe_full_offset_for_dist_index(pe, index)?; match self.distribution { Distribution::Block => { @@ -1510,7 +1523,7 @@ impl UnsafeArrayInner { //index is local with respect to subarray //returns local offset relative to full array // //#[tracing::instrument(skip_all)] - pub fn pe_full_offset_for_local_index(&self, pe: usize, index: usize) -> Option { + pub(crate) fn pe_full_offset_for_local_index(&self, pe: usize, index: usize) -> Option { let global_index = self.global_index_from_local(index)?; match self.distribution { Distribution::Block => { @@ -1650,7 +1663,7 @@ impl UnsafeArrayInner { pub(crate) fn global_start_index_for_pe(&self, pe: usize) -> usize { match self.distribution { Distribution::Block => { - let mut global_start = self.orig_elem_per_pe * pe; + 
let global_start = self.orig_elem_per_pe * pe; global_start + std::cmp::min(pe, self.orig_remaining_elems) } Distribution::Cyclic => pe, @@ -1706,6 +1719,7 @@ impl UnsafeArrayInner { } } + #[allow(dead_code)] pub(crate) fn global_end_index_for_pe(&self, pe: usize) -> usize { self.global_start_index_for_pe(pe) + self.num_elems_pe(pe) } diff --git a/src/array/unsafe/local_chunks.rs b/src/array/unsafe/local_chunks.rs index 46d7605e..d65a8a4f 100644 --- a/src/array/unsafe/local_chunks.rs +++ b/src/array/unsafe/local_chunks.rs @@ -4,7 +4,8 @@ use crate::array::r#unsafe::*; use crate::array::LamellarArray; use crate::memregion::Dist; - +/// An iterator over immutable (nonoverlapping) local chunks (of size chunk_size) of an [UnsafeArray] +/// This struct is created by calling [UnsafeArray::local_chunks] #[derive(Clone)] pub struct UnsafeLocalChunks { chunk_size: usize, @@ -24,6 +25,8 @@ impl IterClone for UnsafeLocalChunks { } } +/// An iterator over immutable (nonoverlapping) local chunks (of size chunk_size) of an [UnsafeArray] +/// This struct is created by calling [UnsafeArray::local_chunks_mut] #[derive(Clone)] pub struct UnsafeLocalChunksMut { chunk_size: usize, @@ -43,10 +46,8 @@ impl IterClone for UnsafeLocalChunksMut { } } - - impl LocalIterator for UnsafeLocalChunks { - type Item = &'static [T]; + type Item = &'static [T]; type Array = UnsafeArray; fn init(&self, start_i: usize, cnt: usize) -> Self { //these are with respect to the single elements, not chunk indexing and cnt @@ -79,10 +80,12 @@ impl LocalIterator for UnsafeLocalChunks { // "start_i {} end_i {} self.index {} self.end_index {}", // start_i, end_i, self.index, self.end_index // ); - Some(unsafe{std::slice::from_raw_parts_mut( - self.array.local_as_mut_ptr().offset(start_i as isize), - end_i - start_i, - )}) + Some(unsafe { + std::slice::from_raw_parts_mut( + self.array.local_as_mut_ptr().offset(start_i as isize), + end_i - start_i, + ) + }) } else { None } @@ -106,10 +109,8 @@ impl IndexedLocalIterator for UnsafeLocalChunks { } } - - impl LocalIterator for UnsafeLocalChunksMut { - type Item = &'static mut [T]; + type Item = &'static mut [T]; type Array = UnsafeArray; fn init(&self, start_i: usize, cnt: usize) -> Self { //these are with respect to the single elements, not chunk indexing and cnt @@ -142,10 +143,12 @@ impl LocalIterator for UnsafeLocalChunksMut { // "start_i {} end_i {} self.index {} self.end_index {}", // start_i, end_i, self.index, self.end_index // ); - Some(unsafe{std::slice::from_raw_parts_mut( - self.array.local_as_mut_ptr().offset(start_i as isize), - end_i - start_i, - )}) + Some(unsafe { + std::slice::from_raw_parts_mut( + self.array.local_as_mut_ptr().offset(start_i as isize), + end_i - start_i, + ) + }) } else { None } @@ -170,21 +173,55 @@ impl IndexedLocalIterator for UnsafeLocalChunksMut { } impl UnsafeArray { + /// immutably iterate over fixed sized chunks(slices) of the local data of this array. 
+    /// The returned iterator is a lamellar [LocalIterator]
+    ///
+    /// # Examples
+    ///```
+    /// use lamellar::array::prelude::*;
+    ///
+    /// let world = LamellarWorldBuilder::new().build();
+    /// let array: UnsafeArray = UnsafeArray::new(&world,40,Distribution::Block);
+    /// let my_pe = world.my_pe();
+    ///
+    /// array.local_chunks(5).enumerate().for_each(move|(i,chunk)| {
+    ///     println!("PE: {my_pe} i: {i} chunk: {chunk:?}");
+    /// });
+    /// array.wait_all();
+    ///
+    /// ```
     pub fn local_chunks(&self, chunk_size: usize) -> UnsafeLocalChunks {
         UnsafeLocalChunks {
             chunk_size,
             index: 0,
             end_index: 0,
-            array: self.clone()
+            array: self.clone(),
         }
     }
+    /// mutably iterate over fixed sized chunks(slices) of the local data of this array.
+    /// The returned iterator is a lamellar [LocalIterator]
+    ///
+    /// # Examples
+    ///```
+    /// use lamellar::array::prelude::*;
+    ///
+    /// let world = LamellarWorldBuilder::new().build();
+    /// let array: UnsafeArray = UnsafeArray::new(&world,40,Distribution::Block);
+    /// let my_pe = world.my_pe();
+    ///
+    /// array.local_chunks_mut(5).enumerate().for_each(move|(i,chunk)| {
+    ///     println!("PE: {my_pe} i: {i} chunk: {chunk:?}");
+    /// });
+    /// array.wait_all();
+    ///
+    /// ```
     pub fn local_chunks_mut(&self, chunk_size: usize) -> UnsafeLocalChunksMut {
         UnsafeLocalChunksMut {
             chunk_size,
             index: 0,
             end_index: 0,
-            array: self.clone()
+            array: self.clone(),
         }
     }
-}
\ No newline at end of file
+}
diff --git a/src/array/unsafe/operations.rs b/src/array/unsafe/operations.rs
index 515cfaa1..b7e9434b 100644
--- a/src/array/unsafe/operations.rs
+++ b/src/array/unsafe/operations.rs
@@ -8,7 +8,7 @@ use crate::AmHandle;
 use parking_lot::Mutex;
 use std::any::TypeId;
 use std::collections::{HashMap, VecDeque};
-use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
+use std::sync::atomic::{AtomicUsize, Ordering};
 use std::sync::Arc;
 type MultiValMultiIdxFn = fn(LamellarByteArray, ArrayOpCmd>, Vec, u8) -> LamellarArcAm;
@@ -84,6 +84,7 @@ impl IndexSize {
             IndexSize::Usize => 8,
         }
     }
+    #[allow(dead_code)]
     fn as_bytes(&self, val: &usize) -> &[u8] {
         match self {
             IndexSize::U8 => unsafe {
@@ -586,7 +587,7 @@ impl UnsafeArray {
         // We need this loop so that we ensure all the internal AMs have launched so calls like wait_all work properly
         while cnt.load(Ordering::SeqCst) < num_reqs {
             self.inner.data.team.scheduler.exec_task();
-        };
+        }
         let res = std::mem::take(&mut *futures.lock());
         res
     }
diff --git a/src/darc.rs b/src/darc.rs
index b2614497..ce768f70 100644
--- a/src/darc.rs
+++ b/src/darc.rs
@@ -1,5 +1,5 @@
-//! Distributed Atomic Reference Counter-- a distributed extension of an [`Arc`][std::sync::Arc] called a [Darc][crate::darc].
-//! The atomic reference counter, [`Arc`][std::sync::Arc], is a backbone of safe
+//! Distributed Atomic Reference Counter-- a distributed extension of an [`Arc`] called a [Darc][crate::darc].
+//! The atomic reference counter, [`Arc`], is a backbone of safe
 //! concurrent programming in Rust, and, in particular, *shared ownership*.
 //!
 //! The `Darc` provides a similar abstraction within a *distributed* environment.
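To make the `Darc` abstraction described above concrete, here is a minimal sketch of constructing one and updating its local payload; it is adapted from the doc examples added later in this patch, and the `AtomicUsize` payload and variable names are purely illustrative:

```rust
use lamellar::darc::prelude::*;
use std::sync::atomic::{AtomicUsize, Ordering};

fn main() {
    let world = LamellarWorldBuilder::new().build();
    // every PE participates in constructing the Darc; each PE holds its own local instance
    let counter = Darc::new(&world, AtomicUsize::new(0)).expect("calling PE is in the world team");
    counter.fetch_add(1, Ordering::SeqCst); // updates only the local instance on this PE
    world.barrier(); // wait for all PEs before inspecting or dropping the Darc
}
```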
@@ -67,7 +67,7 @@ use crate::lamellar_team::{IntoLamellarTeam, LamellarTeamRT};
 use crate::lamellar_world::LAMELLAES;
 use crate::{IdError, LamellarEnv, LamellarTeam};
-#[doc(hidden)]
+/// Prelude for the darc module
 pub mod prelude;
 pub(crate) mod local_rw_darc;
@@ -144,7 +144,7 @@ unsafe impl Sync for DarcInner {} //we cant create DarcInners without goin
 /// Distributed atomic reference counter
 ///
-/// The atomic reference counter, [`Arc`][std::sync::Arc], is a backbone of safe
+/// The atomic reference counter, [`Arc`], is a backbone of safe
 /// concurrent programming in Rust, and, in particular, *shared ownership*.
 ///
 /// The `Darc` provides a similar abstraction within a *distributed* environment.
@@ -236,7 +236,53 @@ impl<'de, T: 'static> Deserialize<'de> for Darc {
     }
 }
-#[doc(hidden)]
+//#[doc(hidden)]
+/// `WeakDarc` is a version of `Darc` that holds a non-owning reference to the managed object
+/// (similar to [`Weak`](std::sync::Weak)).
+/// The managed object can be accessed by calling [`upgrade`](WeakDarc::upgrade), which returns an ``Option<Darc<T>>``
+///
+/// A `WeakDarc` does not count toward ownership, thus it will not prevent the value stored in the allocation from being dropped,
+/// and it makes no guarantees itself about the value still being present, and thus can return `None` from `upgrade()`.
+/// Note that a `WeakDarc` does prevent the allocation itself from being deallocated.
+///
+/// The typical way to obtain a `WeakDarc` is to call [`Darc::downgrade`](Darc::downgrade).
+///
+/// # Examples
+///```
+/// use lamellar::active_messaging::prelude::*;
+/// use lamellar::darc::prelude::*;
+/// use std::sync::atomic::{AtomicUsize, Ordering};
+/// use std::sync::Arc;
+///
+/// #[lamellar::AmData(Clone)]
+/// struct DarcAm {
+///     counter: Darc, //each pe has a local atomicusize
+/// }
+///
+/// #[lamellar::am]
+/// impl LamellarAm for DarcAm {
+///     async fn exec(self) {
+///         self.counter.fetch_add(1, Ordering::SeqCst); //this only updates atomic on the executing pe
+///     }
+/// }
+///
+/// fn main(){
+///     let world = LamellarWorldBuilder::new().build();
+///     let my_pe = world.my_pe();
+///     let num_pes = world.num_pes();
+///     let darc_counter = Darc::new(&world, AtomicUsize::new(0)).unwrap();
+///     let weak = darc_counter.downgrade();
+///     match weak.upgrade(){
+///         Some(counter) => {
+///             counter.fetch_add(my_pe, Ordering::SeqCst);
+///         }
+///         None => {
+///             println!("counter is gone");
+///         }
+///     }
+/// }
+///```
+///
 #[derive(Debug)]
 pub struct WeakDarc {
     inner: *mut DarcInner,
@@ -246,6 +292,8 @@ unsafe impl Send for WeakDarc {}
 unsafe impl Sync for WeakDarc {}
 impl WeakDarc {
+    /// Attempts to upgrade the `WeakDarc` to a [Darc] if the inner value has not been dropped.
+    /// Returns `None` if the value has been dropped.
     pub fn upgrade(&self) -> Option> {
         let inner = unsafe { &*self.inner };
         inner.local_cnt.fetch_add(1, Ordering::SeqCst);
@@ -810,7 +858,8 @@ impl fmt::Debug for DarcInner {
 }
 impl Darc {
-    #[doc(hidden)]
+    //#[doc(hidden)]
+    /// Downgrade a `Darc` to a `WeakDarc`
     pub fn downgrade(the_darc: &Darc) -> WeakDarc {
         // println!("downgrading darc ");
         // the_darc.print();
diff --git a/src/darc/global_rw_darc.rs b/src/darc/global_rw_darc.rs
index ba3ec400..1c3e7aed 100644
--- a/src/darc/global_rw_darc.rs
+++ b/src/darc/global_rw_darc.rs
@@ -743,7 +743,7 @@ impl GlobalRwDarc {
     /// # One-sided Operation
     /// The calling PE is responsible for creating and transfering the active message which aquires the lock.
/// Once aqui - /// Do not use this function in an asynchronous context (i.e. a Lamellar Active message), instead use [GlobalRwDarc::async_read] + /// Do not use this function in an asynchronous context (i.e. a Lamellar Active message), instead use [GlobalRwDarc::read] /// /// # Examples ///``` @@ -797,7 +797,7 @@ impl GlobalRwDarc { /// Once aquired the lock will only be held by the calling PE (until it is dropped) /// /// # Note - /// Do not use this function in an asynchronous context (i.e. a Lamellar Active message), instead use [GlobalRwDarc::async_write] + /// Do not use this function in an asynchronous context (i.e. a Lamellar Active message), instead use [GlobalRwDarc::write] /// /// # Examples ///``` @@ -1055,7 +1055,7 @@ impl fmt::Display for GlobalRwDarc { } } -// #[doc(hidden)] +// //#[doc(hidden)] // pub fn globalrw_serialize(localrw: &GlobalRwDarc, s: S) -> Result // where // S: Serializer, @@ -1063,7 +1063,7 @@ impl fmt::Display for GlobalRwDarc { // __NetworkDarc::::from(&localrw.darc).serialize(s) // } -// #[doc(hidden)] +// //#[doc(hidden)] // pub fn globalrw_from_ndarc<'de, D, T>(deserializer: D) -> Result, D::Error> // where // D: Deserializer<'de>, diff --git a/src/darc/local_rw_darc.rs b/src/darc/local_rw_darc.rs index f6b4c9e3..978ada91 100644 --- a/src/darc/local_rw_darc.rs +++ b/src/darc/local_rw_darc.rs @@ -489,7 +489,7 @@ impl fmt::Display for LocalRwDarc { } } -// #[doc(hidden)] +// //#[doc(hidden)] // pub fn localrw_serialize(localrw: &LocalRwDarc, s: S) -> Result // where // S: Serializer, @@ -497,7 +497,7 @@ impl fmt::Display for LocalRwDarc { // __NetworkDarc::::from(&localrw.darc).serialize(s) // } -// #[doc(hidden)] +// //#[doc(hidden)] // pub fn localrw_from_ndarc<'de, D, T>(deserializer: D) -> Result, D::Error> // where // D: Deserializer<'de>, @@ -512,8 +512,11 @@ impl fmt::Display for LocalRwDarc { // Ok(rwdarc) // } -#[doc(hidden)] -pub fn localrw_serialize2(localrw: &Darc>>, s: S) -> Result +//#[doc(hidden)] +pub(crate) fn localrw_serialize2( + localrw: &Darc>>, + s: S, +) -> Result where S: Serializer, { @@ -523,8 +526,11 @@ where ndarc.serialize(s) } -#[doc(hidden)] -pub fn localrw_from_ndarc2<'de, D, T>(deserializer: D) -> Result>>, D::Error> +//#[doc(hidden)] +// #[allow(unreachable_pub)] +pub(crate) fn localrw_from_ndarc2<'de, D, T>( + deserializer: D, +) -> Result>>, D::Error> where D: Deserializer<'de>, { diff --git a/src/darc/prelude.rs b/src/darc/prelude.rs index 107c3092..e9ca88fa 100644 --- a/src/darc/prelude.rs +++ b/src/darc/prelude.rs @@ -1,15 +1,15 @@ pub use crate::darc::global_rw_darc::GlobalRwDarc; -// #[doc(hidden)] +// //#[doc(hidden)] // pub use crate::darc::global_rw_darc::{globalrw_from_ndarc, globalrw_serialize}; pub use crate::darc::local_rw_darc::LocalRwDarc; -// #[doc(hidden)] +// //#[doc(hidden)] // pub use crate::darc::local_rw_darc::{localrw_from_ndarc, localrw_serialize}; pub use crate::darc::Darc; pub use crate::active_messaging::ActiveMessaging; pub use crate::lamellar_arch::*; pub use crate::lamellar_team::LamellarTeam; -#[doc(hidden)] +//#[doc(hidden)] pub use crate::lamellar_team::LamellarTeamRT; pub use crate::lamellar_world::LamellarWorld; pub use crate::lamellar_world::LamellarWorldBuilder; diff --git a/src/env_var.rs b/src/env_var.rs index 640e80ab..dc7235be 100644 --- a/src/env_var.rs +++ b/src/env_var.rs @@ -6,7 +6,7 @@ fn default_deadlock_timeout() -> f64 { 600.0 } -fn default_op_batch() -> usize { +fn default_am_group_batch_size() -> usize { 10000 } @@ -47,7 +47,7 @@ pub enum HeapMode { } fn 
default_heap_mode() -> HeapMode { - HeapMode::Static + HeapMode::Dynamic } #[derive(Deserialize, Debug, PartialEq)] @@ -85,23 +85,38 @@ fn default_batch_am_size() -> usize { #[derive(Deserialize, Debug)] pub struct Config { + /// A general timeout in seconds for various operations which may indicate a deadlock, default: 600.0 seconds #[serde(default = "default_deadlock_timeout")] pub deadlock_timeout: f64, - #[serde(default = "default_op_batch")] - pub batch_op_size: usize, // am group batch size + + /// The maximum number of sub messages that will be sent in a single AMGroup Active Message, default: 10000 + #[serde(default = "default_am_group_batch_size")] + pub am_group_batch_size: usize, // am group batch size + + /// The dissemination factor for the n-way barrier, default: 2 #[serde(default = "default_dissemination_factor")] pub barrier_dissemination_factor: usize, - // #[serde(default=true)] + + /// flag used to print warnings when users call barriers on worker threads. Default: true pub barrier_warning: Option, + + /// The lamellae backend to use + /// rofi -- multi pe distributed execution, default if rofi feature is turned on + /// local -- single pe execution, default if rofi feature is turned off + /// shmem -- multi pe single node execution #[serde(default = "default_backend")] pub backend: String, //rofi,shmem,local + + /// The executor (thread scheduler) to use, default: 'lamellar' unless the tokio feature is turned on #[serde(default = "default_executor")] pub executor: String, //lamellar,tokio,async_std + + /// The batcher to use, default: 'simple' #[serde(default = "default_batcher")] pub batcher: String, #[serde(default = "default_threads")] pub threads: usize, - pub batch_op_threads: Option,//number of threads used to process array batch ops sending + pub batch_op_threads: Option, //number of threads used to process array batch ops sending pub heap_size: Option, #[serde(default = "default_heap_mode")] pub heap_mode: HeapMode, @@ -117,13 +132,11 @@ pub struct Config { pub batch_am_size: usize, //the threshold for an activemessage (in bytes) on whether it will be sent directly or aggregated } +/// Get the current Environment Variable configuration pub fn config() -> &'static Config { static CONFIG: OnceLock = OnceLock::new(); CONFIG.get_or_init(|| match envy::prefixed("LAMELLAR_").from_env::() { - Ok(config) => { - // println!("[LAMELLAR CONFIG]{config:?}"); - config - } + Ok(config) => config, Err(error) => panic!("{}", error), }) } diff --git a/src/lamellae.rs b/src/lamellae.rs index 2c7e249c..e8b70a95 100755 --- a/src/lamellae.rs +++ b/src/lamellae.rs @@ -39,12 +39,12 @@ lazy_static! 
{ serde::Serialize, serde::Deserialize, Debug, PartialEq, Eq, Ord, PartialOrd, Hash, Clone, Copy, )] pub enum Backend { - #[doc(hidden)] #[cfg(feature = "enable-rofi")] + /// The Rofi (Rust-OFI) backend -- intended for multi process and distributed environments Rofi, - #[doc(hidden)] + /// The Local backend -- intended for single process environments Local, - #[doc(hidden)] + /// The Shmem backend -- intended for multi process environments single node environments Shmem, } @@ -151,11 +151,11 @@ pub(crate) trait LamellaeInit { // #[async_trait] #[enum_dispatch] pub(crate) trait Ser { - fn serialize( - &self, - header: Option, - obj: &T, - ) -> Result; + // fn serialize( + // &self, + // header: Option, + // obj: &T, + // ) -> Result; fn serialize_header( &self, header: Option, @@ -182,7 +182,7 @@ pub(crate) trait LamellaeComm: LamellaeAM + LamellaeRDMA { fn backend(&self) -> Backend; #[allow(non_snake_case)] fn MB_sent(&self) -> f64; - fn print_stats(&self); + // fn print_stats(&self); fn shutdown(&self); fn force_shutdown(&self); fn force_deinit(&self); @@ -191,7 +191,6 @@ pub(crate) trait LamellaeComm: LamellaeAM + LamellaeRDMA { #[async_trait] #[enum_dispatch] pub(crate) trait LamellaeAM: Send { - async fn send_to_pe_async(&self, pe: usize, data: SerializedData); //should never send to self... this is short circuited before request is serialized in the active message layer async fn send_to_pes_async( &self, pe: Option, @@ -209,15 +208,15 @@ pub(crate) trait LamellaeRDMA: Send + Sync { fn get(&self, pe: usize, src: usize, dst: &mut [u8]); fn iget(&self, pe: usize, src: usize, dst: &mut [u8]); fn rt_alloc(&self, size: usize, align: usize) -> AllocResult; - fn rt_check_alloc(&self, size: usize, align: usize) -> bool; + // fn rt_check_alloc(&self, size: usize, align: usize) -> bool; fn rt_free(&self, addr: usize); fn alloc(&self, size: usize, alloc: AllocationType, align: usize) -> AllocResult; fn free(&self, addr: usize); fn base_addr(&self) -> usize; fn local_addr(&self, remote_pe: usize, remote_addr: usize) -> usize; fn remote_addr(&self, remote_pe: usize, local_addr: usize) -> usize; - fn occupied(&self) -> usize; - fn num_pool_allocs(&self) -> usize; + // fn occupied(&self) -> usize; + // fn num_pool_allocs(&self) -> usize; fn alloc_pool(&self, min_size: usize); } diff --git a/src/lamellae/comm.rs b/src/lamellae/comm.rs index 73de7fdf..70bbdb84 100644 --- a/src/lamellae/comm.rs +++ b/src/lamellae/comm.rs @@ -28,7 +28,7 @@ pub(crate) enum CmdQStatus { // const PANIC: u8 = 3; #[derive(Debug, Clone, Copy)] -pub enum AllocError { +pub(crate) enum AllocError { OutOfMemoryError(usize), IdError(usize), } @@ -52,7 +52,7 @@ pub(crate) type AllocResult = Result; #[cfg(feature = "enable-rofi")] #[derive(Debug, Clone, Copy)] -pub enum TxError { +pub(crate) enum TxError { GetError, } #[cfg(feature = "enable-rofi")] @@ -118,7 +118,7 @@ pub(crate) trait CommOps { fn put_all(&self, src_addr: &[T], dst_addr: usize); fn get(&self, pe: usize, src_addr: usize, dst_addr: &mut [T]); fn iget(&self, pe: usize, src_addr: usize, dst_addr: &mut [T]); - fn iget_relative(&self, pe: usize, src_addr: usize, dst_addr: &mut [T]); + // fn iget_relative(&self, pe: usize, src_addr: usize, dst_addr: &mut [T]); #[allow(non_snake_case)] fn MB_sent(&self) -> f64; fn force_shutdown(&self); diff --git a/src/lamellae/command_queues.rs b/src/lamellae/command_queues.rs index ac32e1f6..e3817d95 100644 --- a/src/lamellae/command_queues.rs +++ b/src/lamellae/command_queues.rs @@ -17,7 +17,7 @@ use std::sync::Arc; // const 
CMD_BUF_LEN: usize = 50000; // this is the number of slots for each PE // const NUM_REQ_SLOTS: usize = CMD_Q_LEN; // max requests at any given time -- probably have this be a multiple of num PES -const CMD_BUFS_PER_PE: usize = 2; +// const CMD_BUFS_PER_PE: usize = 2; // lazy_static! { // static ref CNTS: ThreadLocal = ThreadLocal::new(); @@ -841,7 +841,7 @@ impl InnerCQ { cmd.calc_hash(); for pe in 0..self.num_pes { if pe != self.my_pe { - println!("putting alloc cmd to pe {:?}", pe); + // println!("putting alloc cmd to pe {:?}", pe); self.comm.put(pe, cmd.as_bytes(), cmd.as_addr()); } } @@ -869,7 +869,7 @@ impl InnerCQ { cmd.calc_hash(); for pe in 0..self.num_pes { if pe != self.my_pe { - println!("putting clear cmd to pe {:?}", pe); + // println!("putting clear cmd to pe {:?}", pe); self.comm.put(pe, cmd.as_bytes(), cmd.as_addr()); } } @@ -1200,7 +1200,7 @@ pub(crate) struct CommandQueue { impl CommandQueue { //#[tracing::instrument(skip_all)] - pub fn new( + pub(crate) fn new( comm: Arc, my_pe: usize, num_pes: usize, @@ -1315,17 +1315,17 @@ impl CommandQueue { } //#[tracing::instrument(skip_all)] - pub fn send_alloc(&self, min_size: usize) { + pub(crate) fn send_alloc(&self, min_size: usize) { self.cq.send_alloc(min_size) } //#[tracing::instrument(skip_all)] - pub fn send_panic(&self) { + pub(crate) fn send_panic(&self) { self.cq.send_panic() } //#[tracing::instrument(skip_all)] - pub async fn send_data(&self, data: SerializedData, dst: usize) { + pub(crate) async fn send_data(&self, data: SerializedData, dst: usize) { match data { #[cfg(feature = "enable-rofi")] SerializedData::RofiData(ref data) => { @@ -1361,7 +1361,7 @@ impl CommandQueue { } //#[tracing::instrument(skip_all)] - pub async fn alloc_task(&self, scheduler: Arc) { + pub(crate) async fn alloc_task(&self, scheduler: Arc) { while scheduler.active() && self.active.load(Ordering::SeqCst) != CmdQStatus::Panic as u8 { self.cq.check_alloc(); async_std::task::yield_now().await; @@ -1370,7 +1370,7 @@ impl CommandQueue { } //#[tracing::instrument(skip_all)] - pub async fn panic_task(&self, scheduler: Arc) { + pub(crate) async fn panic_task(&self, scheduler: Arc) { let mut panic = false; while scheduler.active() && !panic { panic = self.cq.check_panic(); @@ -1386,7 +1386,7 @@ impl CommandQueue { } //#[tracing::instrument(skip_all)] - pub async fn recv_data(&self, scheduler: Arc, lamellae: Arc) { + pub(crate) async fn recv_data(&self, scheduler: Arc, lamellae: Arc) { let num_pes = lamellae.num_pes(); let my_pe = lamellae.my_pe(); // let mut timer= std::time::Instant::now(); @@ -1524,13 +1524,13 @@ impl CommandQueue { } //#[tracing::instrument(skip_all)] - pub fn tx_amount(&self) -> usize { + pub(crate) fn tx_amount(&self) -> usize { // println!("cq put: {:?} get {:?}",self.cq.put_amt.load(Ordering::SeqCst) ,self.cq.get_amt.load(Ordering::SeqCst)); self.cq.put_amt.load(Ordering::SeqCst) + self.cq.get_amt.load(Ordering::SeqCst) } //#[tracing::instrument(skip_all)] - pub fn mem_per_pe() -> usize { + pub(crate) fn mem_per_pe() -> usize { (config().cmd_buf_len * config().cmd_buf_cnt + 4) * std::mem::size_of::() } } diff --git a/src/lamellae/local_lamellae.rs b/src/lamellae/local_lamellae.rs index 5c166cc8..cc84b62b 100644 --- a/src/lamellae/local_lamellae.rs +++ b/src/lamellae/local_lamellae.rs @@ -68,13 +68,13 @@ impl Local { // #[async_trait] impl Ser for Local { - fn serialize( - &self, - _header: Option, - _obj: &T, - ) -> Result { - panic!("should not be serializing in local"); - } + // fn serialize( + // &self, + // _header: Option, 
+ // _obj: &T, + // ) -> Result { + // panic!("should not be serializing in local"); + // } fn serialize_header( &self, _header: Option, @@ -108,7 +108,7 @@ impl LamellaeComm for Local { fn MB_sent(&self) -> f64 { 0.0f64 } - fn print_stats(&self) {} + // fn print_stats(&self) {} fn shutdown(&self) {} fn force_shutdown(&self) {} fn force_deinit(&self) {} @@ -116,7 +116,6 @@ impl LamellaeComm for Local { #[async_trait] impl LamellaeAM for Local { - async fn send_to_pe_async(&self, _pe: usize, _data: SerializedData) {} async fn send_to_pes_async( &self, _pe: Option, @@ -178,9 +177,9 @@ impl LamellaeRDMA for Local { ); Ok(data_addr) } - fn rt_check_alloc(&self, _size: usize, _align: usize) -> bool { - true - } + // fn rt_check_alloc(&self, _size: usize, _align: usize) -> bool { + // true + // } fn rt_free(&self, addr: usize) { let mut allocs = self.allocs.lock(); @@ -226,12 +225,12 @@ impl LamellaeRDMA for Local { local_addr } //todo make this return a real value - fn occupied(&self) -> usize { - 0 - } - fn num_pool_allocs(&self) -> usize { - 1 - } + // fn occupied(&self) -> usize { + // 0 + // } + // fn num_pool_allocs(&self) -> usize { + // 1 + // } fn alloc_pool(&self, _min_size: usize) {} } diff --git a/src/lamellae/rofi/rofi_comm.rs b/src/lamellae/rofi/rofi_comm.rs index ff847f6c..616d4a93 100644 --- a/src/lamellae/rofi/rofi_comm.rs +++ b/src/lamellae/rofi/rofi_comm.rs @@ -571,52 +571,52 @@ impl CommOps for RofiComm { //src address is relative to rofi base addr //#[tracing::instrument(skip_all)] - fn iget_relative(&self, pe: usize, src_addr: usize, dst_addr: &mut [T]) { - //-> RofiReq { - // let mut req = RofiReq{ - // txids: Vec::new(), - // _drop_set: self.drop_set.clone(), - // _any_dropped: self.any_dropped.clone(), - // }; - if pe != self.my_pe { - // unsafe { - // let _lock = self.comm_mutex.write(); - // println!("[{:?}]-({:?}) iget_relative [{:?}] entry",self.my_pe,thread::current().id(),pe); - - match rofi_iget(*self.rofi_base_address.read() + src_addr, dst_addr, pe) { - //.expect("error in rofi get") - Err(_ret) => { - println!( - "[{:?}] Error in iget_relative from {:?} src_addr {:x} ({:x}) dst_addr {:?} base_addr {:x} size {:?}", - self.my_pe, - pe, - src_addr, - src_addr+*self.rofi_base_address.read() , - dst_addr.as_ptr(), - *self.rofi_base_address.read(), - dst_addr.len() - ); - panic!(); - } - Ok(_ret) => { - self.get_cnt.fetch_add(1, Ordering::SeqCst); - self.get_amt - .fetch_add(dst_addr.len() * std::mem::size_of::(), Ordering::SeqCst); - // if ret != 0{ - // req.txids.push(ret); - // } - } - } - // }; - } else { - unsafe { - std::ptr::copy(src_addr as *const T, dst_addr.as_mut_ptr(), dst_addr.len()); - } - } - // req - // println!("[{:?}]- gc: {:?} pc: {:?} iget_relative exit",self.my_pe,self.get_cnt.load(Ordering::SeqCst),self.put_cnt.load(Ordering::SeqCst)); - // println!("[{:?}]-({:?}) iget relative [{:?}] exit",self.my_pe,thread::current().id(),pe); - } + // fn iget_relative(&self, pe: usize, src_addr: usize, dst_addr: &mut [T]) { + // //-> RofiReq { + // // let mut req = RofiReq{ + // // txids: Vec::new(), + // // _drop_set: self.drop_set.clone(), + // // _any_dropped: self.any_dropped.clone(), + // // }; + // if pe != self.my_pe { + // // unsafe { + // // let _lock = self.comm_mutex.write(); + // // println!("[{:?}]-({:?}) iget_relative [{:?}] entry",self.my_pe,thread::current().id(),pe); + + // match rofi_iget(*self.rofi_base_address.read() + src_addr, dst_addr, pe) { + // //.expect("error in rofi get") + // Err(_ret) => { + // println!( + // "[{:?}] Error in 
iget_relative from {:?} src_addr {:x} ({:x}) dst_addr {:?} base_addr {:x} size {:?}", + // self.my_pe, + // pe, + // src_addr, + // src_addr+*self.rofi_base_address.read() , + // dst_addr.as_ptr(), + // *self.rofi_base_address.read(), + // dst_addr.len() + // ); + // panic!(); + // } + // Ok(_ret) => { + // self.get_cnt.fetch_add(1, Ordering::SeqCst); + // self.get_amt + // .fetch_add(dst_addr.len() * std::mem::size_of::(), Ordering::SeqCst); + // // if ret != 0{ + // // req.txids.push(ret); + // // } + // } + // } + // // }; + // } else { + // unsafe { + // std::ptr::copy(src_addr as *const T, dst_addr.as_mut_ptr(), dst_addr.len()); + // } + // } + // // req + // // println!("[{:?}]- gc: {:?} pc: {:?} iget_relative exit",self.my_pe,self.get_cnt.load(Ordering::SeqCst),self.put_cnt.load(Ordering::SeqCst)); + // // println!("[{:?}]-({:?}) iget relative [{:?}] exit",self.my_pe,thread::current().id(),pe); + // } fn force_shutdown(&self) { let _res = rofi_finit(); @@ -671,7 +671,7 @@ pub(crate) struct RofiData { impl RofiData { //#[tracing::instrument(skip_all)] - pub fn new(rofi_comm: Arc, size: usize) -> Result { + pub(crate) fn new(rofi_comm: Arc, size: usize) -> Result { let ref_cnt_size = std::mem::size_of::(); let alloc_size = size + ref_cnt_size; //+ std::mem::size_of::(); let relative_addr = rofi_comm.rt_alloc(alloc_size, std::mem::align_of::())?; diff --git a/src/lamellae/rofi_lamellae.rs b/src/lamellae/rofi_lamellae.rs index 5f0bd012..71b21b6a 100644 --- a/src/lamellae/rofi_lamellae.rs +++ b/src/lamellae/rofi_lamellae.rs @@ -3,8 +3,8 @@ use crate::lamellae::comm::{AllocResult, CmdQStatus, CommOps}; use crate::lamellae::command_queues::CommandQueue; use crate::lamellae::rofi::rofi_comm::{RofiComm, RofiData}; use crate::lamellae::{ - AllocationType, Backend, Comm, Des, Lamellae, LamellaeAM, LamellaeComm, LamellaeInit, - LamellaeRDMA, Ser, SerializeHeader, SerializedData, SerializedDataOps, SERIALIZE_HEADER_LEN, + AllocationType, Backend, Comm, Lamellae, LamellaeAM, LamellaeComm, LamellaeInit, LamellaeRDMA, + Ser, SerializeHeader, SerializedData, SerializedDataOps, SERIALIZE_HEADER_LEN, }; use crate::lamellar_arch::LamellarArchRT; use crate::scheduler::Scheduler; @@ -139,7 +139,7 @@ impl LamellaeComm for Rofi { self.rofi_comm.MB_sent() //+ self.cq.tx_amount() as f64 / 1_000_000.0 } - fn print_stats(&self) {} + // fn print_stats(&self) {} fn shutdown(&self) { // println!("Rofi Lamellae shuting down"); let _ = self.active.compare_exchange( @@ -172,10 +172,6 @@ impl LamellaeComm for Rofi { #[async_trait] impl LamellaeAM for Rofi { - async fn send_to_pe_async(&self, pe: usize, data: SerializedData) { - self.cq.send_data(data, pe).await; - } //should never send to self... this is short circuited before request is serialized in the active message layer - async fn send_to_pes_async( &self, pe: Option, @@ -196,19 +192,19 @@ impl LamellaeAM for Rofi { } impl Ser for Rofi { - fn serialize( - &self, - header: Option, - obj: &T, - ) -> Result { - let header_size = *SERIALIZE_HEADER_LEN; - // let data_size = bincode::serialized_size(obj)? 
as usize; - let data_size = crate::serialized_size(obj, true) as usize; - let ser_data = RofiData::new(self.rofi_comm.clone(), header_size + data_size)?; - crate::serialize_into(ser_data.header_as_bytes(), &header, false)?; //we want header to be a fixed size - crate::serialize_into(ser_data.data_as_bytes(), obj, true)?; - Ok(SerializedData::RofiData(ser_data)) - } + // fn serialize( + // &self, + // header: Option, + // obj: &T, + // ) -> Result { + // let header_size = *SERIALIZE_HEADER_LEN; + // // let data_size = bincode::serialized_size(obj)? as usize; + // let data_size = crate::serialized_size(obj, true) as usize; + // let ser_data = RofiData::new(self.rofi_comm.clone(), header_size + data_size)?; + // crate::serialize_into(ser_data.header_as_bytes(), &header, false)?; //we want header to be a fixed size + // crate::serialize_into(ser_data.data_as_bytes(), obj, true)?; + // Ok(SerializedData::RofiData(ser_data)) + // } fn serialize_header( &self, header: Option, @@ -245,9 +241,9 @@ impl LamellaeRDMA for Rofi { fn rt_alloc(&self, size: usize, align: usize) -> AllocResult { self.rofi_comm.rt_alloc(size, align) } - fn rt_check_alloc(&self, size: usize, align: usize) -> bool { - self.rofi_comm.rt_check_alloc(size, align) - } + // fn rt_check_alloc(&self, size: usize, align: usize) -> bool { + // self.rofi_comm.rt_check_alloc(size, align) + // } fn rt_free(&self, addr: usize) { self.rofi_comm.rt_free(addr) } @@ -266,12 +262,12 @@ impl LamellaeRDMA for Rofi { fn remote_addr(&self, remote_pe: usize, local_addr: usize) -> usize { self.rofi_comm.remote_addr(remote_pe, local_addr) } - fn occupied(&self) -> usize { - self.rofi_comm.occupied() - } - fn num_pool_allocs(&self) -> usize { - self.rofi_comm.num_pool_allocs() - } + // fn occupied(&self) -> usize { + // self.rofi_comm.occupied() + // } + // fn num_pool_allocs(&self) -> usize { + // self.rofi_comm.num_pool_allocs() + // } fn alloc_pool(&self, min_size: usize) { // println!("trying to alloc pool {:?}",min_size); match config().heap_mode { diff --git a/src/lamellae/shmem/shmem_comm.rs b/src/lamellae/shmem/shmem_comm.rs index 0831fe72..2b470238 100644 --- a/src/lamellae/shmem/shmem_comm.rs +++ b/src/lamellae/shmem/shmem_comm.rs @@ -577,9 +577,9 @@ impl CommOps for ShmemComm { // println!("iget s_addr {:?} d_addr {:?} b_addr {:?}",src_addr,dst_addr.as_ptr(),self.base_addr()); self.get(pe, src_addr, dst_addr); } - fn iget_relative(&self, pe: usize, src_addr: usize, dst_addr: &mut [T]) { - self.get(pe, src_addr + self.base_addr(), dst_addr); - } + // fn iget_relative(&self, pe: usize, src_addr: usize, dst_addr: &mut [T]) { + // self.get(pe, src_addr + self.base_addr(), dst_addr); + // } fn force_shutdown(&self) {} } @@ -610,7 +610,7 @@ pub(crate) struct ShmemData { } impl ShmemData { - pub fn new(shmem_comm: Arc, size: usize) -> Result { + pub(crate) fn new(shmem_comm: Arc, size: usize) -> Result { let ref_cnt_size = std::mem::size_of::(); let alloc_size = size + ref_cnt_size; //+ std::mem::size_of::(); let relative_addr = shmem_comm.rt_alloc(alloc_size, std::mem::align_of::())?; diff --git a/src/lamellae/shmem_lamellae.rs b/src/lamellae/shmem_lamellae.rs index c5ae396e..249302af 100644 --- a/src/lamellae/shmem_lamellae.rs +++ b/src/lamellae/shmem_lamellae.rs @@ -5,8 +5,8 @@ use crate::lamellae::command_queues::CommandQueue; use crate::lamellae::shmem::shmem_comm::*; use crate::lamellae::{ - AllocationType, Backend, Comm, Des, Lamellae, LamellaeAM, LamellaeComm, LamellaeInit, - LamellaeRDMA, Ser, SerializeHeader, SerializedData, 
SerializedDataOps, SERIALIZE_HEADER_LEN, + AllocationType, Backend, Comm, Lamellae, LamellaeAM, LamellaeComm, LamellaeInit, LamellaeRDMA, + Ser, SerializeHeader, SerializedData, SerializedDataOps, SERIALIZE_HEADER_LEN, }; use crate::lamellar_arch::LamellarArchRT; use crate::scheduler::Scheduler; @@ -136,7 +136,7 @@ impl LamellaeComm for Shmem { // (self.shmem_comm.put_amt.load(Ordering::SeqCst) + self.shmem_comm.get_amt.load(Ordering::SeqCst)) as self.cq.tx_amount() as f64 / 1_000_000.0 } - fn print_stats(&self) {} + // fn print_stats(&self) {} fn shutdown(&self) { // println!("Shmem Lamellae shuting down"); let _ = self.active.compare_exchange( @@ -166,10 +166,6 @@ impl LamellaeComm for Shmem { #[async_trait] impl LamellaeAM for Shmem { - async fn send_to_pe_async(&self, pe: usize, data: SerializedData) { - self.cq.send_data(data, pe).await; - } //should never send to self... this is short circuited before request is serialized in the active message layer - async fn send_to_pes_async( &self, pe: Option, @@ -190,18 +186,18 @@ impl LamellaeAM for Shmem { } impl Ser for Shmem { - fn serialize( - &self, - header: Option, - obj: &T, - ) -> Result { - let header_size = *SERIALIZE_HEADER_LEN; - let data_size = crate::serialized_size(obj, true) as usize; - let ser_data = ShmemData::new(self.shmem_comm.clone(), header_size + data_size)?; - crate::serialize_into(ser_data.header_as_bytes(), &header, false)?; //we want header to be a fixed size - crate::serialize_into(ser_data.data_as_bytes(), obj, true)?; - Ok(SerializedData::ShmemData(ser_data)) - } + // fn serialize( + // &self, + // header: Option, + // obj: &T, + // ) -> Result { + // let header_size = *SERIALIZE_HEADER_LEN; + // let data_size = crate::serialized_size(obj, true) as usize; + // let ser_data = ShmemData::new(self.shmem_comm.clone(), header_size + data_size)?; + // crate::serialize_into(ser_data.header_as_bytes(), &header, false)?; //we want header to be a fixed size + // crate::serialize_into(ser_data.data_as_bytes(), obj, true)?; + // Ok(SerializedData::ShmemData(ser_data)) + // } fn serialize_header( &self, header: Option, @@ -235,9 +231,9 @@ impl LamellaeRDMA for Shmem { fn rt_alloc(&self, size: usize, align: usize) -> AllocResult { self.shmem_comm.rt_alloc(size, align) } - fn rt_check_alloc(&self, size: usize, align: usize) -> bool { - self.shmem_comm.rt_check_alloc(size, align) - } + // fn rt_check_alloc(&self, size: usize, align: usize) -> bool { + // self.shmem_comm.rt_check_alloc(size, align) + // } fn rt_free(&self, addr: usize) { self.shmem_comm.rt_free(addr) } @@ -256,12 +252,12 @@ impl LamellaeRDMA for Shmem { fn remote_addr(&self, remote_pe: usize, local_addr: usize) -> usize { self.shmem_comm.remote_addr(remote_pe, local_addr) } - fn occupied(&self) -> usize { - self.shmem_comm.occupied() - } - fn num_pool_allocs(&self) -> usize { - self.shmem_comm.num_pool_allocs() - } + // fn occupied(&self) -> usize { + // self.shmem_comm.occupied() + // } + // fn num_pool_allocs(&self) -> usize { + // self.shmem_comm.num_pool_allocs() + // } fn alloc_pool(&self, min_size: usize) { match config().heap_mode { HeapMode::Static => { diff --git a/src/lamellar_alloc.rs b/src/lamellar_alloc.rs index 8069382a..c1a5aa46 100644 --- a/src/lamellar_alloc.rs +++ b/src/lamellar_alloc.rs @@ -11,6 +11,7 @@ use std::sync::Arc; pub(crate) trait LamellarAlloc { fn new(id: String) -> Self; fn init(&mut self, start_addr: usize, size: usize); //size in bytes + #[allow(dead_code)] fn malloc(&self, size: usize, align: usize) -> usize; fn 
try_malloc(&self, size: usize, align: usize) -> Option; fn fake_malloc(&self, size: usize, align: usize) -> bool; diff --git a/src/lamellar_arch.rs b/src/lamellar_arch.rs index f4431b10..2d961408 100644 --- a/src/lamellar_arch.rs +++ b/src/lamellar_arch.rs @@ -23,9 +23,9 @@ pub trait LamellarArch: Send + Sync { /// An error that occurs when trying to access a PE that does not exist on a team/subteam #[derive(Debug, Clone, Copy)] pub struct IdError { - #[doc(hidden)] + /// the PE id of the parent team pub parent_pe: usize, - #[doc(hidden)] + /// the PE id of the current team pub team_pe: usize, } @@ -63,7 +63,7 @@ impl std::fmt::Debug for LamellarArchEnum { } impl LamellarArchEnum { - pub fn new(arch: A) -> LamellarArchEnum + pub(crate) fn new(arch: A) -> LamellarArchEnum where A: LamellarArch + 'static, { @@ -135,7 +135,7 @@ pub(crate) struct LamellarArchRT { } impl LamellarArchRT { - pub fn new(parent: Arc, arch: A) -> LamellarArchRT + pub(crate) fn new(parent: Arc, arch: A) -> LamellarArchRT where A: LamellarArch + 'static, { @@ -159,10 +159,10 @@ impl LamellarArchRT { arch: LamellarArchEnum::new(arch), } } - pub fn num_pes(&self) -> usize { + pub(crate) fn num_pes(&self) -> usize { self.num_pes } - pub fn world_pe(&self, team_pe: usize) -> ArchResult { + pub(crate) fn world_pe(&self, team_pe: usize) -> ArchResult { let parent_pe = self.arch.parent_pe_id(&team_pe)?; if let Some(parent) = &self.parent { parent.world_pe(parent_pe) @@ -171,7 +171,7 @@ impl LamellarArchRT { } } - pub fn team_pe(&self, world_pe: usize) -> ArchResult { + pub(crate) fn team_pe(&self, world_pe: usize) -> ArchResult { if let Some(parent) = &self.parent { let parent_pe = parent.team_pe(world_pe)?; // println!("world_pe {:?} parent_pe {:?} self: {:?}",world_pe, parent_pe,self); @@ -186,7 +186,7 @@ impl LamellarArchRT { } } - pub fn team_iter(&self) -> Box> { + pub(crate) fn team_iter(&self) -> Box> { //return an iterator of the teams global pe ids Box::new(LamellarArchRTiter { arch: self.clone(), @@ -195,7 +195,7 @@ impl LamellarArchRT { }) } #[allow(dead_code)] - pub fn single_iter(&self, pe: usize) -> Box> { + pub(crate) fn single_iter(&self, pe: usize) -> Box> { //a single element iterator returning the global id of pe Box::new(LamellarArchRTiter { arch: self.clone(), @@ -232,14 +232,14 @@ impl Iterator for LamellarArchRTiter { } } -#[doc(hidden)] +//#[doc(hidden)] #[derive(Copy, Clone, std::hash::Hash, Debug)] -pub struct GlobalArch { +pub(crate) struct GlobalArch { pub(crate) num_pes: usize, } impl GlobalArch { - pub fn new(num_pes: usize) -> GlobalArch { + pub(crate) fn new(num_pes: usize) -> GlobalArch { GlobalArch { num_pes: num_pes } } } diff --git a/src/lamellar_request.rs b/src/lamellar_request.rs index bb9cde70..9a45d5c5 100755 --- a/src/lamellar_request.rs +++ b/src/lamellar_request.rs @@ -58,7 +58,7 @@ pub(crate) enum InternalResult { // } // } -#[doc(hidden)] +//#[doc(hidden)] // #[enum_dispatch] pub(crate) trait LamellarRequest: Future { fn blocking_wait(self) -> Self::Output; @@ -66,7 +66,7 @@ pub(crate) trait LamellarRequest: Future { fn val(&self) -> Self::Output; } -// #[doc(hidden)] +// //#[doc(hidden)] // #[async_trait] // pub trait LamellarMultiRequest: Sync + Send { // type Output; @@ -250,7 +250,7 @@ impl LamellarRequestResult { // // we use the ready bool to protect access to the data field // unsafe impl Sync for LamellarRequestHandleInner {} -// #[doc(hidden)] +// //#[doc(hidden)] // #[derive(Debug)] // pub struct LamellarRequestHandle { // pub(crate) inner: Arc, @@ -381,7 +381,7 @@ impl 
LamellarRequestResult { // pub(crate) user_handle: AtomicBool, //we can use this flag to optimize what happens when the request returns // } -// #[doc(hidden)] +// //#[doc(hidden)] // #[derive(Debug)] // pub struct LamellarMultiRequestHandle { // pub(crate) inner: Arc, @@ -529,7 +529,7 @@ impl LamellarRequestResult { // // we use the ready bool to protect access to the data field // unsafe impl Sync for LamellarLocalRequestHandleInner {} -// #[doc(hidden)] +// //#[doc(hidden)] // #[derive(Debug)] // pub struct LamellarLocalRequestHandle { // pub(crate) inner: Arc, diff --git a/src/lamellar_task_group.rs b/src/lamellar_task_group.rs index d2783f5f..72bbf946 100644 --- a/src/lamellar_task_group.rs +++ b/src/lamellar_task_group.rs @@ -36,7 +36,7 @@ pub(crate) struct TaskGroupAmHandleInner { pub(crate) scheduler: Arc, } -#[doc(hidden)] +//#[doc(hidden)] #[derive(Debug)] #[pin_project(PinnedDrop)] pub struct TaskGroupAmHandle { @@ -191,7 +191,7 @@ pub(crate) struct TaskGroupMultiAmHandleInner { pub(crate) scheduler: Arc, } -#[doc(hidden)] +//#[doc(hidden)] #[derive(Debug)] #[pin_project(PinnedDrop)] pub struct TaskGroupMultiAmHandle { @@ -370,7 +370,7 @@ impl Future for TaskGroupMultiAmHandle { } } -#[doc(hidden)] +//#[doc(hidden)] #[derive(Debug)] #[pin_project(PinnedDrop)] pub struct TaskGroupLocalAmHandle { @@ -540,6 +540,10 @@ impl ActiveMessaging for LamellarTaskGroup { self.wait_all(); } + fn await_all(&self) -> impl std::future::Future + Send { + self.await_all() + } + //#[tracing::instrument(skip_all)] fn barrier(&self) { self.team.barrier(); @@ -667,7 +671,7 @@ impl LamellarTaskGroup { } } - pub async fn await_all(&self) { + async fn await_all(&self) { let mut temp_now = Instant::now(); while self.counters.outstanding_reqs.load(Ordering::SeqCst) > 0 { // self.team.flush(); @@ -1563,7 +1567,7 @@ impl BaseAmGroupReq { /// This enum is used to hold the results of a TypedAmGroup request #[derive(Clone)] -pub enum BaseAmGroupResult { +pub(crate) enum BaseAmGroupResult { // T here should be the inner most return type /// AmGroup executed on a single PE, and does not return any value SinglePeUnit(T), diff --git a/src/lamellar_team.rs b/src/lamellar_team.rs index 2f53ce08..7586e295 100644 --- a/src/lamellar_team.rs +++ b/src/lamellar_team.rs @@ -355,6 +355,7 @@ impl LamellarTeam { #[doc(alias = "Collective")] /// team wide synchronization method which blocks calling thread until all PEs in the team have entered + /// Generally this is intended to be called from the main thread, if a barrier is needed within an active message or async context please see [async_barrier](Self::async_barrier) /// /// # Collective Operation /// Requrires all PEs present within the team to enter the barrier otherwise deadlock will occur. @@ -381,12 +382,29 @@ impl LamellarTeam { self.team.barrier() } + #[doc(alias = "Collective")] + /// EXPERIMENTAL: team wide synchronization method which blocks the calling task until all PEs in team have entered. + /// This function allows for calling barrier in an async context without blocking the worker thread. + /// Care should be taken when using this function to avoid deadlocks,as it is easy to mismatch barrier calls accross threads and PEs. + /// + /// # Collective Operation + /// Requrires all PEs present within the team to enter the barrier otherwise deadlock will occur. 
+ /// + /// # Examples + ///``` + /// use lamellar::active_messaging::prelude::*; + /// + /// let world = lamellar::LamellarWorldBuilder::new().build(); + /// //do some work + /// world.barrier(); //block until all PEs have entered the barrier + ///``` pub fn async_barrier(&self) -> impl std::future::Future + Send + '_ { assert!(self.panic.load(Ordering::SeqCst) == 0); self.team.async_barrier() } + //used by proc macro #[doc(hidden)] pub fn exec_am_group_pe(&self, pe: usize, am: F) -> AmHandle where @@ -396,6 +414,7 @@ impl LamellarTeam { self.team.am_group_exec_am_pe_tg(pe, am, None) } + //used by proc macro #[doc(hidden)] pub fn exec_am_group_all(&self, am: F) -> MultiAmHandle where @@ -485,6 +504,12 @@ impl ActiveMessaging for Arc { self.team.wait_all(); } + fn await_all(&self) -> impl std::future::Future + Send { + assert!(self.panic.load(Ordering::SeqCst) == 0); + + self.team.await_all() + } + //#[tracing::instrument(skip_all)] fn barrier(&self) { assert!(self.panic.load(Ordering::SeqCst) == 0); @@ -602,6 +627,8 @@ impl From for IntoLamellarTeam { } } +// Intenal Runtime handle to a lamellar team +// users generally don't need to use this #[doc(hidden)] pub struct ArcLamellarTeam { pub team: Arc, @@ -671,6 +698,9 @@ impl From>> for LamellarTeamRemotePtr { } } +// Internal Runtime handle to a lamellar team +// used by proc macros +// users should never need to use this #[doc(hidden)] pub struct LamellarTeamRT { #[allow(dead_code)] @@ -1333,6 +1363,27 @@ impl LamellarTeamRT { } } } + pub(crate) async fn await_all(&self) { + let mut temp_now = Instant::now(); + while self.panic.load(Ordering::SeqCst) == 0 + && (self.team_counters.outstanding_reqs.load(Ordering::SeqCst) > 0 + || (self.parent.is_none() + && self.world_counters.outstanding_reqs.load(Ordering::SeqCst) > 0)) + { + // std::thread::yield_now(); + // self.flush(); + async_std::task::yield_now().await; + if temp_now.elapsed().as_secs_f64() > config().deadlock_timeout { + println!( + "in team wait_all mype: {:?} cnt: {:?} {:?}", + self.world_pe, + self.team_counters.send_req_cnt.load(Ordering::SeqCst), + self.team_counters.outstanding_reqs.load(Ordering::SeqCst), + ); + temp_now = Instant::now(); + } + } + } pub(crate) fn block_on(&self, f: F) -> F::Output where @@ -1816,6 +1867,7 @@ impl LamellarTeamRT { .into() } + #[allow(dead_code)] pub(crate) async fn exec_arc_am_pe_immediately( self: &Pin>, pe: usize, diff --git a/src/lamellar_world.rs b/src/lamellar_world.rs index 987c674e..30df829f 100644 --- a/src/lamellar_world.rs +++ b/src/lamellar_world.rs @@ -72,6 +72,9 @@ impl ActiveMessaging for LamellarWorld { fn wait_all(&self) { self.team.wait_all(); } + fn await_all(&self) -> impl std::future::Future + Send { + self.team.await_all() + } //#[tracing::instrument(skip_all)] fn barrier(&self) { self.team.barrier(); @@ -187,8 +190,15 @@ impl LamellarWorld { } } - #[doc(hidden)] - //#[tracing::instrument(skip_all)] + #[doc(alias("One-sided", "onesided"))] //#[tracing::instrument(skip_all)] + /// Returns the underlying [LamellarTeam] for this world + /// # Examples + ///``` + /// use lamellar::active_messaging::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let team = world.team(); + ///``` pub fn team(&self) -> Arc { self.team.clone() } @@ -210,9 +220,9 @@ impl LamellarWorld { self.team.num_threads_per_pe() } - pub fn flush(&self) { - self.team_rt.flush(); - } + // pub fn flush(&self) { + // self.team_rt.flush(); + // } } impl LamellarEnv for LamellarWorld { diff --git a/src/lib.rs b/src/lib.rs index 
10c129d3..eea4d2d9 100755 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,5 @@ #![warn(missing_docs)] +#![warn(unreachable_pub)] //! Lamellar is an investigation of the applicability of the Rust systems programming language for HPC as an alternative to C and C++, with a focus on PGAS approaches. //! @@ -224,7 +225,7 @@ //! - `rofi` //! - `LAMELLAR_MEM_SIZE` - Specify the initial size of the Runtime "RDMAable" memory pool. Defaults to 1GB //! - `export LAMELLAR_MEM_SIZE=$((20*1024*1024*1024))` 20GB memory pool -//! - Internally, Lamellar utilizes memory pools of RDMAable memory for Runtime data structures (e.g. [Darcs][crate::Darc], [OneSidedMemoryRegion][crate::memregion::OneSidedMemoryRegion],etc), aggregation buffers, and message queues. Additional memory pools are dynamically allocated across the system as needed. This can be a fairly expensive operation (as the operation is synchronous across all PEs) so the runtime will print a message at the end of execution with how many additional pools were allocated. +//! - Internally, Lamellar utilizes memory pools of RDMAable memory for Runtime data structures (e.g. [Darcs][crate::Darc], [OneSidedMemoryRegion],etc), aggregation buffers, and message queues. Additional memory pools are dynamically allocated across the system as needed. This can be a fairly expensive operation (as the operation is synchronous across all PEs) so the runtime will print a message at the end of execution with how many additional pools were allocated. //! - if you find you are dynamically allocating new memory pools, try setting `LAMELLAR_MEM_SIZE` to a larger value //! - Note: when running multiple PEs on a single system, the total allocated memory for the pools would be equal to `LAMELLAR_MEM_SIZE * number of processes` //! @@ -233,37 +234,37 @@ extern crate lazy_static; #[macro_use] extern crate memoffset; -#[doc(hidden)] +//#[doc(hidden)] pub extern crate serde; -#[doc(hidden)] +//#[doc(hidden)] pub use serde::*; -// #[doc(hidden)] +// //#[doc(hidden)] pub extern crate serde_with; // pub use serde_with::*; -// #[doc(hidden)] +// //#[doc(hidden)] // pub extern crate tracing; -#[doc(hidden)] +//#[doc(hidden)] pub use parking_lot; -// #[doc(hidden)] +// //#[doc(hidden)] // pub use tracing::*; -#[doc(hidden)] +//#[doc(hidden)] pub use async_trait; -#[doc(hidden)] +//#[doc(hidden)] pub use futures_util; pub mod active_messaging; -#[doc(hidden)] +// //#[doc(hidden)] pub use active_messaging::prelude::*; pub mod array; -#[doc(hidden)] +// //#[doc(hidden)] pub use array::prelude::*; mod barrier; pub mod darc; -#[doc(hidden)] +// //#[doc(hidden)] pub use darc::prelude::*; mod lamellae; mod lamellar_alloc; @@ -275,11 +276,11 @@ mod lamellar_task_group; mod lamellar_team; mod lamellar_world; pub mod memregion; -#[doc(hidden)] +// //#[doc(hidden)] pub use memregion::prelude::*; mod scheduler; mod utils; -#[doc(hidden)] +//#[doc(hidden)] pub use utils::*; mod env_var; @@ -287,37 +288,37 @@ pub use env_var::config; pub use crate::lamellae::Backend; pub use crate::lamellar_arch::{BlockedArch, IdError, LamellarArch, StridedArch}; -#[doc(hidden)] +// //#[doc(hidden)] pub use crate::lamellar_task_group::{ AmGroup, AmGroupResult, BaseAmGroupReq, LamellarTaskGroup, TypedAmGroupBatchReq, TypedAmGroupBatchResult, TypedAmGroupResult, }; pub use crate::lamellar_team::LamellarTeam; -#[doc(hidden)] +// //#[doc(hidden)] pub use crate::lamellar_team::{ArcLamellarTeam, LamellarTeamRT}; pub use crate::lamellar_world::*; pub use crate::scheduler::ExecutorType; extern crate lamellar_impl; 
-#[doc(hidden)] +// //#[doc(hidden)] pub use lamellar_impl::Dist; // use lamellar_impl; -#[doc(hidden)] +//#[doc(hidden)] pub use inventory; -#[doc(hidden)] +//#[doc(hidden)] pub use bincode; use bincode::Options; // #[macro_use] // pub extern crate custom_derive; -#[doc(hidden)] +//#[doc(hidden)] pub use custom_derive; // #[macro_use] // pub extern crate newtype_derive; -#[doc(hidden)] +//#[doc(hidden)] pub use newtype_derive; lazy_static! { @@ -336,7 +337,7 @@ lazy_static! { // thread_local::ThreadLocal::new(); // } -#[doc(hidden)] +/// Wrapper function for serializing data pub fn serialize(obj: &T, var: bool) -> Result, anyhow::Error> where T: serde::Serialize, @@ -356,7 +357,7 @@ where res } -#[doc(hidden)] +/// Wrapper function for getting the size of serialized data pub fn serialized_size(obj: &T, var: bool) -> usize where T: serde::Serialize, @@ -375,7 +376,8 @@ where // } res } -#[doc(hidden)] + +/// Wrapper function for serializing an object into a buffer pub fn serialize_into(buf: &mut [u8], obj: &T, var: bool) -> Result<(), anyhow::Error> where T: serde::Serialize, @@ -395,7 +397,7 @@ where Ok(()) } -#[doc(hidden)] +/// Wrapper function for deserializing data pub fn deserialize<'a, T>(bytes: &'a [u8], var: bool) -> Result where T: serde::Deserialize<'a>, @@ -414,5 +416,5 @@ where // } res } -#[doc(hidden)] +//#[doc(hidden)] pub use async_std; diff --git a/src/memregion.rs b/src/memregion.rs index dfdbfa6e..a5bda9fe 100644 --- a/src/memregion.rs +++ b/src/memregion.rs @@ -17,7 +17,8 @@ use core::marker::PhantomData; use std::hash::{Hash, Hasher}; use std::sync::Arc; -#[doc(hidden)] +//#[doc(hidden)] +/// Prelude for using the [LamellarMemoryRegion] module pub mod prelude; pub(crate) mod shared; @@ -63,12 +64,15 @@ pub trait Dist: // { // } -#[doc(hidden)] +//#[doc(hidden)] +/// Enum used to expose common methods for all registered memory regions #[enum_dispatch(RegisteredMemoryRegion, MemRegionId, AsBase, MemoryRegionRDMA, RTMemoryRegionRDMA, LamellarEnv)] #[derive(serde::Serialize, serde::Deserialize, Clone, Debug)] #[serde(bound = "T: Dist + serde::Serialize + serde::de::DeserializeOwned")] pub enum LamellarMemoryRegion { + /// Shared(SharedMemoryRegion), + /// Local(OneSidedMemoryRegion), // Unsafe(UnsafeArray), } @@ -115,6 +119,8 @@ impl crate::active_messaging::DarcSerde for LamellarMemoryRegion { impl LamellarMemoryRegion { //#[tracing::instrument(skip_all)] + /// If the memory region contains local data, return it as a mutable slice + /// else return an error pub unsafe fn as_mut_slice(&self) -> MemResult<&mut [T]> { match self { LamellarMemoryRegion::Shared(memregion) => memregion.as_mut_slice(), @@ -124,6 +130,8 @@ impl LamellarMemoryRegion { } //#[tracing::instrument(skip_all)] + /// if the memory region contains local data, return it as a slice + /// else return an error pub unsafe fn as_slice(&self) -> MemResult<&[T]> { match self { LamellarMemoryRegion::Shared(memregion) => memregion.as_slice(), @@ -280,6 +288,8 @@ pub trait RegisteredMemoryRegion { /// assert_eq!(mem_region.len(),1000); ///``` fn len(&self) -> usize; + + //TODO: move this function to a private trait or private method #[doc(hidden)] fn addr(&self) -> MemResult; @@ -404,9 +414,12 @@ pub(crate) trait MemRegionId { // because we want MemRegion to impl RegisteredMemoryRegion (so that it can be used in Shared + Local) // but MemRegion should not return LamellarMemoryRegions directly (as both SubRegion and AsBase require) // we will implement seperate functions for MemoryRegion itself. 
-#[doc(hidden)] +//#[doc(hidden)] + +/// Trait for creating subregions of a memory region #[enum_dispatch] pub trait SubRegion { + #[doc(hidden)] type Region: RegisteredMemoryRegion + MemoryRegionRDMA; #[doc(alias("One-sided", "onesided"))] /// Create a sub region of this RegisteredMemoryRegion using the provided range diff --git a/src/memregion/one_sided.rs b/src/memregion/one_sided.rs index dcdedff2..4890d0ea 100644 --- a/src/memregion/one_sided.rs +++ b/src/memregion/one_sided.rs @@ -26,7 +26,7 @@ lazy_static! { static ID_COUNTER: AtomicUsize = AtomicUsize::new(0); -#[doc(hidden)] +//#[doc(hidden)] #[derive(serde::Serialize, serde::Deserialize, Clone, Debug)] pub struct NetMemRegionHandle { mr_addr: usize, diff --git a/src/memregion/prelude.rs b/src/memregion/prelude.rs index 7cdf9434..a941b3ab 100644 --- a/src/memregion/prelude.rs +++ b/src/memregion/prelude.rs @@ -5,7 +5,7 @@ pub use crate::memregion::{ pub use crate::active_messaging::ActiveMessaging; pub use crate::lamellar_team::LamellarTeam; -#[doc(hidden)] +//#[doc(hidden)] pub use crate::lamellar_team::LamellarTeamRT; pub use crate::lamellar_world::LamellarWorld; pub use crate::lamellar_world::LamellarWorldBuilder; diff --git a/src/scheduler.rs b/src/scheduler.rs index 7774654b..e90f9990 100755 --- a/src/scheduler.rs +++ b/src/scheduler.rs @@ -78,8 +78,11 @@ pub(crate) struct ReqId { pub enum ExecutorType { /// The default work stealing executor LamellarWorkStealing, + /// Experimental numa-aware(ish) work stealing executor LamellarWorkStealing2, + /// Experimental numa-aware(ish) work stealing executor LamellarWorkStealing3, + /// executor provided by the AsyncStd crate AsyncStd, #[cfg(feature = "tokio-executor")] /// The tokio executor @@ -113,7 +116,7 @@ pub(crate) trait LamellarExecutor { fn block_on(&self, future: F) -> F::Output; - fn set_max_workers(&mut self, num_workers: usize); + // fn set_max_workers(&mut self, num_workers: usize); fn num_workers(&self) -> usize; fn shutdown(&self); fn force_shutdown(&self); diff --git a/src/scheduler/async_std_executor.rs b/src/scheduler/async_std_executor.rs index e6104393..89a35909 100644 --- a/src/scheduler/async_std_executor.rs +++ b/src/scheduler/async_std_executor.rs @@ -60,9 +60,9 @@ impl LamellarExecutor for AsyncStdRt { // I dont think tokio has a way to do this } - fn set_max_workers(&mut self, num_workers: usize) { - self.max_num_threads = num_workers; - } + // fn set_max_workers(&mut self, num_workers: usize) { + // self.max_num_threads = num_workers; + // } fn num_workers(&self) -> usize { self.max_num_threads diff --git a/src/scheduler/tokio_executor.rs b/src/scheduler/tokio_executor.rs index a6edad09..eafd942b 100644 --- a/src/scheduler/tokio_executor.rs +++ b/src/scheduler/tokio_executor.rs @@ -61,9 +61,9 @@ impl LamellarExecutor for TokioRt { // I dont think tokio has a way to do this } - fn set_max_workers(&mut self, num_workers: usize) { - self.max_num_threads = num_workers; - } + // fn set_max_workers(&mut self, num_workers: usize) { + // self.max_num_threads = num_workers; + // } fn num_workers(&self) -> usize { self.max_num_threads diff --git a/src/scheduler/work_stealing.rs b/src/scheduler/work_stealing.rs index fdf674c7..f20a8b56 100644 --- a/src/scheduler/work_stealing.rs +++ b/src/scheduler/work_stealing.rs @@ -267,9 +267,9 @@ impl LamellarExecutor for WorkStealing { } } - fn set_max_workers(&mut self, num_workers: usize) { - self.max_num_threads = num_workers; - } + // fn set_max_workers(&mut self, num_workers: usize) { + // self.max_num_threads = 
num_workers; + // } fn num_workers(&self) -> usize { self.max_num_threads diff --git a/src/scheduler/work_stealing2.rs b/src/scheduler/work_stealing2.rs index 64e958d0..22b1b4ee 100644 --- a/src/scheduler/work_stealing2.rs +++ b/src/scheduler/work_stealing2.rs @@ -144,72 +144,72 @@ impl WorkStealingThread { } } -#[derive(Debug)] -pub(crate) struct IoThread { - io_inj: Arc>>, - io_q: Worker>, - status: Arc, - panic: Arc, -} - -impl IoThread { - //#[tracing::instrument(skip_all)] - fn run(worker: IoThread, active_cnt: Arc, id: CoreId) -> thread::JoinHandle<()> { - let builder = thread::Builder::new().name("io_thread".into()); - builder - .spawn(move || { - core_affinity::set_for_current(id); - active_cnt.fetch_add(1, Ordering::SeqCst); - let mut timer = std::time::Instant::now(); - while worker.panic.load(Ordering::SeqCst) == 0 - && (worker.status.load(Ordering::SeqCst) == SchedulerStatus::Active as u8 - || !(worker.io_q.is_empty() && worker.io_inj.is_empty())) - { - let io_task = worker - .io_q - .pop() - .or_else(|| worker.io_inj.steal_batch_and_pop(&worker.io_q).success()); - if let Some(runnable) = io_task { - if worker.status.load(Ordering::SeqCst) == SchedulerStatus::Finished as u8 - && timer.elapsed().as_secs_f64() > config().deadlock_timeout - { - println!( - "io_q size {:?} io inj size {:?} ", // num_tasks {:?}", - worker.io_q.len(), - worker.io_inj.len(), - // num_tasks.load(Ordering::SeqCst) - ); - timer = std::time::Instant::now(); - } - runnable.run(); - } - - if worker.status.load(Ordering::SeqCst) == SchedulerStatus::Finished as u8 - && timer.elapsed().as_secs_f64() > config().deadlock_timeout - && (worker.io_q.len() > 0 || worker.io_inj.len() > 0) - { - println!( - "io_q size {:?} io inj size {:?} ", // num_tasks {:?}", - worker.io_q.len(), - worker.io_inj.len(), - // num_tasks.load(Ordering::SeqCst) - ); - timer = std::time::Instant::now(); - } - std::thread::yield_now(); - } - active_cnt.fetch_sub(1, Ordering::SeqCst); - }) - .unwrap() - } -} +// #[derive(Debug)] +// pub(crate) struct IoThread { +// io_inj: Arc>>, +// io_q: Worker>, +// status: Arc, +// panic: Arc, +// } + +// impl IoThread { +// //#[tracing::instrument(skip_all)] +// fn run(worker: IoThread, active_cnt: Arc, id: CoreId) -> thread::JoinHandle<()> { +// let builder = thread::Builder::new().name("io_thread".into()); +// builder +// .spawn(move || { +// core_affinity::set_for_current(id); +// active_cnt.fetch_add(1, Ordering::SeqCst); +// let mut timer = std::time::Instant::now(); +// while worker.panic.load(Ordering::SeqCst) == 0 +// && (worker.status.load(Ordering::SeqCst) == SchedulerStatus::Active as u8 +// || !(worker.io_q.is_empty() && worker.io_inj.is_empty())) +// { +// let io_task = worker +// .io_q +// .pop() +// .or_else(|| worker.io_inj.steal_batch_and_pop(&worker.io_q).success()); +// if let Some(runnable) = io_task { +// if worker.status.load(Ordering::SeqCst) == SchedulerStatus::Finished as u8 +// && timer.elapsed().as_secs_f64() > config().deadlock_timeout +// { +// println!( +// "io_q size {:?} io inj size {:?} ", // num_tasks {:?}", +// worker.io_q.len(), +// worker.io_inj.len(), +// // num_tasks.load(Ordering::SeqCst) +// ); +// timer = std::time::Instant::now(); +// } +// runnable.run(); +// } + +// if worker.status.load(Ordering::SeqCst) == SchedulerStatus::Finished as u8 +// && timer.elapsed().as_secs_f64() > config().deadlock_timeout +// && (worker.io_q.len() > 0 || worker.io_inj.len() > 0) +// { +// println!( +// "io_q size {:?} io inj size {:?} ", // num_tasks {:?}", +// 
worker.io_q.len(), +// worker.io_inj.len(), +// // num_tasks.load(Ordering::SeqCst) +// ); +// timer = std::time::Instant::now(); +// } +// std::thread::yield_now(); +// } +// active_cnt.fetch_sub(1, Ordering::SeqCst); +// }) +// .unwrap() +// } +// } #[derive(Debug)] pub(crate) struct WorkStealing2 { max_num_threads: usize, threads: Vec>, imm_inj: Arc>>, - io_inj: Arc>>, + // io_inj: Arc>>, work_injs: Vec>>>, work_stealers: Vec>>, work_flag: Arc, @@ -351,9 +351,9 @@ impl LamellarExecutor for WorkStealing2 { } } - fn set_max_workers(&mut self, num_workers: usize) { - self.max_num_threads = num_workers; - } + // fn set_max_workers(&mut self, num_workers: usize) { + // self.max_num_threads = num_workers; + // } fn num_workers(&self) -> usize { self.max_num_threads @@ -367,7 +367,7 @@ impl WorkStealing2 { panic: Arc, ) -> WorkStealing2 { // println!("new work stealing queue"); - let num_workers = std::cmp::max(1,num_workers-1); + let num_workers = std::cmp::max(1, num_workers - 1); let mut num_threads_per_group = match std::env::var("LAMELLAR_WS2_THREADS") { Ok(s) => { if let Ok(num) = s.parse::() { @@ -378,15 +378,15 @@ impl WorkStealing2 { } _ => 4, }; - if num_threads_per_group > num_workers { - num_threads_per_group = num_workers + if num_threads_per_group > num_workers { + num_threads_per_group = num_workers } let mut ws = WorkStealing2 { max_num_threads: num_workers, threads: Vec::new(), imm_inj: Arc::new(Injector::new()), - io_inj: Arc::new(Injector::new()), + // io_inj: Arc::new(Injector::new()), work_injs: Vec::new(), work_stealers: Vec::new(), work_flag: Arc::new(AtomicU8::new(0)), @@ -446,7 +446,7 @@ impl WorkStealing2 { .enumerate() { // println!("init group {} {:?}", group_id, group_stealers.len()); - let work_flag = Arc::new(AtomicU8::new(0)); + let work_flag = self.work_flag.clone(); for _ in 0..group_stealers.len() { let group_queue = TaskQueue { tasks: work_workers.pop().unwrap(), diff --git a/src/scheduler/work_stealing3.rs b/src/scheduler/work_stealing3.rs index 636c9216..405de9ff 100644 --- a/src/scheduler/work_stealing3.rs +++ b/src/scheduler/work_stealing3.rs @@ -266,8 +266,8 @@ impl LamellarExecutor for WorkStealing3 { //#[tracing::instrument(skip_all)] fn exec_task(&self) { - let mut rng = rand::thread_rng(); - let t = rand::distributions::Uniform::from(0..self.work_stealers.len()); + let mut _rng = rand::thread_rng(); + let _t = rand::distributions::Uniform::from(0..self.work_stealers.len()); let ret = if !self.imm_inj.is_empty() { self.imm_inj.steal().success() } else { @@ -291,9 +291,9 @@ impl LamellarExecutor for WorkStealing3 { } } - fn set_max_workers(&mut self, num_workers: usize) { - self.max_num_threads = num_workers; - } + // fn set_max_workers(&mut self, num_workers: usize) { + // self.max_num_threads = num_workers; + // } fn num_workers(&self) -> usize { self.max_num_threads diff --git a/tests/array/arithmetic_ops/fetch_add_test.rs b/tests/array/arithmetic_ops/fetch_add_test.rs index c1a27f03..930ef56f 100644 --- a/tests/array/arithmetic_ops/fetch_add_test.rs +++ b/tests/array/arithmetic_ops/fetch_add_test.rs @@ -4,7 +4,7 @@ use lamellar::memregion::prelude::*; use rand::distributions::Distribution; use rand::distributions::Uniform; -use std::ops::Deref; +// use std::ops::Deref; macro_rules! initialize_array { (UnsafeArray,$array:ident,$init_val:ident) => { From 584e8c592d61f05ed7b14b5fe4f31d8dca491534 Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Tue, 16 Jul 2024 09:29:19 -0700 Subject: [PATCH 044/116] fixes integer overflow issue via wrapping_add, fixes #44 --- src/lamellae/command_queues.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/lamellae/command_queues.rs b/src/lamellae/command_queues.rs index e3817d95..bfd06d17 100644 --- a/src/lamellae/command_queues.rs +++ b/src/lamellae/command_queues.rs @@ -114,7 +114,11 @@ impl CmdMsg { } //#[tracing::instrument(skip_all)] fn hash(&self) -> usize { - let mut res = self.daddr + self.dsize + self.cmd as usize + self.msg_hash; + let mut res = self + .daddr + .wrapping_add(self.dsize) + .wrapping_add(self.cmd as usize) + .wrapping_add(self.msg_hash); if res == 0 { res = 1 } From da17fce36e357076684fa2ba90d8f7ddf7419f99 Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Tue, 16 Jul 2024 09:29:54 -0700 Subject: [PATCH 045/116] new envrionment variable to specify a libfabric domain --- src/env_var.rs | 12 ++++++++++++ src/lamellae.rs | 14 +++----------- src/lamellae/rofi/rofi_api.rs | 7 +++++-- src/lamellae/rofi/rofi_comm.rs | 4 ++-- src/lamellae/rofi_lamellae.rs | 4 ++-- 5 files changed, 24 insertions(+), 17 deletions(-) diff --git a/src/env_var.rs b/src/env_var.rs index dc7235be..87f6b2d4 100644 --- a/src/env_var.rs +++ b/src/env_var.rs @@ -83,6 +83,14 @@ fn default_batch_am_size() -> usize { 100000 } +fn default_rofi_provider() -> String { + "verbs".to_owned() +} + +fn default_rofi_domain() -> String { + "".to_owned() +} + #[derive(Deserialize, Debug)] pub struct Config { /// A general timeout in seconds for various operations which may indicate a deadlock, default: 600.0 seconds @@ -130,6 +138,10 @@ pub struct Config { pub cmd_buf_cnt: usize, #[serde(default = "default_batch_am_size")] pub batch_am_size: usize, //the threshold for an activemessage (in bytes) on whether it will be sent directly or aggregated + #[serde(default = "default_rofi_provider")] + pub rofi_provider: String, + #[serde(default = "default_rofi_domain")] + pub rofi_domain: String, } /// Get the current Environment Variable configuration diff --git a/src/lamellae.rs b/src/lamellae.rs index e8b70a95..8e810524 100755 --- a/src/lamellae.rs +++ b/src/lamellae.rs @@ -226,17 +226,9 @@ pub(crate) fn create_lamellae(backend: Backend) -> LamellaeBuilder { match backend { #[cfg(feature = "enable-rofi")] Backend::Rofi => { - let provider = match std::env::var("LAMELLAR_ROFI_PROVIDER") { - Ok(p) => p, - Err(_) => "verbs".to_owned(), - // Ok(p) => match p.as_str() { - // "verbs" => "verbs", - // "tcp" => "tcp", - // _ => "verbs", - // }, - // Err(_) => "verbs", - }; - LamellaeBuilder::RofiBuilder(RofiBuilder::new(&provider)) + let provider = config().rofi_provider.clone(); + let domain = config().rofi_domain.clone(); + LamellaeBuilder::RofiBuilder(RofiBuilder::new(&provider, &domain)) } Backend::Shmem => LamellaeBuilder::ShmemBuilder(ShmemBuilder::new()), Backend::Local => LamellaeBuilder::Local(Local::new()), diff --git a/src/lamellae/rofi/rofi_api.rs b/src/lamellae/rofi/rofi_api.rs index 4fb245cd..95762eb5 100644 --- a/src/lamellae/rofi/rofi_api.rs +++ b/src/lamellae/rofi/rofi_api.rs @@ -4,9 +4,12 @@ use crate::lamellae::AllocationType; use std::ffi::CString; use std::os::raw::c_ulong; -pub(crate) fn rofi_init(provider: &str) -> Result<(), &'static str> { +pub(crate) fn rofi_init(provider: &str, domain: &str) -> Result<(), &'static str> { let prov_str = CString::new(provider).unwrap(); - let retval = unsafe { rofisys::rofi_init(prov_str.as_ptr() as *mut _, 0 as *mut _) as i32 }; + 
let domain_str = CString::new(domain).unwrap(); + let retval = unsafe { + rofisys::rofi_init(prov_str.as_ptr() as *mut _, domain_str.as_ptr() as *mut _) as i32 + }; if retval == 0 { Ok(()) } else { diff --git a/src/lamellae/rofi/rofi_comm.rs b/src/lamellae/rofi/rofi_comm.rs index 616d4a93..94473cf7 100644 --- a/src/lamellae/rofi/rofi_comm.rs +++ b/src/lamellae/rofi/rofi_comm.rs @@ -51,7 +51,7 @@ pub(crate) struct RofiComm { impl RofiComm { //#[tracing::instrument(skip_all)] - pub(crate) fn new(provider: &str) -> RofiComm { + pub(crate) fn new(provider: &str, domain: &str) -> RofiComm { if let Some(size) = config().heap_size { // if let Ok(size) = std::env::var("LAMELLAR_MEM_SIZE") { // let size = size @@ -59,7 +59,7 @@ impl RofiComm { // .expect("invalid memory size, please supply size in bytes"); ROFI_MEM.store(size, Ordering::SeqCst); } - rofi_init(provider).expect("error in rofi init"); + rofi_init(provider, domain).expect("error in rofi init"); // trace!("rofi initialized"); rofi_barrier(); let num_pes = rofi_get_size(); diff --git a/src/lamellae/rofi_lamellae.rs b/src/lamellae/rofi_lamellae.rs index 71b21b6a..c9e44b2d 100644 --- a/src/lamellae/rofi_lamellae.rs +++ b/src/lamellae/rofi_lamellae.rs @@ -22,8 +22,8 @@ pub(crate) struct RofiBuilder { } impl RofiBuilder { - pub(crate) fn new(provider: &str) -> RofiBuilder { - let rofi_comm: Arc = Arc::new(RofiComm::new(provider).into()); + pub(crate) fn new(provider: &str, domain: &str) -> RofiBuilder { + let rofi_comm: Arc = Arc::new(RofiComm::new(provider, domain).into()); RofiBuilder { my_pe: rofi_comm.my_pe(), num_pes: rofi_comm.num_pes(), From 3262bba6f30c28a172238308f7dac865cf9dc185 Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Wed, 17 Jul 2024 11:42:57 -0700 Subject: [PATCH 046/116] additional warnings for calling blocking code in async contexts, updated envrionment variable documentation --- README.md | 16 +- examples/darc_examples/darc.rs | 2 +- src/active_messaging.rs | 2 +- .../registered_active_message.rs | 10 +- src/array/global_lock_atomic.rs | 56 +++++++ src/array/global_lock_atomic/rdma.rs | 2 +- src/array/local_lock_atomic.rs | 48 +++++- src/array/local_lock_atomic/local_chunks.rs | 21 +++ src/array/unsafe.rs | 11 ++ src/array/unsafe/operations.rs | 6 +- src/array/unsafe/rdma.rs | 10 +- src/barrier.rs | 8 +- src/darc.rs | 125 +++++++++++++- src/darc/global_rw_darc.rs | 145 +++++++++++++++- src/darc/local_rw_darc.rs | 158 +++++++++++++++++- src/env_var.rs | 60 ++++++- src/lamellar_task_group.rs | 17 +- src/lamellar_team.rs | 17 +- src/lib.rs | 22 +-- 19 files changed, 663 insertions(+), 73 deletions(-) diff --git a/README.md b/README.md index de734494..2aae3b6d 100644 --- a/README.md +++ b/README.md @@ -61,6 +61,8 @@ Currently the inverse is true, if it compiles and runs using `rofi` it will comp Additional information on using each of the lamellae backends can be found below in the `Running Lamellar Applications` section +# Environment Variables + Examples -------- Our repository also provides numerous examples highlighting various features of the runtime: @@ -209,20 +211,6 @@ There are a number of ways to run Lamellar applications, mostly dictated by the - ```srun -N 2 -mpi=pmi2 ./target/release/``` - `pmi2` library is required to grab info about the allocated nodes and helps set up initial handshakes -# Environment Variables -Lamellar exposes a number of environment variables that can used to control application execution at runtime -- `LAMELLAR_THREADS` - The number of worker threads used within a lamellar PE - - 
`export LAMELLAR_THREADS=10` -- `LAMELLAE_BACKEND` - the backend used during execution. Note that if a backend is explicitly set in the world builder, this variable is ignored. - - possible values - - `local` - - `shmem` - - `rofi` -- `LAMELLAR_MEM_SIZE` - Specify the initial size of the Runtime "RDMAable" memory pool. Defaults to 1GB - - `export LAMELLAR_MEM_SIZE=$((20*1024*1024*1024))` 20GB memory pool - - Internally, Lamellar utilizes memory pools of RDMAable memory for Runtime data structures (e.g. [Darcs][crate::Darc], [OneSidedMemoryRegion][crate::memregion::OneSidedMemoryRegion],etc), aggregation buffers, and message queues. Additional memory pools are dynamically allocated across the system as needed. This can be a fairly expensive operation (as the operation is synchronous across all PEs) so the runtime will print a message at the end of execution with how many additional pools were allocated. - - if you find you are dynamically allocating new memory pools, try setting `LAMELLAR_MEM_SIZE` to a larger value - - Note: when running multiple PEs on a single system, the total allocated memory for the pools would be equal to `LAMELLAR_MEM_SIZE * number of processes` NEWS ---- diff --git a/examples/darc_examples/darc.rs b/examples/darc_examples/darc.rs index db50e19b..334f2c7a 100644 --- a/examples/darc_examples/darc.rs +++ b/examples/darc_examples/darc.rs @@ -122,7 +122,7 @@ fn main() { // drop(darc2); // drop(wrapped); println!("changing darc type"); - let ro_darc = global_darc.into_localrw().into_darc(); // we can call into_darc directly on global_Darc, but string the operations for testing purposes + let ro_darc = global_darc.blocking_into_localrw().blocking_into_darc(); // we can call into_darc directly on global_Darc, but string the operations for testing purposes println!("read only darc"); ro_darc.print(); println!("done"); diff --git a/src/active_messaging.rs b/src/active_messaging.rs index 8e44bbab..75f62497 100644 --- a/src/active_messaging.rs +++ b/src/active_messaging.rs @@ -662,7 +662,7 @@ pub(crate) mod batching; pub(crate) mod handle; pub use handle::*; -// pub(crate) const BATCH_AM_SIZE: usize = 100_000; +// pub(crate) const am_size_threshold: usize = 100_000; /// This macro is used to setup the attributed type so that it can be used within remote active messages. 
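[Editor's usage note] The example change above (examples/darc_examples/darc.rs) switches to the new blocking_into_* names that this patch introduces. A minimal sketch of both calling conventions, pieced together from the doc examples added later in this patch (the value 5 and the final print are arbitrary):

```rust
use lamellar::darc::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();

    // On the main (non-worker) thread the blocking_* variants are appropriate.
    let five = Darc::new(&world, 5).expect("PE in world team");
    let five_as_localrw = five.blocking_into_localrw();

    // Inside an async context, prefer the awaitable conversions so the runtime
    // does not emit the new blocking-call warning (or risk a deadlock).
    let five_as_darc = world.block_on(async move { five_as_localrw.into_darc().await });
    five_as_darc.print();
}
```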
/// diff --git a/src/active_messaging/registered_active_message.rs b/src/active_messaging/registered_active_message.rs index 10a20c79..67962333 100644 --- a/src/active_messaging/registered_active_message.rs +++ b/src/active_messaging/registered_active_message.rs @@ -117,7 +117,7 @@ impl ActiveMessageEngine for RegisteredActiveMessages { self.executor.submit_io_task(async move { //spawn a task so that we can the execute the local am immediately // println!(" {} {} {}, {}, {}",req_data.team.lamellae.backend() != Backend::Local,req_data.team.num_pes() > 1, req_data.team.team_pe_id().is_err(),(req_data.team.num_pes() > 1 || req_data.team.team_pe_id().is_err()),req_data.team.lamellae.backend() != Backend::Local && (req_data.team.num_pes() > 1 || req_data.team.team_pe_id().is_err()) ); - if am_size < config().batch_am_size && !immediate { + if am_size < config().am_size_threshold && !immediate { ame.batcher .add_remote_am_to_batch( req_data_clone.clone(), @@ -155,7 +155,7 @@ impl ActiveMessageEngine for RegisteredActiveMessages { } else { let am_id = *(AMS_IDS.get(&am.get_id()).unwrap()); let am_size = am.serialized_size(); - if am_size < config().batch_am_size && !immediate { + if am_size < config().am_size_threshold && !immediate { self.batcher .add_remote_am_to_batch(req_data, am, am_id, am_size, stall_mark) .await; @@ -179,7 +179,7 @@ impl ActiveMessageEngine for RegisteredActiveMessages { // println!("Am::Return"); let am_id = *(AMS_IDS.get(&am.get_id()).unwrap()); let am_size = am.serialized_size(); - if am_size < config().batch_am_size && !immediate { + if am_size < config().am_size_threshold && !immediate { self.batcher .add_return_am_to_batch(req_data, am, am_id, am_size, stall_mark) .await; @@ -197,7 +197,7 @@ impl ActiveMessageEngine for RegisteredActiveMessages { Am::Data(req_data, data) => { // println!("Am::Data"); let data_size = data.serialized_size(); - if data_size < config().batch_am_size && !immediate { + if data_size < config().am_size_threshold && !immediate { self.batcher .add_data_am_to_batch(req_data, data, data_size, stall_mark) .await; @@ -207,7 +207,7 @@ impl ActiveMessageEngine for RegisteredActiveMessages { } } Am::Unit(req_data) => { - if *UNIT_HEADER_LEN < config().batch_am_size && !immediate { + if *UNIT_HEADER_LEN < config().am_size_threshold && !immediate { self.batcher .add_unit_am_to_batch(req_data, stall_mark) .await; diff --git a/src/array/global_lock_atomic.rs b/src/array/global_lock_atomic.rs index ac05b10a..91da9262 100644 --- a/src/array/global_lock_atomic.rs +++ b/src/array/global_lock_atomic.rs @@ -4,6 +4,7 @@ mod rdma; use crate::array::private::LamellarArrayPrivate; use crate::array::r#unsafe::{UnsafeByteArray, UnsafeByteArrayWeak}; use crate::array::*; +use crate::config; use crate::darc::global_rw_darc::{ GlobalRwDarc, GlobalRwDarcCollectiveWriteGuard, GlobalRwDarcReadGuard, GlobalRwDarcWriteGuard, }; @@ -371,6 +372,17 @@ impl GlobalLockArray { /// ///``` pub fn blocking_read_lock(&self) -> GlobalLockReadGuard { + if std::thread::current().id() != *crate::MAIN_THREAD { + if let Some(val) = config().blocking_call_warning { + if val { + println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_read_lock` from within an async context which may lead to deadlock, it is recommended that you use `read_lock().await;` instead! 
+ Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture()); + } + } else { + println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_read_lock` from within an async context which may lead to deadlock, it is recommended that you use `read_lock().await;` instead! + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture()); + } + } let self_clone: GlobalLockArray = self.clone(); self.block_on(async move { GlobalLockReadGuard { @@ -429,6 +441,17 @@ impl GlobalLockArray { /// ///``` pub fn blocking_write_lock(&self) -> GlobalLockWriteGuard { + if std::thread::current().id() != *crate::MAIN_THREAD { + if let Some(val) = config().blocking_call_warning { + if val { + println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_write_lock` from within an async context which may lead to deadlock, it is recommended that you use `write_lock().await;` instead! + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture()); + } + } else { + println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_write_lock` from within an async context which may lead to deadlock, it is recommended that you use `write_lock().await;` instead! + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture()); + } + } let self_clone: GlobalLockArray = self.clone(); self.block_on(async move { GlobalLockWriteGuard { @@ -487,6 +510,17 @@ impl GlobalLockArray { /// println!("PE{my_pe} data: {local_data:?}"); ///``` pub fn blocking_read_local_data(&self) -> GlobalLockLocalData { + if std::thread::current().id() != *crate::MAIN_THREAD { + if let Some(val) = config().blocking_call_warning { + if val { + println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_read_local_data` from within an async context which may lead to deadlock, it is recommended that you use `read_local_data().await;` instead! + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture()); + } + } else { + println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_read_local_data` from within an async context which may lead to deadlock, it is recommended that you use `read_local_data().await;` instead! + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture()); + } + } let self_clone: GlobalLockArray = self.clone(); self.block_on(async move { GlobalLockLocalData { @@ -553,6 +587,17 @@ impl GlobalLockArray { /// println!("PE{my_pe} data: {local_data:?}"); ///``` pub fn blocking_write_local_data(&self) -> GlobalLockMutLocalData { + if std::thread::current().id() != *crate::MAIN_THREAD { + if let Some(val) = config().blocking_call_warning { + if val { + println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_write_local_data` from within an async context which may lead to deadlock, it is recommended that you use `write_local_data().await;` instead! 
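[Editor's note] The same thread-check guard is inlined at the top of every blocking_* entry point in this patch; the array methods use an if/else form while the darc and chunk methods use a match form (as written, the match form prints in both arms, so only the if/else form actually suppresses the message when LAMELLAR_BLOCKING_CALL_WARNING=0). Distilled into a free function, the check looks roughly like the sketch below; the helper and its parameters are hypothetical stand-ins for the crate-internal MAIN_THREAD static and config().blocking_call_warning:

```rust
use std::backtrace::Backtrace;
use std::thread::{self, ThreadId};

// Hypothetical helper distilling the guard this patch repeats inline.
// `main_thread` stands in for the crate's MAIN_THREAD static and
// `blocking_call_warning` for the parsed LAMELLAR_BLOCKING_CALL_WARNING value.
fn warn_if_blocking_on_worker(
    main_thread: ThreadId,
    blocking_call_warning: Option<bool>,
    blocking_name: &str,
    async_name: &str,
) {
    // Only warn off the main thread, and only if the user has not explicitly
    // disabled the warning with LAMELLAR_BLOCKING_CALL_WARNING=0.
    if thread::current().id() != main_thread && blocking_call_warning != Some(false) {
        println!(
            "[LAMELLAR WARNING] You are calling `{blocking_name}` from within an async context \
             which may lead to deadlock, it is recommended that you use `{async_name}` instead! \
             Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, set \
             RUST_LIB_BACKTRACE=1 to see where the call is occurring: {:?}",
            Backtrace::capture()
        );
    }
}
```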
+ Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture()); + } + } else { + println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_write_local_data` from within an async context which may lead to deadlock, it is recommended that you use `write_local_data().await;` instead! + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture()); + } + } let self_clone: GlobalLockArray = self.clone(); self.block_on(async move { let lock = self_clone.lock.write().await; @@ -621,6 +666,17 @@ impl GlobalLockArray { /// println!("PE{my_pe} data: {local_data:?}"); ///``` pub fn blocking_collective_write_local_data(&self) -> GlobalLockCollectiveMutLocalData { + if std::thread::current().id() != *crate::MAIN_THREAD { + if let Some(val) = config().blocking_call_warning { + if val { + println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_collective_write_local_data` from within an async context which may lead to deadlock, it is recommended that you use `collective_write_local_data().await;` instead! + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture()); + } + } else { + println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_collective_write_local_data` from within an async context which may lead to deadlock, it is recommended that you use `collective_write_local_data().await;` instead! + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture()); + } + } let self_clone: GlobalLockArray = self.clone(); self.block_on(async move { let lock = self_clone.lock.collective_write().await; diff --git a/src/array/global_lock_atomic/rdma.rs b/src/array/global_lock_atomic/rdma.rs index d7621f89..dbbd7fb6 100644 --- a/src/array/global_lock_atomic/rdma.rs +++ b/src/array/global_lock_atomic/rdma.rs @@ -216,7 +216,7 @@ impl LamellarAm for InitPutAm { self.buf.len(), ) { let u8_buf_len = len * std::mem::size_of::(); - if u8_buf_len > config().batch_am_size { + if u8_buf_len > config().am_size_threshold { // println!("pe {:?} index: {:?} len {:?} buflen {:?} putting {:?}",pe,self.index,len, self.buf.len(),&u8_buf.as_slice().unwrap()[cur_index..(cur_index+u8_buf_len)]); let remote_am = GlobalLockRemotePutAm { array: self.array.clone().into(), //inner of the indices we need to place data into diff --git a/src/array/local_lock_atomic.rs b/src/array/local_lock_atomic.rs index cf127730..bc885bb7 100644 --- a/src/array/local_lock_atomic.rs +++ b/src/array/local_lock_atomic.rs @@ -6,6 +6,7 @@ mod rdma; use crate::array::private::LamellarArrayPrivate; use crate::array::r#unsafe::{UnsafeByteArray, UnsafeByteArrayWeak}; use crate::array::*; +use crate::config; use crate::darc::local_rw_darc::LocalRwDarc; use crate::darc::DarcMode; use crate::lamellar_request::LamellarRequest; @@ -345,6 +346,17 @@ impl LocalLockArray { /// //do interesting work /// pub fn blocking_read_lock(&self) -> LocalLockReadGuard { + if std::thread::current().id() != *crate::MAIN_THREAD { + if let Some(val) = config().blocking_call_warning { + if val { + println!("[LAMELLAR WARNING] You are calling `LocalLockArray::blocking_read_lock` from within an async context which may lead 
to deadlock, it is recommended that you use `read_lock().await;` instead! + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture()); + } + } else { + println!("[LAMELLAR WARNING] You are calling `LocalLockArray::blocking_read_lock` from within an async context which may lead to deadlock, it is recommended that you use `read_lock().await;` instead! + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture()); + } + } let self_clone: LocalLockArray = self.clone(); self.block_on(async move { LocalLockReadGuard { @@ -402,6 +414,17 @@ impl LocalLockArray { /// //do interesting work /// pub fn blocking_write_lock(&self) -> LocalLockWriteGuard { + if std::thread::current().id() != *crate::MAIN_THREAD { + if let Some(val) = config().blocking_call_warning { + if val { + println!("[LAMELLAR WARNING] You are calling `LocalLockArray::blocking_write_lock` from within an async context which may lead to deadlock, it is recommended that you use `write_lock().await;` instead! + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture()); + } + } else { + println!("[LAMELLAR WARNING] You are calling `LocalLockArray::blocking_write_lock` from within an async context which may lead to deadlock, it is recommended that you use `write_lock().await;` instead! + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture()); + } + } let self_clone: LocalLockArray = self.clone(); self.block_on(async move { LocalLockWriteGuard { @@ -457,7 +480,17 @@ impl LocalLockArray { /// println!("PE{my_pe} data: {local_data:?}"); ///``` pub fn blocking_read_local_data(&self) -> LocalLockLocalData { - // println!("getting read lock in read_local_local"); + if std::thread::current().id() != *crate::MAIN_THREAD { + if let Some(val) = config().blocking_call_warning { + if val { + println!("[LAMELLAR WARNING] You are calling `LocalLockArray::blocking_read_local_data` from within an async context which may lead to deadlock, it is recommended that you use `read_local_data().await;` instead! + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture()); + } + } else { + println!("[LAMELLAR WARNING] You are calling `LocalLockArray::blocking_read_local_data` from within an async context which may lead to deadlock, it is recommended that you use `read_local_data().await;` instead! + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture()); + } + } let self_clone: LocalLockArray = self.clone(); self.block_on(async move { LocalLockLocalData { @@ -470,7 +503,6 @@ impl LocalLockArray { }) } - /// TODO: UPDATE /// Return the calling PE's local data as a [LocalLockLocalData], which allows safe immutable access to local elements. 
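[Editor's usage note] A minimal sketch contrasting the blocking and awaitable local-data accessors on a LocalLockArray, following the constructor and printing style of the doc examples in this patch; writing my_pe into each element is arbitrary, and iter_mut on the mutable guard assumes it derefs to a slice as elsewhere in the array API:

```rust
use lamellar::array::prelude::*;

fn main() {
    let world = lamellar::LamellarWorldBuilder::new().build();
    let my_pe = world.my_pe();
    let array = LocalLockArray::<usize>::new(&world, 100, Distribution::Block);

    // Main thread: the blocking accessors are fine here.
    let local_data = array.blocking_read_local_data();
    println!("PE{my_pe} initial data: {local_data:?}");
    drop(local_data); // release the read lock before taking the write lock

    // Worker thread / async context: use the awaitable accessors instead,
    // otherwise the new warning fires (and a deadlock becomes possible).
    world.block_on(async move {
        let mut local_data = array.write_local_data().await;
        for elem in local_data.iter_mut() {
            *elem = my_pe;
        }
    });
    world.barrier();
}
```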
/// /// Calling this function will result in a local read lock being captured on the array @@ -520,7 +552,17 @@ impl LocalLockArray { /// println!("PE{my_pe} data: {local_data:?}"); ///``` pub fn blocking_write_local_data(&self) -> LocalLockMutLocalData { - // println!("getting write lock in write_local_data"); + if std::thread::current().id() != *crate::MAIN_THREAD { + if let Some(val) = config().blocking_call_warning { + if val { + println!("[LAMELLAR WARNING] You are calling `LocalLockArray::blocking_write_local_data` from within an async context which may lead to deadlock, it is recommended that you use `write_local_data().await;` instead! + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture()); + } + } else { + println!("[LAMELLAR WARNING] You are calling `LocalLockArray::blocking_write_local_data` from within an async context which may lead to deadlock, it is recommended that you use `write_local_data().await;` instead! + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture()); + } + } let self_clone: LocalLockArray = self.clone(); self.block_on(async move { let lock = self_clone.lock.write().await; diff --git a/src/array/local_lock_atomic/local_chunks.rs b/src/array/local_lock_atomic/local_chunks.rs index 5d4328e5..db155cfc 100644 --- a/src/array/local_lock_atomic/local_chunks.rs +++ b/src/array/local_lock_atomic/local_chunks.rs @@ -2,6 +2,7 @@ use crate::array::iterator::local_iterator::{IndexedLocalIterator, LocalIterator use crate::array::iterator::private::*; use crate::array::local_lock_atomic::*; use crate::array::LamellarArray; +use crate::config; use crate::memregion::Dist; use std::sync::Arc; @@ -265,6 +266,16 @@ impl LocalLockArray { /// /// ``` pub fn blocking_read_local_chunks(&self, chunk_size: usize) -> LocalLockLocalChunks { + if std::thread::current().id() != *crate::MAIN_THREAD { + let msg = format!(" + [LAMELLAR WARNING] You are calling `LocalLockArray::blocking_read_local_chunks` from within an async context which may lead to deadlock, it is recommended that you use `read_local_chunks().await;` instead! + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + ); + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } + } let lock = Arc::new(self.array.block_on(self.lock.read())); LocalLockLocalChunks { chunk_size, @@ -326,6 +337,16 @@ impl LocalLockArray { /// /// ``` pub fn blocking_write_local_chunks(&self, chunk_size: usize) -> LocalLockLocalChunksMut { + if std::thread::current().id() != *crate::MAIN_THREAD { + let msg = format!(" + [LAMELLAR WARNING] You are calling `LocalLockArray::blocking_write_local_chunks` from within an async context which may lead to deadlock, it is recommended that you use `write_local_chunks().await;` instead! 
+ Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + ); + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } + } let lock = Arc::new(self.array.block_on(self.lock.write())); LocalLockLocalChunksMut { chunk_size, diff --git a/src/array/unsafe.rs b/src/array/unsafe.rs index 846d7d09..8420027e 100644 --- a/src/array/unsafe.rs +++ b/src/array/unsafe.rs @@ -808,6 +808,17 @@ impl AsyncTeamFrom<(Vec, Distribution)> for UnsafeArray TeamFrom<(&Vec, Distribution)> for UnsafeArray { fn team_from(input: (&Vec, Distribution), team: &Pin>) -> Self { + if std::thread::current().id() != *crate::MAIN_THREAD { + let msg = format!(" + [LAMELLAR WARNING] You are calling `Array::team_from` from within an async context which may lead to deadlock, this is unintended and likely a Runtime bug. + Please open a github issue at https://github.com/pnnl/lamellar-runtime/issues including a backtrace if possible. + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + ); + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } + } let (local_vals, distribution) = input; // println!("local_vals len: {:?}", local_vals.len()); team.tasking_barrier(); diff --git a/src/array/unsafe/operations.rs b/src/array/unsafe/operations.rs index b7e9434b..0ef8d9ee 100644 --- a/src/array/unsafe/operations.rs +++ b/src/array/unsafe/operations.rs @@ -487,7 +487,7 @@ impl UnsafeArray { index_size: IndexSize, ) -> VecDeque<(AmHandle, Vec)> { let num_per_batch = - (config().batch_am_size as f32 / index_size.len() as f32).ceil() as usize; + (config().am_size_threshold as f32 / index_size.len() as f32).ceil() as usize; let num_pes = self.inner.data.team.num_pes(); // let my_pe = self.inner.data.team.my_pe(); @@ -604,7 +604,7 @@ impl UnsafeArray { _index_size: IndexSize, ) -> VecDeque<(AmHandle, Vec)> { let num_per_batch = - (config().batch_am_size as f32 / std::mem::size_of::() as f32).ceil() as usize; + (config().am_size_threshold as f32 / std::mem::size_of::() as f32).ceil() as usize; // println!("multi_val_one_index"); let cnt = Arc::new(AtomicUsize::new(0)); @@ -687,7 +687,7 @@ impl UnsafeArray { IndexSize::U64 => std::mem::size_of::>(), IndexSize::Usize => std::mem::size_of::>(), }; - let num_per_batch = (config().batch_am_size as f32 / idx_val_bytes as f32).ceil() as usize; + let num_per_batch = (config().am_size_threshold as f32 / idx_val_bytes as f32).ceil() as usize; let bytes_per_batch = num_per_batch * idx_val_bytes; let num_pes = self.inner.data.team.num_pes(); diff --git a/src/array/unsafe/rdma.rs b/src/array/unsafe/rdma.rs index 2d531925..3019602e 100644 --- a/src/array/unsafe/rdma.rs +++ b/src/array/unsafe/rdma.rs @@ -87,7 +87,7 @@ impl UnsafeArray { // unsafe{ // println!("{:?} {:?},",buf.clone().to_base::().as_slice(), buf.sub_region(buf_index..(buf_index + len)).to_base::().as_slice()); // } - if buf.len() * std::mem::size_of::() > config().batch_am_size { + if buf.len() * std::mem::size_of::() > config().am_size_threshold { let am = UnsafePutAm { array: self.clone().into(), start_index: index, @@ -117,7 +117,7 @@ impl UnsafeArray { } } ArrayRdmaCmd::GetAm => { - // if buf.len()*std::mem::size_of::() > config().batch_am_size{ + // if buf.len()*std::mem::size_of::() > 
config().am_size_threshold{ let am = UnsafeBlockGetAm { array: self.clone().into(), offset: offset, @@ -211,7 +211,7 @@ impl UnsafeArray { // println!("{:?}",temp_memreg.clone().to_base::().as_slice()); // println!("si: {:?} ei {:?}",offset,offset+k); - if buf.len() * std::mem::size_of::() > config().batch_am_size { + if buf.len() * std::mem::size_of::() > config().am_size_threshold { let am = UnsafePutAm { array: self.clone().into(), start_index: index, @@ -283,7 +283,7 @@ impl UnsafeArray { } } ArrayRdmaCmd::GetAm => { - // if buf.len()*std::mem::size_of::() > config().batch_am_size{ + // if buf.len()*std::mem::size_of::() > config().am_size_threshold{ let rem = buf.len() % num_pes; for i in 0..std::cmp::min(buf.len(), num_pes) { let temp_memreg = self @@ -716,7 +716,7 @@ impl LamellarArrayInternalGet for UnsafeArray { buf: U, ) -> ArrayRdmaHandle { let buf = buf.into(); - let reqs = if buf.len() * std::mem::size_of::() > config().batch_am_size { + let reqs = if buf.len() * std::mem::size_of::() > config().am_size_threshold { match self.inner.distribution { Distribution::Block => self.block_op(ArrayRdmaCmd::GetAm, index, buf), Distribution::Cyclic => self.cyclic_op(ArrayRdmaCmd::GetAm, index, buf), diff --git a/src/barrier.rs b/src/barrier.rs index 14e10713..9ab3ca7f 100644 --- a/src/barrier.rs +++ b/src/barrier.rs @@ -275,14 +275,14 @@ impl Barrier { self.scheduler.exec_task(); }); } else { - if let Some(val) = config().barrier_warning { - // std::env::var("LAMELLAR_BARRIER_WARNING") { + if let Some(val) = config().blocking_call_warning { + // std::env::var("LAMELLAR_BLOCKING_CALL_WARNING") { // if val != "0" && val != "false" && val != "no" && val != "off" { if val { - println!("[LAMELLAR WARNING] You are calling barrier from within an async context, this is experimental and may result in deadlock! Using 'async_barrier().await;' is likely a better choice. Set LAMELLAR_BARRIER_WARNING=0 to disable this warning"); + println!("[LAMELLAR WARNING] You are calling barrier from within an async context, this is experimental and may result in deadlock! Using 'async_barrier().await;' is likely a better choice. Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning"); } } else { - println!("[LAMELLAR WARNING] You are calling barrier from within an async context), this is experimental and may result in deadlock! Using 'async_barrier().await;' is likely a better choice. Set LAMELLAR_BARRIER_WARNING=0 to disable this warning"); + println!("[LAMELLAR WARNING] You are calling barrier from within an async context), this is experimental and may result in deadlock! Using 'async_barrier().await;' is likely a better choice. 
Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning"); } self.tasking_barrier() } diff --git a/src/darc.rs b/src/darc.rs index ce768f70..1302f32c 100644 --- a/src/darc.rs +++ b/src/darc.rs @@ -1318,9 +1318,68 @@ impl Darc { /// let world = LamellarWorldBuilder::new().build(); /// /// let five = Darc::new(&world,5).expect("PE in world team"); - /// let five_as_localdarc = five.into_localrw(); + /// let five_as_localdarc = world.block_on(async move {five.into_localrw().await}); /// ``` - pub fn into_localrw(self) -> LocalRwDarc { + pub async fn into_localrw(self) -> LocalRwDarc { + let inner = self.inner(); + let _cur_pe = inner.team().world_pe; + DarcInner::block_on_outstanding( + WrappedInner { + inner: NonNull::new(self.inner as *mut DarcInner).expect("invalid darc pointer"), + }, + DarcMode::LocalRw, + 0, + ) + .await; + inner.local_cnt.fetch_add(1, Ordering::SeqCst); //we add this here because to account for moving inner into d + inner.total_local_cnt.fetch_add(1, Ordering::SeqCst); + // println! {"[{:?}] darc[{:?}] into_localrw {:?} {:?} {:?}",std::thread::current().id(),self.inner().id,self.inner,self.inner().local_cnt.load(Ordering::SeqCst),self.inner().total_local_cnt.load(Ordering::SeqCst)}; + let item = unsafe { *Box::from_raw(inner.item as *mut T) }; + + let d = Darc { + inner: self.inner as *mut DarcInner>>, + src_pe: self.src_pe, + }; + d.inner_mut() + .update_item(Box::into_raw(Box::new(Arc::new(RwLock::new(item))))); + // d.print(); + LocalRwDarc { darc: d } + } + + #[doc(alias = "Collective")] + /// Converts this Darc into a [LocalRwDarc] + /// + /// This is a blocking collective call amongst all PEs in the Darc's team, only returning once every PE in the team has completed the call. + /// + /// Furthermore, this call will block while any additional references outside of the one making this call exist on each PE. It is not possible for the + /// pointed to object to wrapped by both a Darc and a LocalRwDarc simultaneously (on any PE). + /// + /// # Collective Operation + /// Requires all PEs associated with the `darc` to enter the call otherwise deadlock will occur (i.e. team barriers are being called internally) + /// + /// # Examples + /// ``` + /// use lamellar::darc::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// + /// let five = Darc::new(&world,5).expect("PE in world team"); + /// let five_as_localdarc = five.blocking_into_localrw(); + /// ``` + pub fn blocking_into_localrw(self) -> LocalRwDarc { + if std::thread::current().id() != *crate::MAIN_THREAD { + let msg = format!(" + [LAMELLAR WARNING] You are calling `Darc::blocking_into_localrw` from within an async context which may lead to deadlock, it is recommended that you use `into_localrw().await;` instead! 
+ Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + ); + if let Some(val) = config().blocking_call_warning { + if val { + println!("{msg}"); + } + } else { + println!("{msg}"); + } + } let inner = self.inner(); let _cur_pe = inner.team().world_pe; inner.team().block_on(DarcInner::block_on_outstanding( @@ -1363,9 +1422,67 @@ impl Darc { /// let world = LamellarWorldBuilder::new().build(); /// /// let five = Darc::new(&world,5).expect("PE in world team"); - /// let five_as_globaldarc = five.into_globalrw(); + /// let five_as_globaldarc = world.block_on(async move {five.into_globalrw().await}); /// ``` - pub fn into_globalrw(self) -> GlobalRwDarc { + pub async fn into_globalrw(self) -> GlobalRwDarc { + let inner = self.inner(); + let _cur_pe = inner.team().world_pe; + DarcInner::block_on_outstanding( + WrappedInner { + inner: NonNull::new(self.inner as *mut DarcInner).expect("invalid darc pointer"), + }, + DarcMode::GlobalRw, + 0, + ) + .await; + inner.local_cnt.fetch_add(1, Ordering::SeqCst); //we add this here because to account for moving inner into d + inner.total_local_cnt.fetch_add(1, Ordering::SeqCst); + // println! {"[{:?}] darc[{:?}] into_globalrw {:?} {:?} {:?}",std::thread::current().id(),self.inner().id,self.inner,self.inner().local_cnt.load(Ordering::SeqCst),self.inner().total_local_cnt.load(Ordering::SeqCst)}; + + let item = unsafe { Box::from_raw(inner.item as *mut T) }; + let d = Darc { + inner: self.inner as *mut DarcInner>, + src_pe: self.src_pe, + }; + d.inner_mut() + .update_item(Box::into_raw(Box::new(DistRwLock::new( + *item, + self.inner().team(), + )))); + GlobalRwDarc { darc: d } + } + + #[doc(alias = "Collective")] + /// Converts this Darc into a [GlobalRwDarc] + /// + /// This is a blocking collective call amongst all PEs in the Darc's team, only returning once every PE in the team has completed the call. + /// + /// Furthermore, this call will block while any additional references outside of the one making this call exist on each PE. It is not possible for the + /// pointed to object to wrapped by both a GlobalRwDarc and a Darc simultaneously (on any PE). + /// + /// # Collective Operation + /// Requires all PEs associated with the `darc` to enter the call otherwise deadlock will occur (i.e. team barriers are being called internally) + /// + /// # Examples + /// ``` + /// use lamellar::darc::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// + /// let five = Darc::new(&world,5).expect("PE in world team"); + /// let five_as_globaldarc = five.blocking_into_globalrw(); + /// ``` + pub fn blocking_into_globalrw(self) -> GlobalRwDarc { + if std::thread::current().id() != *crate::MAIN_THREAD { + let msg = format!(" + [LAMELLAR WARNING] You are calling `Darc::blocking_into_globalrw` from within an async context which may lead to deadlock, it is recommended that you use `into_globalrw().await;` instead! 
+ Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + ); + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } + } let inner = self.inner(); let _cur_pe = inner.team().world_pe; inner.team().block_on(DarcInner::block_on_outstanding( diff --git a/src/darc/global_rw_darc.rs b/src/darc/global_rw_darc.rs index 1c3e7aed..b74c4b49 100644 --- a/src/darc/global_rw_darc.rs +++ b/src/darc/global_rw_darc.rs @@ -8,6 +8,7 @@ use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; use crate::active_messaging::RemotePtr; +use crate::config; use crate::darc::local_rw_darc::LocalRwDarc; use crate::darc::{Darc, DarcInner, DarcMode, WrappedInner, __NetworkDarc}; use crate::lamellae::LamellaeRDMA; @@ -758,6 +759,16 @@ impl GlobalRwDarc { /// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); ///``` pub fn blocking_read(&self) -> GlobalRwDarcReadGuard { + if std::thread::current().id() != *crate::MAIN_THREAD { + let msg = format!(" + [LAMELLAR WARNING] You are calling `GlobalRwDarc::blocking_read` from within an async context which may lead to deadlock, it is recommended that you use `read().await;` instead! + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + ); + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } + } // println!("read"); let inner = self.inner(); @@ -812,7 +823,16 @@ impl GlobalRwDarc { /// *guard += my_pe; ///``` pub fn blocking_write(&self) -> GlobalRwDarcWriteGuard { - // println!("write"); + if std::thread::current().id() != *crate::MAIN_THREAD { + let msg = format!(" + [LAMELLAR WARNING] You are calling `GlobalRwDarc::blocking_write` from within an async context which may lead to deadlock, it is recommended that you use `write().await;` instead! + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + ); + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } + } let inner = self.inner(); let team = inner.team(); let remote_rwlock_addr = team.lamellae.remote_addr( @@ -880,7 +900,16 @@ impl GlobalRwDarc { /// world.barrier(); //at this point all updates will have been performed ///``` pub fn blocking_collective_write(&self) -> GlobalRwDarcCollectiveWriteGuard { - // println!("async write"); + if std::thread::current().id() != *crate::MAIN_THREAD { + let msg = format!(" + [LAMELLAR WARNING] You are calling `GlobalRwDarc::blocking_collective_write` from within an async context which may lead to deadlock, it is recommended that you use `collective_write().await;` instead! 
+ Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + ); + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } + } let inner = self.inner(); let team = inner.team(); let remote_rwlock_addr = team.lamellae.remote_addr( @@ -952,6 +981,49 @@ impl GlobalRwDarc { // }) // } + #[doc(alias = "Collective")] + /// Converts this GlobalRwDarc into a regular [Darc] + /// + /// This is a blocking collective call amongst all PEs in the GlobalRwDarc's team, only returning once every PE in the team has completed the call. + /// + /// Furthermore, this call will block while any additional references outside of the one making this call exist on each PE. It is not possible for the + /// pointed to object to wrapped by both a Darc and a GlobalRwDarc simultaneously (on any PE). + /// + /// # Collective Operation + /// Requires all PEs associated with the `darc` to enter the call otherwise deadlock will occur (i.e. team barriers are being called internally) + /// + /// # Examples + /// ``` + /// use lamellar::darc::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// + /// let five = GlobalRwDarc::new(&world,5).expect("PE in world team"); + /// let five_as_darc = world.block_on(async move {five.into_darc()}); + /// ``` + pub async fn into_darc(self) -> Darc { + let inner = self.inner(); + // println!("into_darc"); + // self.print(); + DarcInner::block_on_outstanding( + WrappedInner { + inner: NonNull::new(self.darc.inner as *mut DarcInner) + .expect("invalid darc pointer"), + }, + DarcMode::Darc, + 0, + ) + .await; + inner.local_cnt.fetch_add(1, Ordering::SeqCst); //we add this here because to account for moving inner into d + let item = unsafe { Box::from_raw(inner.item as *mut DistRwLock).into_inner() }; + let d = Darc { + inner: self.darc.inner as *mut DarcInner, + src_pe: self.darc.src_pe, + // phantom: PhantomData, + }; + d.inner_mut().update_item(Box::into_raw(Box::new(item))); + d + } #[doc(alias = "Collective")] /// Converts this GlobalRwDarc into a regular [Darc] /// @@ -972,7 +1044,17 @@ impl GlobalRwDarc { /// let five = GlobalRwDarc::new(&world,5).expect("PE in world team"); /// let five_as_darc = five.into_darc(); /// ``` - pub fn into_darc(self) -> Darc { + pub fn blocking_into_darc(self) -> Darc { + if std::thread::current().id() != *crate::MAIN_THREAD { + let msg = format!(" + [LAMELLAR WARNING] You are calling `GlobalRwDarc::blocking_into_darc` from within an async context which may lead to deadlock, it is recommended that you use `into_darc().await;` instead! + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + ); + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } + } let inner = self.inner(); // println!("into_darc"); // self.print(); @@ -995,6 +1077,51 @@ impl GlobalRwDarc { d } + #[doc(alias = "Collective")] + /// Converts this GlobalRwDarc into a [LocalRwDarc] + /// + /// This is a blocking collective call amongst all PEs in the GlobalRwDarc's team, only returning once every PE in the team has completed the call. + /// + /// Furthermore, this call will block while any additional references outside of the one making this call exist on each PE. 
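[Editor's usage note] Pulling the GlobalRwDarc doc examples above together, a short sketch of the two calling styles (the counter value and the arithmetic are arbitrary):

```rust
use lamellar::darc::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();
    let my_pe = world.my_pe();
    let counter = GlobalRwDarc::new(&world, 0usize).expect("PE in world team");

    // Main thread: the blocking variants are fine.
    let mut guard = counter.blocking_write();
    *guard += my_pe;
    drop(guard); // release the distributed write lock

    // Async context: use the awaitable variants to avoid the new warning.
    world.block_on(async move {
        let guard = counter.read().await;
        println!("current counter value on pe {my_pe} = {}", *guard);
    });
    world.barrier();
}
```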
It is not possible for the + /// pointed to object to wrapped by both a GlobalRwDarc and a LocalRwDarc simultaneously (on any PE). + /// + /// # Collective Operation + /// Requires all PEs associated with the `darc` to enter the call otherwise deadlock will occur (i.e. team barriers are being called internally) + /// + /// # Examples + /// ``` + /// use lamellar::darc::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// + /// let five = GlobalRwDarc::new(&world,5).expect("PE in world team"); + /// let five_as_localdarc = world.block_on(async move {five.into_localrw()}); + /// ``` + pub async fn into_localrw(self) -> LocalRwDarc { + let inner = self.inner(); + // println!("into_localrw"); + // self.print(); + DarcInner::block_on_outstanding( + WrappedInner { + inner: NonNull::new(self.darc.inner as *mut DarcInner) + .expect("invalid darc pointer"), + }, + DarcMode::LocalRw, + 0, + ) + .await; + inner.local_cnt.fetch_add(1, Ordering::SeqCst); //we add this here because to account for moving inner into d + let item = unsafe { Box::from_raw(inner.item as *mut DistRwLock).into_inner() }; + let d = Darc { + inner: self.darc.inner as *mut DarcInner>>, + src_pe: self.darc.src_pe, + // phantom: PhantomData, + }; + d.inner_mut() + .update_item(Box::into_raw(Box::new(Arc::new(RwLock::new(item))))); + LocalRwDarc { darc: d } + } + #[doc(alias = "Collective")] /// Converts this GlobalRwDarc into a [LocalRwDarc] /// @@ -1015,7 +1142,17 @@ impl GlobalRwDarc { /// let five = GlobalRwDarc::new(&world,5).expect("PE in world team"); /// let five_as_localdarc = five.into_localrw(); /// ``` - pub fn into_localrw(self) -> LocalRwDarc { + pub fn blocking_into_localrw(self) -> LocalRwDarc { + if std::thread::current().id() != *crate::MAIN_THREAD { + let msg = format!(" + [LAMELLAR WARNING] You are calling `GlobalRwDarc::blocking_into_localrw` from within an async context which may lead to deadlock, it is recommended that you use `into_localrw().await;` instead! + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + ); + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } + } let inner = self.inner(); // println!("into_localrw"); // self.print(); diff --git a/src/darc/local_rw_darc.rs b/src/darc/local_rw_darc.rs index 978ada91..379ffb80 100644 --- a/src/darc/local_rw_darc.rs +++ b/src/darc/local_rw_darc.rs @@ -10,6 +10,7 @@ use std::sync::atomic::Ordering; use std::sync::Arc; use crate::active_messaging::RemotePtr; +use crate::config; use crate::darc::global_rw_darc::{DistRwLock, GlobalRwDarc}; use crate::darc::{Darc, DarcInner, DarcMode, WrappedInner, __NetworkDarc}; use crate::lamellae::LamellaeRDMA; @@ -166,6 +167,16 @@ impl LocalRwDarc { /// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); ///``` pub fn blocking_read(&self) -> RwLockReadGuardArc { + if std::thread::current().id() != *crate::MAIN_THREAD { + let msg = format!(" + [LAMELLAR WARNING] You are calling `LocalRwDarc::blocking_read` from within an async context which may lead to deadlock, it is recommended that you use `read().await;` instead! 
+ Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + ); + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } + } let self_clone: LocalRwDarc = self.clone(); self.darc .team() @@ -173,7 +184,6 @@ impl LocalRwDarc { } #[doc(alias("One-sided", "onesided"))] - /// TODO: UPDATE /// Aquires a reader lock of this LocalRwDarc local to this PE. /// /// The current THREAD will be blocked until the lock has been acquired. @@ -263,6 +273,16 @@ impl LocalRwDarc { /// **guard += my_pe; ///``` pub fn blocking_write(&self) -> RwLockWriteGuardArc { + if std::thread::current().id() != *crate::MAIN_THREAD { + let msg = format!(" + [LAMELLAR WARNING] You are calling `LocalRwDarc::blocking_write` from within an async context which may lead to deadlock, it is recommended that you use `write().await;` instead! + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + ); + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } + } // println!("trying to get write lock"); let self_clone: LocalRwDarc = self.clone(); self.darc @@ -375,9 +395,74 @@ impl LocalRwDarc { /// let world = LamellarWorldBuilder::new().build(); /// /// let five = LocalRwDarc::new(&world,5).expect("PE in world team"); - /// let five_as_globaldarc = five.into_globalrw(); + /// let five_as_globaldarc = world.block_on(async move {five.into_globalrw().await}); + /// ``` + pub async fn into_globalrw(self) -> GlobalRwDarc { + let inner = self.inner(); + // println!("into_darc"); + // self.print(); + DarcInner::block_on_outstanding( + WrappedInner { + inner: NonNull::new(self.darc.inner as *mut DarcInner) + .expect("invalid darc pointer"), + }, + DarcMode::GlobalRw, + 0, + ) + .await; + // println!("after block on outstanding"); + inner.local_cnt.fetch_add(1, Ordering::SeqCst); //we add this here because to account for moving inner into d + let mut arc_item = unsafe { (*Box::from_raw(inner.item as *mut Arc>)).clone() }; + let item: T = loop { + arc_item = match Arc::try_unwrap(arc_item) { + Ok(item) => break item.into_inner(), + Err(arc_item) => arc_item, + }; + }; + let d = Darc { + inner: self.darc.inner as *mut DarcInner>, + src_pe: self.darc.src_pe, + // phantom: PhantomData, + }; + d.inner_mut() + .update_item(Box::into_raw(Box::new(DistRwLock::new( + item, + self.inner().team(), + )))); + GlobalRwDarc { darc: d } + } + + #[doc(alias = "Collective")] + /// Converts this LocalRwDarc into a [GlobalRwDarc] + /// + /// This is a blocking collective call amongst all PEs in the LocalRwDarc's team, only returning once every PE in the team has completed the call. + /// + /// Furthermore, this call will block while any additional references outside of the one making this call exist on each PE. It is not possible for the + /// pointed to object to wrapped by both a GlobalRwDarc and a LocalRwDarc simultaneously (on any PE). + /// + /// # Collective Operation + /// Requires all PEs associated with the `darc` to enter the call otherwise deadlock will occur (i.e. 
team barriers are being called internally) + /// + /// # Examples + /// ``` + /// use lamellar::darc::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// + /// let five = LocalRwDarc::new(&world,5).expect("PE in world team"); + /// let five_as_globaldarc = five.blocking_into_globalrw(); /// ``` - pub fn into_globalrw(self) -> GlobalRwDarc { + pub fn blocking_into_globalrw(self) -> GlobalRwDarc { + if std::thread::current().id() != *crate::MAIN_THREAD { + let msg = format!(" + [LAMELLAR WARNING] You are calling `LocalRwDarc::blocking_into_globalrw` from within an async context which may lead to deadlock, it is recommended that you use `into_globalrw().await;` instead! + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + ); + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } + } let inner = self.inner(); // println!("into_darc"); // self.print(); @@ -431,9 +516,72 @@ impl LocalRwDarc { /// let world = LamellarWorldBuilder::new().build(); /// /// let five = LocalRwDarc::new(&world,5).expect("PE in world team"); - /// let five_as_darc = five.into_darc(); + /// let five_as_darc = world.block_on(async move {five.into_darc()}); + /// ``` + pub async fn into_darc(self) -> Darc { + let inner = self.inner(); + // println!("into_darc"); + // self.print(); + DarcInner::block_on_outstanding( + WrappedInner { + inner: NonNull::new(self.darc.inner as *mut DarcInner) + .expect("invalid darc pointer"), + }, + DarcMode::Darc, + 0, + ) + .await; + // println!("after block on outstanding"); + inner.local_cnt.fetch_add(1, Ordering::SeqCst); //we add this here because to account for moving inner into d + // let item = unsafe { Box::from_raw(inner.item as *mut Arc>).into_inner() }; + let mut arc_item = unsafe { (*Box::from_raw(inner.item as *mut Arc>)).clone() }; + + let item: T = loop { + arc_item = match Arc::try_unwrap(arc_item) { + Ok(item) => break item.into_inner(), + Err(arc_item) => arc_item, + }; + }; + let d = Darc { + inner: self.darc.inner as *mut DarcInner, + src_pe: self.darc.src_pe, + // phantom: PhantomData, + }; + d.inner_mut().update_item(Box::into_raw(Box::new(item))); //the darc will free this approriately + d + } + + #[doc(alias = "Collective")] + /// Converts this LocalRwDarc into a regular [Darc] + /// + /// This is a blocking collective call amongst all PEs in the LocalRwDarc's team, only returning once every PE in the team has completed the call. + /// + /// Furthermore, this call will block while any additional references outside of the one making this call exist on each PE. It is not possible for the + /// pointed to object to wrapped by both a Darc and a LocalRwDarc simultaneously (on any PE). + /// + /// # Collective Operation + /// Requires all PEs associated with the `darc` to enter the call otherwise deadlock will occur (i.e. 
team barriers are being called internally) + /// + /// # Examples + /// ``` + /// use lamellar::darc::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// + /// let five = LocalRwDarc::new(&world,5).expect("PE in world team"); + /// let five_as_darc = five.blocking_into_darc(); /// ``` - pub fn into_darc(self) -> Darc { + pub fn blocking_into_darc(self) -> Darc { + if std::thread::current().id() != *crate::MAIN_THREAD { + let msg = format!(" + [LAMELLAR WARNING] You are calling `LocalRwDarc::blocking_into_darc` from within an async context which may lead to deadlock, it is recommended that you use `into_darc().await;` instead! + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + ); + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } + } let inner = self.inner(); // println!("into_darc"); // self.print(); diff --git a/src/env_var.rs b/src/env_var.rs index 87f6b2d4..66519df8 100644 --- a/src/env_var.rs +++ b/src/env_var.rs @@ -1,6 +1,47 @@ -use std::sync::OnceLock; +//! Lamellar uses a number of environment variables to configure its behavior +//! the following variables are supported along with a breif description and default value +//! +//! - `LAMELLAR_BACKEND` - the backend used during execution. Note that if a backend is explicitly set in the world builder, this variable is ignored. +//! - possible values +//! - `local` -- default (if `enable-local` feature is not active) +//! - `shmem` +//! - `rofi` -- only available with the `enable-rofi` feature in which case it is the default backend +//! - `LAMELLAR_EXECUTOR` - the executor used during execution. Note that if a executor is explicitly set in the world builder, this variable is ignored. +//! - possible values +//! - `lamellar` -- default, work stealing backend +//! - `async_std` -- alternative backend from async_std +//! - `tokio` -- only available with the `tokio-executor` feature in which case it is the default executor +//! - `LAMELLAR_BATCHER` - selects how small active messages are batched for remote operations +//! - possible values +//! - `simple` -- default, active messages are only batched based on the PE they are sent to +//! - `team_am` -- active messages are batched heirarchically based on the remote PE, team sending the message, and AM id +//! - `LAMELLAR_THREADS` - The number of worker threads used within a lamellar PE, defaults to [std::thread::available_parallelism] if available or else 4 +//! - `LAMELLAR_HEAP_SIZE` - Specify the initial size of the Runtime "RDMAable" memory pool. Defaults to 4GB +//! - Internally, Lamellar utilizes memory pools of RDMAable memory for Runtime data structures (e.g. [Darcs][crate::Darc], +//! [OneSidedMemoryRegion][crate::memregion::OneSidedMemoryRegion],etc), aggregation buffers, and message queues. +//! - Note: when running multiple PEs on a single system, the total allocated memory for the pools would be equal to `LAMELLAR_HEAP_SIZE * number of processes` +//! - `LAMELLAR_HEAP_MODE` - Specify whether the heap will be allocated statically or dynamically +//! - possible values +//! - `static` +//! - `dynamic` -- default, Additional memory pools are dynamically allocated across the system as needed. +//! This can be a fairly expensive operation (as the operation is synchronous across all PEs) so the runtime +//! 
will print a message at the end of execution with how many additional pools were allocated. +//! - if you find you are dynamically allocating new memory pools, try setting `LAMELLAR_HEAP_SIZE` to a larger value +//! - `LAMELLAR_DEADLOCK_TIMEOUT` - the timeout in seconds before a deadlock warning is printed. Defaults to 600 +//! - `LAMELLAR_AM_GROUP_BATCH_SIZE` - The maximum number of sub messages that will be sent in a single AMGroup Active Message, default: 10000 +//! - `LAMELLAR_BLOCKING_CALL_WARNING` - flag used to print warnings when users call barriers on worker threads. Default: true +//! - `LAMELLAR_BARRIER_DISSEMINATION_FACTOR` - (Experimental) The dissemination factor for the n-way barrier, default: 2 +//! - `LAMELLAR_BATCH_OP_THREADS` - the number of threads used to initiate batched operations, defaults to 1/4 LAMELLAR_THREADS +//! - `LAMELLAR_ARRAY_INDEX_SIZE` - specify static or dynamic array index size +//! - possible values +//! - `static` -- constant usize indices +//! - `dynamic` -- default, only uses as large an int as necessary to index the array, bounded by themax number of elements on any PE. +//! - `LAMELLAR_AM_SIZE_THRESHOLD` - the threshold for an activemessage (in bytes) on whether it will be sent directly or aggregated, default: 100000 +//! - `LAMELLAR_ROFI_PROVIDER` - the provider for the rofi backend (only used with the rofi backend), default: "verbs" +//! - `LAMELLAR_ROFI_DOMAIN` - the domain for the rofi backend (only used with the rofi backend), default: "" use serde::Deserialize; +use std::sync::OnceLock; fn default_deadlock_timeout() -> f64 { 600.0 @@ -39,6 +80,7 @@ fn default_threads() -> usize { } } +#[doc(hidden)] #[derive(Deserialize, Debug, PartialEq)] #[serde(rename_all = "lowercase")] pub enum HeapMode { @@ -50,6 +92,7 @@ fn default_heap_mode() -> HeapMode { HeapMode::Dynamic } +#[doc(hidden)] #[derive(Deserialize, Debug, PartialEq)] #[serde(rename_all = "lowercase")] pub enum Alloc { @@ -61,6 +104,7 @@ fn default_alloc() -> Alloc { Alloc::Heap } +#[doc(hidden)] #[derive(Deserialize, Debug, PartialEq)] #[serde(rename_all = "lowercase")] pub enum IndexType { @@ -79,7 +123,7 @@ fn default_cmd_buf_cnt() -> usize { 2 } -fn default_batch_am_size() -> usize { +fn default_am_size_threshold() -> usize { 100000 } @@ -91,6 +135,7 @@ fn default_rofi_domain() -> String { "".to_owned() } +#[doc(hidden)] #[derive(Deserialize, Debug)] pub struct Config { /// A general timeout in seconds for various operations which may indicate a deadlock, default: 600.0 seconds @@ -106,7 +151,7 @@ pub struct Config { pub barrier_dissemination_factor: usize, /// flag used to print warnings when users call barriers on worker threads. 
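The variables documented above are ordinarily exported from the launching shell before the program starts; because the runtime caches them in a OnceLock-backed Config on first access, they need to be in place before the first Lamellar call. Below is a minimal sketch of setting a few of them programmatically instead (handy for small single-node tests); the use of std::env::set_var ahead of world construction is an assumption for illustration only, and the specific values are arbitrary.

use lamellar::LamellarWorldBuilder;

fn main() {
    // per the documentation above, 0 disables the blocking-call warnings added in this patch
    std::env::set_var("LAMELLAR_BLOCKING_CALL_WARNING", "0");
    // shrink the initial RDMAable memory pool (documented default: 4GB) for a small local run
    std::env::set_var("LAMELLAR_HEAP_SIZE", "1073741824");
    // worker threads per PE; documented default is available_parallelism (or 4)
    std::env::set_var("LAMELLAR_THREADS", "8");

    let world = LamellarWorldBuilder::new().build();
    println!("PE {} of {} started", world.my_pe(), world.num_pes());
}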
Default: true - pub barrier_warning: Option, + pub blocking_call_warning: Option, /// The lamellae backend to use /// rofi -- multi pe distributed execution, default if rofi feature is turned on @@ -132,18 +177,23 @@ pub struct Config { pub alloc: Alloc, #[serde(default = "default_array_dynamic_index")] pub index_size: IndexType, + + //used internally by the command queues #[serde(default = "default_cmd_buf_len")] pub cmd_buf_len: usize, + //used internally by the command queues #[serde(default = "default_cmd_buf_cnt")] pub cmd_buf_cnt: usize, - #[serde(default = "default_batch_am_size")] - pub batch_am_size: usize, //the threshold for an activemessage (in bytes) on whether it will be sent directly or aggregated + + #[serde(default = "default_am_size_threshold")] + pub am_size_threshold: usize, //the threshold for an activemessage (in bytes) on whether it will be sent directly or aggregated #[serde(default = "default_rofi_provider")] pub rofi_provider: String, #[serde(default = "default_rofi_domain")] pub rofi_domain: String, } +#[doc(hidden)] /// Get the current Environment Variable configuration pub fn config() -> &'static Config { static CONFIG: OnceLock = OnceLock::new(); diff --git a/src/lamellar_task_group.rs b/src/lamellar_task_group.rs index 72bbf946..498a2671 100644 --- a/src/lamellar_task_group.rs +++ b/src/lamellar_task_group.rs @@ -652,10 +652,25 @@ impl LamellarTaskGroup { } fn wait_all(&self) { + let mut exec_task = true; + if std::thread::current().id() != *crate::MAIN_THREAD { + if let Some(val) = config().blocking_call_warning { + if val { + println!("[LAMELLAR WARNING] You are calling wait_all from within an async context, it is recommended that you use `await_all().await;` instead! + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture()); + } + } else { + println!("[LAMELLAR WARNING] You are calling wait_all from within an async context, it is recommended that you use `await_all().await;` instead! + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture()); + } + exec_task = false; + } let mut temp_now = Instant::now(); while self.counters.outstanding_reqs.load(Ordering::SeqCst) > 0 { // self.team.flush(); - self.team.scheduler.exec_task(); + if exec_task { + self.team.scheduler.exec_task(); + } if temp_now.elapsed().as_secs_f64() > config().deadlock_timeout { println!( "in task group wait_all mype: {:?} cnt: {:?} {:?}", diff --git a/src/lamellar_team.rs b/src/lamellar_team.rs index 7586e295..baa5b98e 100644 --- a/src/lamellar_team.rs +++ b/src/lamellar_team.rs @@ -1343,6 +1343,19 @@ impl LamellarTeamRT { //#[tracing::instrument(skip_all)] pub(crate) fn wait_all(&self) { + let mut exec_task = true; + if std::thread::current().id() != *crate::MAIN_THREAD { + if let Some(val) = config().blocking_call_warning { + if val { + println!("[LAMELLAR WARNING] You are calling wait_all from within an async context, it is recommended that you use `await_all().await;` instead! + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture()); + } + } else { + println!("[LAMELLAR WARNING] You are calling wait_all from within an async context, it is recommended that you use `await_all().await;` instead! 
+ Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture()); + } + exec_task = false; + } let mut temp_now = Instant::now(); while self.panic.load(Ordering::SeqCst) == 0 && (self.team_counters.outstanding_reqs.load(Ordering::SeqCst) > 0 @@ -1351,7 +1364,9 @@ impl LamellarTeamRT { { // std::thread::yield_now(); // self.flush(); - self.scheduler.exec_task(); //mmight as well do useful work while we wait + if exec_task { + self.scheduler.exec_task() + }; //mmight as well do useful work while we wait } if temp_now.elapsed().as_secs_f64() > config().deadlock_timeout { println!( "in team wait_all mype: {:?} cnt: {:?} {:?}", diff --git a/src/lib.rs b/src/lib.rs index eea4d2d9..318f8119 100755 --- a/src/lib.rs +++ b/src/lib.rs @@ -61,6 +61,11 @@ //! //! Additional information on using each of the lamellae backends can be found below in the `Running Lamellar Applications` section //! +//! Environment Variables +//! --------------------- +//! Lamellar has a number of environment variables that can be used to configure the runtime. +//! please see the [Environment Variables][crate::env_var] module documentation for more details +//! //! Examples //! -------- //! Our repository also provides numerous examples highlighting various features of the runtime: @@ -214,21 +219,6 @@ //! - ```srun -N 2 -mpi=pmi2 ./target/release/``` //! - `pmi2` library is required to grab info about the allocated nodes and helps set up initial handshakes //! -//! # Environment Variables -//! Lamellar exposes a number of environment variables that can used to control application execution at runtime -//! - `LAMELLAR_THREADS` - The number of worker threads used within a lamellar PE -//! - `export LAMELLAR_THREADS=10` -//! - `LAMELLAE_BACKEND` - the backend used during execution. Note that if a backend is explicitly set in the world builder, this variable is ignored. -//! - possible values -//! - `local` -//! - `shmem` -//! - `rofi` -//! - `LAMELLAR_MEM_SIZE` - Specify the initial size of the Runtime "RDMAable" memory pool. Defaults to 1GB -//! - `export LAMELLAR_MEM_SIZE=$((20*1024*1024*1024))` 20GB memory pool -//! - Internally, Lamellar utilizes memory pools of RDMAable memory for Runtime data structures (e.g. [Darcs][crate::Darc], [OneSidedMemoryRegion],etc), aggregation buffers, and message queues. Additional memory pools are dynamically allocated across the system as needed. This can be a fairly expensive operation (as the operation is synchronous across all PEs) so the runtime will print a message at the end of execution with how many additional pools were allocated. -//! - if you find you are dynamically allocating new memory pools, try setting `LAMELLAR_MEM_SIZE` to a larger value -//! - Note: when running multiple PEs on a single system, the total allocated memory for the pools would be equal to `LAMELLAR_MEM_SIZE * number of processes` -//! #[macro_use] extern crate lazy_static; @@ -283,7 +273,7 @@ mod utils; //#[doc(hidden)] pub use utils::*; -mod env_var; +pub mod env_var; pub use env_var::config; pub use crate::lamellae::Backend; From 24cd21eea53dc09c2fef2c856b847aa1a2621340 Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Wed, 17 Jul 2024 15:11:31 -0700 Subject: [PATCH 047/116] created blocking versions of sum,min,max,etc --- src/array/atomic.rs | 258 ++++++++++++++++++++++++++++++++ src/array/global_lock_atomic.rs | 150 +++++++++++++++++++ src/array/local_lock_atomic.rs | 161 ++++++++++++++++++++ src/array/read_only.rs | 146 ++++++++++++++++++ src/array/unsafe.rs | 183 +++++++++++++++++++++- 5 files changed, 895 insertions(+), 3 deletions(-) diff --git a/src/array/atomic.rs b/src/array/atomic.rs index afa4ae27..89578d00 100644 --- a/src/array/atomic.rs +++ b/src/array/atomic.rs @@ -6,6 +6,7 @@ use crate::array::generic_atomic::{GenericAtomicElement, LocalGenericAtomicEleme use crate::array::native_atomic::NativeAtomicElement; use crate::array::private::LamellarArrayPrivate; use crate::array::*; +use crate::config; // use crate::darc::{Darc, DarcMode}; use crate::lamellar_team::IntoLamellarTeam; use crate::memregion::Dist; @@ -1211,6 +1212,61 @@ impl AtomicArray { AtomicArray::GenericAtomicArray(array) => array.reduce(reduction), } } + + #[doc(alias("One-sided", "onesided"))] + /// Perform a reduction on the entire distributed array, returning the value to the calling PE. + /// + /// Please see the documentation for the [register_reduction] procedural macro for + /// more details and examples on how to create your own reductions. + /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Reduce` active messages on the other PEs associated with the array. + /// the returned reduction result is only available on the calling PE + /// + /// # Safety + /// One thing to consider is that due to being a one sided reduction, safety is only gauranteed with respect to Atomicity of individual elements, + /// not with respect to the entire global array. This means that while one PE is performing a reduction, other PEs can atomically update their local + /// elements. While this is technically safe with respect to the integrity of an indivdual element (and with respect to the compiler), + /// it may not be your desired behavior. + /// + /// To be clear this behavior is not an artifact of lamellar, but rather the language itself, + /// for example if you have an `Arc>` shared on multiple threads, you could safely update the elements from each thread, + /// but performing a reduction could result in safe but non deterministic results. + /// + /// In Lamellar converting to a [ReadOnlyArray] before the reduction is a straightforward workaround to enusre the data is not changing during the reduction. + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// use rand::Rng; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = AtomicArray::::new(&world,1000000,Distribution::Block); + /// let array_clone = array.clone(); + /// let req = array.local_iter().for_each(move |_| { + /// let index = rand::thread_rng().gen_range(0..array_clone.len()); + /// array_clone.add(index,1); //randomly at one to an element in the array. + /// }); + /// let sum = array.blocking_reduce("sum"); // equivalent to calling array.sum() + /// assert_eq!(array.len()*num_pes,sum); + ///``` + pub fn blocking_reduce(&self, reduction: &str) -> Option { + if std::thread::current().id() != *crate::MAIN_THREAD { + let msg = format!(" + [LAMELLAR WARNING] You are calling `AtomicArray::blocking_reduce` from within an async context which may lead to deadlock, it is recommended that you use `reduce(...).await;` instead! 
+ Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + ); + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } + } + self.block_on(match self { + AtomicArray::NativeAtomicArray(array) => array.reduce(reduction), + AtomicArray::GenericAtomicArray(array) => array.reduce(reduction), + }) + } } impl AtomicArray { @@ -1257,6 +1313,59 @@ impl AtomicArray { } } + #[doc(alias("One-sided", "onesided"))] + /// Perform a sum reduction on the entire distributed array, returning the value to the calling PE. + /// + /// This equivalent to `reduce("sum")`. + /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Sum` active messages on the other PEs associated with the array. + /// the returned sum reduction result is only available on the calling PE + /// + /// # Safety + /// One thing to consider is that due to being a one sided reduction, safety is only gauranteed with respect to Atomicity of individual elements, + /// not with respect to the entire global array. This means that while one PE is performing a reduction, other PEs can atomically update their local + /// elements. While this is technically safe with respect to the integrity of an indivdual element (and with respect to the compiler), + /// it may not be your desired behavior. + /// + /// To be clear this behavior is not an artifact of lamellar, but rather the language itself, + /// for example if you have an `Arc>` shared on multiple threads, you could safely update the elements from each thread, + /// but performing a reduction could result in safe but non deterministic results. + /// + /// In Lamellar converting to a [ReadOnlyArray] before the reduction is a straightforward workaround to enusre the data is not changing during the reduction. + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// use rand::Rng; + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = AtomicArray::::new(&world,1000000,Distribution::Block); + /// let array_clone = array.clone(); + /// let req = array.local_iter().for_each(move |_| { + /// let index = rand::thread_rng().gen_range(0..array_clone.len()); + /// array_clone.add(index,1); //randomly at one to an element in the array. + /// }); + /// let sum = array.blocking_sum(); + /// assert_eq!(array.len()*num_pes,sum); + /// ``` + pub fn blocking_sum(&self) -> Option { + if std::thread::current().id() != *crate::MAIN_THREAD { + let msg = format!(" + [LAMELLAR WARNING] You are calling `AtomicArray::blocking_sum` from within an async context which may lead to deadlock, it is recommended that you use `sum().await;` instead! + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + ); + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } + } + self.block_on(match self { + AtomicArray::NativeAtomicArray(array) => array.sum(), + AtomicArray::GenericAtomicArray(array) => array.sum(), + }) + } + #[doc(alias("One-sided", "onesided"))] /// Perform a production reduction on the entire distributed array, returning the value to the calling PE. 
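The blocking variants above pair with the existing awaitable reductions: on the main thread `blocking_sum()` simply drives the reduction to completion, while inside an async task the awaitable `sum()` should be used so a worker thread is never stalled (which is exactly what the warning printed above guards against). The sketch below assumes the array setup shown in the surrounding doc examples; the length and stored values are illustrative only.

use lamellar::array::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();
    let array = AtomicArray::<usize>::new(&world, 1000, Distribution::Block);
    array.block_on(array.dist_iter().for_each(|elem| elem.store(1)));
    array.barrier();

    // non-async (main thread) context: the blocking variant is appropriate here
    let blocking_sum = array.blocking_sum();

    // async context: await the reduction rather than calling blocking_sum()
    let array_clone = array.clone();
    let awaited_sum = world.block_on(async move { array_clone.sum().await });

    assert_eq!(blocking_sum, awaited_sum); // both yield the same Option<usize>
}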
/// @@ -1298,6 +1407,58 @@ impl AtomicArray { AtomicArray::GenericAtomicArray(array) => array.prod(), } } + + #[doc(alias("One-sided", "onesided"))] + /// Perform a production reduction on the entire distributed array, returning the value to the calling PE. + /// + /// This equivalent to `reduce("prod")`. + /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Prod` active messages on the other PEs associated with the array. + /// the returned prod reduction result is only available on the calling PE + /// + /// # Safety + /// One thing to consider is that due to being a one sided reduction, safety is only gauranteed with respect to Atomicity of individual elements, + /// not with respect to the entire global array. This means that while one PE is performing a reduction, other PEs can atomically update their local + /// elements. While this is technically safe with respect to the integrity of an indivdual element (and with respect to the compiler), + /// it may not be your desired behavior. + /// + /// To be clear this behavior is not an artifact of lamellar, but rather the language itself, + /// for example if you have an `Arc>` shared on multiple threads, you could safely update the elements from each thread, + /// but performing a reduction could result in safe but non deterministic results. + /// + /// In Lamellar converting to a [ReadOnlyArray] before the reduction is a straightforward workaround to enusre the data is not changing during the reduction. + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = AtomicArray::::new(&world,10,Distribution::Block); + /// let req = array.dist_iter().enumerate().for_each(move |(i,elem)| { + /// elem.store(i+1); + /// }); + /// array.wait_all(); + /// array.barrier(); + /// let prod = array.blocking_prod(); + /// assert_eq!((1..=array.len()).product::(),prod); + ///``` + pub fn blocking_prod(&self) -> Option { + if std::thread::current().id() != *crate::MAIN_THREAD { + let msg = format!(" + [LAMELLAR WARNING] You are calling `AtomicArray::blocking_prod` from within an async context which may lead to deadlock, it is recommended that you use `prod().await;` instead! + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + ); + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } + } + self.block_on(match self { + AtomicArray::NativeAtomicArray(array) => array.prod(), + AtomicArray::GenericAtomicArray(array) => array.prod(), + }) + } } impl AtomicArray { #[doc(alias("One-sided", "onesided"))] @@ -1337,6 +1498,55 @@ impl AtomicArray { AtomicArray::GenericAtomicArray(array) => array.max(), } } + + #[doc(alias("One-sided", "onesided"))] + /// Find the max element in the entire destributed array, returning to the calling PE + /// + /// This equivalent to `reduce("max")`. + /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Max` active messages on the other PEs associated with the array. + /// the returned max reduction result is only available on the calling PE + /// + /// # Safety + /// One thing to consider is that due to being a one sided reduction, safety is only gauranteed with respect to Atomicity of individual elements, + /// not with respect to the entire global array. 
This means that while one PE is performing a reduction, other PEs can atomically update their local + /// elements. While this is technically safe with respect to the integrity of an indivdual element (and with respect to the compiler), + /// it may not be your desired behavior. + /// + /// To be clear this behavior is not an artifact of lamellar, but rather the language itself, + /// for example if you have an `Arc>` shared on multiple threads, you could safely update the elements from each thread, + /// but performing a reduction could result in safe but non deterministic results. + /// + /// In Lamellar converting to a [ReadOnlyArray] before the reduction is a straightforward workaround to enusre the data is not changing during the reduction. + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = AtomicArray::::new(&world,10,Distribution::Block); + /// let req = array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2)); + /// let max = array.blocking_max(); + /// assert_eq!((array.len()-1)*2,max); + ///``` + pub fn blocking_max(&self) -> Option { + if std::thread::current().id() != *crate::MAIN_THREAD { + let msg = format!(" + [LAMELLAR WARNING] You are calling `AtomicArray::blocking_max` from within an async context which may lead to deadlock, it is recommended that you use `max().await;` instead! + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + ); + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } + } + self.block_on(match self { + AtomicArray::NativeAtomicArray(array) => array.max(), + AtomicArray::GenericAtomicArray(array) => array.max(), + }) + } + #[doc(alias("One-sided", "onesided"))] /// Find the min element in the entire destributed array, returning to the calling PE /// @@ -1374,6 +1584,54 @@ impl AtomicArray { AtomicArray::GenericAtomicArray(array) => array.min(), } } + + #[doc(alias("One-sided", "onesided"))] + /// Find the min element in the entire destributed array, returning to the calling PE + /// + /// This equivalent to `reduce("min")`. + /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Min` active messages on the other PEs associated with the array. + /// the returned min reduction result is only available on the calling PE + /// + /// # Safety + /// One thing to consider is that due to being a one sided reduction, safety is only gauranteed with respect to Atomicity of individual elements, + /// not with respect to the entire global array. This means that while one PE is performing a reduction, other PEs can atomically update their local + /// elements. While this is technically safe with respect to the integrity of an indivdual element (and with respect to the compiler), + /// it may not be your desired behavior. + /// + /// To be clear this behavior is not an artifact of lamellar, but rather the language itself, + /// for example if you have an `Arc>` shared on multiple threads, you could safely update the elements from each thread, + /// but performing a reduction could result in safe but non deterministic results. + /// + /// In Lamellar converting to a [ReadOnlyArray] before the reduction is a straightforward workaround to enusre the data is not changing during the reduction. 
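As the Safety notes above suggest, the simplest way to reduce over data that provably is not changing is to convert the AtomicArray into a ReadOnlyArray first. Here is a small sketch of that workaround, assuming the `into_read_only()` and `blocking_sum()` calls shown elsewhere in this patch; the length and stored values are illustrative.

use lamellar::array::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();
    let array = AtomicArray::<usize>::new(&world, 1000, Distribution::Block);
    array.block_on(array.dist_iter().for_each(|elem| elem.store(1)));
    array.wait_all();
    array.barrier();

    // only returns once a single reference to the array remains on each PE,
    // after which no further writes are possible
    let array = array.into_read_only();
    let sum = array.blocking_sum();
    assert_eq!(Some(array.len()), sum); // every one of the len() elements holds a 1
}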
+ /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = AtomicArray::::new(&world,10,Distribution::Block); + /// let req = array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2)); + /// let min = array.blocking_min(); + /// assert_eq!(0,min); + ///``` + pub fn blocking_min(&self) -> Option { + if std::thread::current().id() != *crate::MAIN_THREAD { + let msg = format!(" + [LAMELLAR WARNING] You are calling `AtomicArray::blocking_min` from within an async context which may lead to deadlock, it is recommended that you use `min().await;` instead! + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + ); + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } + } + self.block_on(match self { + AtomicArray::NativeAtomicArray(array) => array.min(), + AtomicArray::GenericAtomicArray(array) => array.min(), + }) + } } impl LamellarWrite for AtomicArray {} diff --git a/src/array/global_lock_atomic.rs b/src/array/global_lock_atomic.rs index 91da9262..6ebd5a6b 100644 --- a/src/array/global_lock_atomic.rs +++ b/src/array/global_lock_atomic.rs @@ -1230,6 +1230,48 @@ impl GlobalLockReadGuard { lock_guard: self.lock_guard.clone(), } } + + #[doc(alias("One-sided", "onesided"))] + /// Perform a reduction on the entire distributed array, returning the value to the calling PE. + /// + /// Please see the documentation for the [register_reduction] procedural macro for + /// more details and examples on how to create your own reductions. + /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Reduce` active messages on the other PEs associated with the array. + /// the returned reduction result is only available on the calling PE + /// + /// # Safety + /// the global read lock ensures atomicity of the entire array, i.e. individual elements can not being modified before the call completes + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// use rand::Rng; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = GlobalLockArray::::new(&world,10,Distribution::Block); + /// array.block_on(array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2))); + /// let read_guard = array.blocking_read_lock(); + /// let prod = read_guard.blocking_reduce("prod"); + ///``` + pub fn blocking_reduce(self, op: &str) -> Option { + if std::thread::current().id() != *crate::MAIN_THREAD { + let msg = format!(" + [LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_reduce` from within an async context which may lead to deadlock, it is recommended that you use `reduce(...).await;` instead! 
+ Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + ); + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } + } + self.array.block_on(GlobalLockArrayReduceHandle { + req: self.array.array.reduce_data(op, self.array.clone().into()), + lock_guard: self.lock_guard.clone(), + }) + } } impl GlobalLockReadGuard { #[doc(alias("One-sided", "onesided"))] @@ -1259,6 +1301,33 @@ impl GlobalLockReadGuard { self.reduce("sum") } + #[doc(alias("One-sided", "onesided"))] + /// Perform a sum reduction on the entire distributed array, returning the value to the calling PE. + /// + /// This equivalent to `reduce("sum")`. + /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Sum` active messages on the other PEs associated with the array. + /// the returned sum reduction result is only available on the calling PE + /// + /// # Safety + /// the global read lock ensures atomicity of the entire array, i.e. individual elements can not being modified before the call completes + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// use rand::Rng; + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = GlobalLockArray::::new(&world,10,Distribution::Block); + /// array.block_on(array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2))); + /// let read_guard = array.blocking_read_lock(); + /// let sum = read_guard.blocking_sum(); + /// ``` + pub fn blocking_sum(self) -> Option { + self.blocking_reduce("sum") + } + #[doc(alias("One-sided", "onesided"))] /// Perform a production reduction on the entire distributed array, returning the value to the calling PE. /// @@ -1285,6 +1354,33 @@ impl GlobalLockReadGuard { pub fn prod(self) -> GlobalLockArrayReduceHandle { self.reduce("prod") } + + #[doc(alias("One-sided", "onesided"))] + /// Perform a production reduction on the entire distributed array, returning the value to the calling PE. + /// + /// This equivalent to `reduce("prod")`. + /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Prod` active messages on the other PEs associated with the array. + /// the returned prod reduction result is only available on the calling PE + /// + /// # Safety + /// the global read lock ensures atomicity of the entire array, i.e. individual elements can not being modified before the call completes + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = GlobalLockArray::::new(&world,10,Distribution::Block); + /// array.block_on(array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2))); + /// let read_guard = array.blocking_read_lock(); + /// let prod = read_guard.blocking_prod(); + /// assert_eq!((1..=array.len()).product::(),prod); + ///``` + pub fn blocking_prod(self) -> Option { + self.blocking_reduce("prod") + } } impl GlobalLockReadGuard { #[doc(alias("One-sided", "onesided"))] @@ -1314,6 +1410,33 @@ impl GlobalLockReadGuar self.reduce("max") } + #[doc(alias("One-sided", "onesided"))] + /// Find the max element in the entire destributed array, returning to the calling PE + /// + /// This equivalent to `reduce("max")`. 
+ /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Max` active messages on the other PEs associated with the array. + /// the returned max reduction result is only available on the calling PE + /// + /// # Safety + /// the global read lock ensures atomicity of the entire array, i.e. individual elements can not being modified before the call completes + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = GlobalLockArray::::new(&world,10,Distribution::Block); + /// array.block_on(array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2))); + /// let read_guard = array.blocking_read_lock(); + /// let max = read_guard.blocking_max(); + /// assert_eq!((array.len()-1)*2,max); + ///``` + pub fn blocking_max(self) -> Option { + self.blocking_reduce("max") + } + #[doc(alias("One-sided", "onesided"))] /// Find the min element in the entire destributed array, returning to the calling PE /// @@ -1340,6 +1463,33 @@ impl GlobalLockReadGuar pub fn min(self) -> GlobalLockArrayReduceHandle { self.reduce("min") } + + #[doc(alias("One-sided", "onesided"))] + /// Find the min element in the entire destributed array, returning to the calling PE + /// + /// This equivalent to `reduce("min")`. + /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Min` active messages on the other PEs associated with the array. + /// the returned min reduction result is only available on the calling PE + /// + /// # Safety + /// the global read lock ensures atomicity of the entire array, i.e. individual elements can not being modified before the call completes + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = GlobalLockArray::::new(&world,10,Distribution::Block); + /// array.block_on(array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2))); + /// let read_guard = array.blocking_read_lock(); + /// let min = read_guard.blocking_min(); + /// assert_eq!(0,min); + ///``` + pub fn blocking_min(self) -> Option { + self.blocking_reduce("min") + } } // impl LamellarArrayReduce diff --git a/src/array/local_lock_atomic.rs b/src/array/local_lock_atomic.rs index bc885bb7..6be407b9 100644 --- a/src/array/local_lock_atomic.rs +++ b/src/array/local_lock_atomic.rs @@ -1117,6 +1117,52 @@ impl LocalLockReadGuard { lock_guard: self.lock_guard.clone(), } } + + #[doc(alias("One-sided", "onesided"))] + /// Perform a reduction on the entire distributed array, returning the value to the calling PE. + /// + /// Please see the documentation for the [register_reduction] procedural macro for + /// more details and examples on how to create your own reductions. + /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Reduce` active messages on the other PEs associated with the array. + /// the returned reduction result is only available on the calling PE + /// + /// # Safety + /// the local read lock ensures atomicity of only the local portion of the array, I.e. elements on a PE wont change while the operation is being executed on that PE + /// Atomicity of data on remote PEs is only guaranteed while the remote operation is executing on the remote PE (once it has captured that PEs local lock). + /// Remote data can change before and after the overall operation has completed. 
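A short sketch of the local-lock guard reductions described above: the guard freezes only this PE's portion of the array while the reduction runs, so (unlike the GlobalLockArray guard earlier in this patch) remote elements may still change before or after the remote part of the reduction executes. The calls mirror the doc examples in this patch; the length and stored values are illustrative.

use lamellar::array::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();
    let array = LocalLockArray::<usize>::new(&world, 10, Distribution::Block);
    array.block_on(array.dist_iter().enumerate().for_each(|(i, elem)| elem.store(i * 2)));
    array.barrier();

    // holds only this PE's read lock; each remote PE's lock is taken only while
    // the reduction is actually executing on that PE
    let read_guard = array.blocking_read_lock();
    let sum = read_guard.blocking_sum(); // consumes the guard
    assert_eq!(Some((0..array.len()).map(|i| i * 2).sum::<usize>()), sum);
}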
+ /// + /// Lamellar converting to a [ReadOnlyArray] or [GlobalLockArray] before the reduction is a straightforward workaround to enusre the data is not changing during the reduction. + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// use rand::Rng; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = LocalLockArray::::new(&world,10,Distribution::Block); + /// array.block_on(array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2))); + /// let read_guard = array.blocking_read_lock(); + /// let prod = read_guard.blocking_reduce("prod"); + ///``` + pub fn blocking_reduce(self, op: &str) -> Option { + if std::thread::current().id() != *crate::MAIN_THREAD { + let msg = format!(" + [LAMELLAR WARNING] You are calling `LocalLockArray::blocking_reduce` from within an async context which may lead to deadlock, it is recommended that you use `reduce(...).await;` instead! + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + ); + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } + } + self.array.block_on(LocalLockArrayReduceHandle { + req: self.array.array.reduce_data(op, self.array.clone().into()), + lock_guard: self.lock_guard.clone(), + }) + } } impl LocalLockReadGuard { #[doc(alias("One-sided", "onesided"))] @@ -1148,6 +1194,34 @@ impl LocalLockReadGuard { self.reduce("sum") } #[doc(alias("One-sided", "onesided"))] + /// Perform a sum reduction on the entire distributed array, returning the value to the calling PE. + /// + /// This equivalent to `reduce("sum")`. + /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Sum` active messages on the other PEs associated with the array. + /// the returned sum reduction result is only available on the calling PE + /// + /// # Safety + /// the local read lock ensures atomicity of only the local portion of the array, I.e. elements on a PE wont change while the operation is being executed on that PE + /// Atomicity of data on remote PEs is only guaranteed while the remote operation is executing on the remote PE (once it has captured that PEs local lock). + /// Remote data can change before and after the overall operation has completed. + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// use rand::Rng; + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = LocalLockArray::::new(&world,10,Distribution::Block); + /// array.block_on(array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2))); + /// let read_guard = array.blocking_read_lock(); + /// let sum = read_guard.blocking_sum(); + /// ``` + pub fn blocking_sum(self) -> Option { + self.blocking_reduce("sum") + } + #[doc(alias("One-sided", "onesided"))] /// Perform a production reduction on the entire distributed array, returning the value to the calling PE. /// /// This equivalent to `reduce("prod")`. @@ -1175,6 +1249,35 @@ impl LocalLockReadGuard { pub fn prod(self) -> LocalLockArrayReduceHandle { self.reduce("prod") } + + #[doc(alias("One-sided", "onesided"))] + /// Perform a production reduction on the entire distributed array, returning the value to the calling PE. + /// + /// This equivalent to `reduce("prod")`. 
+ /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Prod` active messages on the other PEs associated with the array. + /// the returned prod reduction result is only available on the calling PE + /// + /// # Safety + /// the local read lock ensures atomicity of only the local portion of the array, I.e. elements on a PE wont change while the operation is being executed on that PE + /// Atomicity of data on remote PEs is only guaranteed while the remote operation is executing on the remote PE (once it has captured that PEs local lock). + /// Remote data can change before and after the overall operation has completed. + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = LocalLockArray::::new(&world,10,Distribution::Block); + /// array.block_on(array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2))); + /// let read_guard = array.blocking_read_lock(); + /// let prod = read_guard.blocking_prod(); + /// assert_eq!((1..=array.len()).product::(),prod); + ///``` + pub fn blocking_prod(self) -> Option { + self.blocking_reduce("prod") + } } impl LocalLockReadGuard { #[doc(alias("One-sided", "onesided"))] @@ -1205,6 +1308,35 @@ impl LocalLockReadGuard pub fn max(self) -> LocalLockArrayReduceHandle { self.reduce("max") } + + #[doc(alias("One-sided", "onesided"))] + /// Find the max element in the entire destributed array, returning to the calling PE + /// + /// This equivalent to `reduce("max")`. + /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Max` active messages on the other PEs associated with the array. + /// the returned max reduction result is only available on the calling PE + /// + /// # Safety + /// the local read lock ensures atomicity of only the local portion of the array, I.e. elements on a PE wont change while the operation is being executed on that PE + /// Atomicity of data on remote PEs is only guaranteed while the remote operation is executing on the remote PE (once it has captured that PEs local lock). + /// Remote data can change before and after the overall operation has completed. + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = LocalLockArray::::new(&world,10,Distribution::Block); + /// array.block_on(array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2))); + /// let read_guard = array.blocking_read_lock(); + /// let max = read_guard.blocking_max(); + /// assert_eq!((array.len()-1)*2,max); + ///``` + pub fn blocking_max(self) -> Option { + self.blocking_reduce("max") + } #[doc(alias("One-sided", "onesided"))] /// Find the min element in the entire destributed array, returning to the calling PE /// @@ -1233,6 +1365,35 @@ impl LocalLockReadGuard pub fn min(self) -> LocalLockArrayReduceHandle { self.reduce("min") } + + #[doc(alias("One-sided", "onesided"))] + /// Find the min element in the entire destributed array, returning to the calling PE + /// + /// This equivalent to `reduce("min")`. + /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Min` active messages on the other PEs associated with the array. + /// the returned min reduction result is only available on the calling PE + /// + /// # Safety + /// the local read lock ensures atomicity of only the local portion of the array, I.e. 
elements on a PE wont change while the operation is being executed on that PE + /// Atomicity of data on remote PEs is only guaranteed while the remote operation is executing on the remote PE (once it has captured that PEs local lock). + /// Remote data can change before and after the overall operation has completed. + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = LocalLockArray::::new(&world,10,Distribution::Block); + /// array.block_on(array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2))); + /// let read_guard = array.blocking_read_lock(); + /// let min = read_guard.blocking_min(); + /// assert_eq!(0,min); + ///``` + pub fn blocking_min(self) -> Option { + self.blocking_reduce("min") + } } // impl LamellarArrayReduce diff --git a/src/array/read_only.rs b/src/array/read_only.rs index 9c58a99f..210fbf67 100644 --- a/src/array/read_only.rs +++ b/src/array/read_only.rs @@ -4,6 +4,7 @@ pub use local_chunks::ReadOnlyLocalChunks; mod rdma; use crate::array::private::LamellarArrayPrivate; use crate::array::*; +use crate::config; use crate::darc::DarcMode; use crate::lamellar_team::{IntoLamellarTeam, LamellarTeamRT}; use crate::memregion::Dist; @@ -520,6 +521,47 @@ impl ReadOnlyArray { pub fn reduce(&self, op: &str) -> AmHandle> { self.array.reduce_data(op, self.clone().into()) } + + #[doc(alias("One-sided", "onesided"))] + /// Perform a reduction on the entire distributed array, returning the value to the calling PE. + /// + /// Please see the documentation for the [register_reduction] procedural macro for + /// more details and examples on how to create your own reductions. + /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Reduce` active messages on the other PEs associated with the array. + /// the returned reduction result is only available on the calling PE + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// use rand::Rng; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = AtomicArray::::new(&world,1000000,Distribution::Block); + /// let array_clone = array.clone(); + /// let req = array.local_iter().for_each(move |_| { + /// let index = rand::thread_rng().gen_range(0..array_clone.len()); + /// array_clone.add(index,1); //randomly at one to an element in the array. + /// }); + /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE + /// let sum = array.blocking_reduce("sum"); // equivalent to calling array.sum() + /// assert_eq!(array.len()*num_pes,sum); + ///``` + pub fn blocking_reduce(&self, op: &str) -> Option { + if std::thread::current().id() != *crate::MAIN_THREAD { + let msg = format!(" + [LAMELLAR WARNING] You are calling `ReadOnlyArray::blocking_reduce` from within an async context which may lead to deadlock, it is recommended that you use `reduce(...).await;` instead! 
+ Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + ); + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } + } + self.block_on(self.array.reduce_data(op, self.clone().into())) + } } impl ReadOnlyArray { #[doc(alias("One-sided", "onesided"))] @@ -551,6 +593,35 @@ impl ReadOnlyArray { self.reduce("sum") } + #[doc(alias("One-sided", "onesided"))] + /// Perform a sum reduction on the entire distributed array, returning the value to the calling PE. + /// + /// This equivalent to `reduce("sum")`. + /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Sum` active messages on the other PEs associated with the array. + /// the returned sum reduction result is only available on the calling PE + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// use rand::Rng; + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = AtomicArray::::new(&world,1000000,Distribution::Block); + /// let array_clone = array.clone(); + /// let req = array.local_iter().for_each(move |_| { + /// let index = rand::thread_rng().gen_range(0..array_clone.len()); + /// array_clone.add(index,1); //randomly at one to an element in the array. + /// }); + /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE + /// let sum = array.blocking_sum(); + /// assert_eq!(array.len()*num_pes,sum); + /// ``` + pub fn blocking_sum(&self) -> Option { + self.blocking_reduce("sum") + } + #[doc(alias("One-sided", "onesided"))] /// Perform a production reduction on the entire distributed array, returning the value to the calling PE. /// @@ -577,6 +648,33 @@ impl ReadOnlyArray { pub fn prod(&self) -> AmHandle> { self.reduce("prod") } + + #[doc(alias("One-sided", "onesided"))] + /// Perform a production reduction on the entire distributed array, returning the value to the calling PE. + /// + /// This equivalent to `reduce("prod")`. + /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Prod` active messages on the other PEs associated with the array. + /// the returned prod reduction result is only available on the calling PE + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = AtomicArray::::new(&world,10,Distribution::Block); + /// let req = array.dist_iter().enumerate().for_each(move |(i,elem)| { + /// elem.store(i+1); + /// }); + /// array.wait_all(); + /// array.barrier(); + /// let prod = array.blocking_prod(); + /// assert_eq!((1..=array.len()).product::(),prod); + ///``` + pub fn blocking_prod(&self) -> Option { + self.blocking_reduce("prod") + } } impl ReadOnlyArray { #[doc(alias("One-sided", "onesided"))] @@ -603,6 +701,30 @@ impl ReadOnlyArray { self.reduce("max") } + #[doc(alias("One-sided", "onesided"))] + /// Find the max element in the entire destributed array, returning to the calling PE + /// + /// This equivalent to `reduce("max")`. + /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Max` active messages on the other PEs associated with the array. 
+ /// the returned max reduction result is only available on the calling PE + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = AtomicArray::::new(&world,10,Distribution::Block); + /// let req = array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2)); + /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE + /// let max = array.blocking_max(); + /// assert_eq!((array.len()-1)*2,max); + ///``` + pub fn blocking_max(&self) -> Option { + self.blocking_reduce("max") + } + #[doc(alias("One-sided", "onesided"))] /// Find the min element in the entire destributed array, returning to the calling PE /// @@ -626,6 +748,30 @@ impl ReadOnlyArray { pub fn min(&self) -> AmHandle> { self.reduce("min") } + + #[doc(alias("One-sided", "onesided"))] + /// Find the min element in the entire destributed array, returning to the calling PE + /// + /// This equivalent to `reduce("min")`. + /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Min` active messages on the other PEs associated with the array. + /// the returned min reduction result is only available on the calling PE + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = AtomicArray::::new(&world,10,Distribution::Block); + /// let req = array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2)); + /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE + /// let min = array.blocking_min(); + /// assert_eq!(0,min); + ///``` + pub fn blocking_min(&self) -> Option { + self.blocking_reduce("min") + } } impl private::ArrayExecAm for ReadOnlyArray { diff --git a/src/array/unsafe.rs b/src/array/unsafe.rs index 8420027e..28e07ab1 100644 --- a/src/array/unsafe.rs +++ b/src/array/unsafe.rs @@ -1240,6 +1240,53 @@ impl UnsafeArray { self.reduce_data(op, self.clone().into()) } + #[doc(alias("One-sided", "onesided"))] + /// Perform a reduction on the entire distributed array, returning the value to the calling PE. + /// + /// Please see the documentation for the [register_reduction][lamellar_impl::register_reduction] procedural macro for + /// more details and examples on how to create your own reductions. + /// + /// # Safety + /// Data in UnsafeArrays are always unsafe as there are no protections on how remote PE's or local threads may access this PE's local data. + /// Any updates to local data are not guaranteed to be Atomic. + /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Reduce` active messages on the other PEs associated with the array. + /// the returned reduction result is only available on the calling PE + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// use rand::Rng; + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = AtomicArray::::new(&world,1000000,Distribution::Block); + /// let array_clone = array.clone(); + /// unsafe { // THIS IS NOT SAFE -- we are randomly updating elements, no protections, updates may be lost... 
DONT DO THIS + /// let req = array.local_iter().for_each(move |_| { + /// let index = rand::thread_rng().gen_range(0..array_clone.len()); + /// array_clone.add(index,1); //randomly at one to an element in the array. + /// }); + /// } + /// array.wait_all(); + /// array.barrier(); + /// let sum = array.blocking_reduce("sum"); // equivalent to calling array.sum() + /// //assert_eq!(array.len()*num_pes,sum); // may or may not fail + ///``` + pub unsafe fn blocking_reduce(&self, op: &str) -> Option { + if std::thread::current().id() != *crate::MAIN_THREAD { + let msg = format!(" + [LAMELLAR WARNING] You are calling `UnsafeArray::blocking_reduce` from within an async context which may lead to deadlock, it is recommended that you use `reduce(...).await;` instead! + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + ); + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } + } + self.block_on(self.reduce_data(op, self.clone().into())) + } + #[doc(alias("One-sided", "onesided"))] /// Perform a sum reduction on the entire distributed array, returning the value to the calling PE. /// @@ -1276,6 +1323,41 @@ impl UnsafeArray { self.reduce("sum") } + #[doc(alias("One-sided", "onesided"))] + /// Perform a sum reduction on the entire distributed array, returning the value to the calling PE. + /// + /// This equivalent to `reduce("sum")`. + /// + /// # Safety + /// Data in UnsafeArrays are always unsafe as there are no protections on how remote PE's or local threads may access this PE's local data. + /// Any updates to local data are not guaranteed to be Atomic. + /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Sum` active messages on the other PEs associated with the array. + /// the returned sum reduction result is only available on the calling PE + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// use rand::Rng; + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = UnsafeArray::::new(&world,1000000,Distribution::Block); + /// let array_clone = array.clone(); + /// unsafe { // THIS IS NOT SAFE -- we are randomly updating elements, no protections, updates may be lost... DONT DO THIS + /// let req = array.local_iter().for_each(move |_| { + /// let index = rand::thread_rng().gen_range(0..array_clone.len()); + /// array_clone.add(index,1); //randomly at one to an element in the array. + /// }); + /// } + /// array.wait_all(); + /// array.barrier(); + /// let sum = unsafe{array.blocking_sum()}; + ///``` + pub unsafe fn blocking_sum(&self) -> Option { + self.blocking_reduce("sum") + } + #[doc(alias("One-sided", "onesided"))] /// Perform a production reduction on the entire distributed array, returning the value to the calling PE. 
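All of the UnsafeArray reductions above, blocking or awaitable, are unsafe functions: nothing prevents other PEs or local threads from mutating elements mid-reduction, so the caller takes on that guarantee. Below is a minimal sketch under that assumption, borrowing the data-parallel initialization pattern from the surrounding doc examples; the array length is illustrative.

use lamellar::array::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();
    let array = UnsafeArray::<usize>::new(&world, 10, Distribution::Block);
    // each element is written by exactly one task, so this particular use is race-free
    let _req = unsafe { array.dist_iter_mut().enumerate().for_each(|(i, elem)| *elem = i + 1) };
    array.wait_all();
    array.barrier();

    // the caller, not the runtime, vouches that nothing else is updating the array here
    let prod = unsafe { array.blocking_prod() };
    assert_eq!(Some((1..=array.len()).product::<usize>()), prod);
}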
/// @@ -1304,15 +1386,50 @@ impl UnsafeArray { /// array.print(); /// array.wait_all(); /// array.print(); - /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE - /// array.print(); - /// let prod = array.block_on(array.prod()); + /// let prod = unsafe{ array.block_on(array.prod())}; /// assert_eq!((1..=array.len()).product::(),prod); ///``` pub unsafe fn prod(&self) -> AmHandle> { self.reduce("prod") } + #[doc(alias("One-sided", "onesided"))] + /// Perform a production reduction on the entire distributed array, returning the value to the calling PE. + /// + /// This equivalent to `reduce("prod")`. + /// + /// # Safety + /// Data in UnsafeArrays are always unsafe as there are no protections on how remote PE's or local threads may access this PE's local data. + /// Any updates to local data are not guaranteed to be Atomic. + /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Prod` active messages on the other PEs associated with the array. + /// the returned prod reduction result is only available on the calling PE + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// use rand::Rng; + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = UnsafeArray::::new(&world,10,Distribution::Block); + /// unsafe { + /// let req = array.dist_iter_mut().enumerate().for_each(move |(i,elem)| { + /// *elem = i+1; + /// }); + /// } + /// array.print(); + /// array.wait_all(); + /// array.print(); + /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE + /// array.print(); + /// let prod = unsafe{array.blocking_prod()}; + /// assert_eq!((1..=array.len()).product::(),prod); + ///``` + pub unsafe fn blocking_prod(&self) -> Option { + self.blocking_reduce("prod") + } + #[doc(alias("One-sided", "onesided"))] /// Find the max element in the entire destributed array, returning to the calling PE /// @@ -1344,6 +1461,36 @@ impl UnsafeArray { self.reduce("max") } + #[doc(alias("One-sided", "onesided"))] + /// Find the max element in the entire destributed array, returning to the calling PE + /// + /// This equivalent to `reduce("max")`. + /// + /// # Safety + /// Data in UnsafeArrays are always unsafe as there are no protections on how remote PE's or local threads may access this PE's local data. + /// Any updates to local data are not guaranteed to be Atomic. + /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Max` active messages on the other PEs associated with the array. 
+ /// the returned max reduction result is only available on the calling PE + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = UnsafeArray::::new(&world,10,Distribution::Block); + /// let array_clone = array.clone(); + /// unsafe{array.dist_iter_mut().enumerate().for_each(|(i,elem)| *elem = i*2)}; //safe as we are accessing in a data parallel fashion + /// array.wait_all(); + /// array.barrier(); + /// let max = unsafe{array.blocking_max()}; + /// assert_eq!((array.len()-1)*2,max); + ///``` + pub unsafe fn blocking_max(&self) -> Option { + self.blocking_reduce("max") + } + #[doc(alias("One-sided", "onesided"))] /// Find the min element in the entire distributed array, returning it to the calling PE /// @@ -1374,6 +1521,36 @@ impl UnsafeArray { pub unsafe fn min(&self) -> AmHandle> { self.reduce("min") } + + #[doc(alias("One-sided", "onesided"))] + /// Find the min element in the entire distributed array, returning it to the calling PE + /// + /// This is equivalent to `reduce("min")`. + /// + /// # Safety + /// Data in UnsafeArrays are always unsafe as there are no protections on how remote PE's or local threads may access this PE's local data. + /// Any updates to local data are not guaranteed to be Atomic. + /// + /// # One-sided Operation + /// The calling PE is responsible for launching `Min` active messages on the other PEs associated with the array. + /// the returned min reduction result is only available on the calling PE + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let num_pes = world.num_pes(); + /// let array = UnsafeArray::::new(&world,10,Distribution::Block); + /// let array_clone = array.clone(); + /// unsafe{array.dist_iter_mut().enumerate().for_each(|(i,elem)| *elem = i*2)}; //safe as we are accessing in a data parallel fashion + /// array.wait_all(); + /// array.barrier(); + /// let min = unsafe{array.blocking_min()}; + /// assert_eq!(0,min); + ///``` + pub unsafe fn blocking_min(&self) -> Option { + self.blocking_reduce("min") + } } impl UnsafeArrayInnerWeak { From 2d33a12d8c673392e271368153752d7b33f75257 Mon Sep 17 00:00:00 2001 From: "Ryan D.
Friese" Date: Wed, 17 Jul 2024 15:12:35 -0700 Subject: [PATCH 048/116] RDMA handles now include reference to array to prevent early dropping --- src/array/generic_atomic/rdma.rs | 7 ++++++- src/array/global_lock_atomic/rdma.rs | 5 +++++ src/array/handle.rs | 3 +++ src/array/local_lock_atomic/rdma.rs | 7 ++++++- src/array/native_atomic/rdma.rs | 7 ++++++- src/array/unsafe/rdma.rs | 15 ++++++++++++--- 6 files changed, 38 insertions(+), 6 deletions(-) diff --git a/src/array/generic_atomic/rdma.rs b/src/array/generic_atomic/rdma.rs index 842db995..355845ca 100644 --- a/src/array/generic_atomic/rdma.rs +++ b/src/array/generic_atomic/rdma.rs @@ -1,7 +1,7 @@ use std::collections::VecDeque; use crate::array::generic_atomic::*; -use crate::array::private::ArrayExecAm; +use crate::array::private::{ArrayExecAm, LamellarArrayPrivate}; use crate::array::LamellarWrite; use crate::array::*; use crate::memregion::{AsBase, Dist, RTMemoryRegionRDMA, RegisteredMemoryRegion}; @@ -18,6 +18,7 @@ impl LamellarArrayInternalGet for GenericAtomicArray { buf: buf.into(), }); ArrayRdmaHandle { + _array: self.as_lamellar_byte_array(), reqs: VecDeque::from([req.into()]), } } @@ -29,6 +30,7 @@ impl LamellarArrayInternalGet for GenericAtomicArray { buf: buf.clone().into(), }); ArrayRdmaAtHandle { + _array: self.as_lamellar_byte_array(), req: Some(req), buf: buf, } @@ -44,6 +46,7 @@ impl LamellarArrayGet for GenericAtomicArray { match buf.team_try_into(&self.array.team_rt()) { Ok(buf) => self.internal_get(index, buf), Err(_) => ArrayRdmaHandle { + _array: self.as_lamellar_byte_array(), reqs: VecDeque::new(), }, } @@ -65,6 +68,7 @@ impl LamellarArrayInternalPut for GenericAtomicArray { buf: buf.into(), }); ArrayRdmaHandle { + _array: self.as_lamellar_byte_array(), reqs: VecDeque::from([req.into()]), } } @@ -79,6 +83,7 @@ impl LamellarArrayPut for GenericAtomicArray { match buf.team_try_into(&self.array.team_rt()) { Ok(buf) => self.internal_put(index, buf), Err(_) => ArrayRdmaHandle { + _array: self.as_lamellar_byte_array(), reqs: VecDeque::new(), }, } diff --git a/src/array/global_lock_atomic/rdma.rs b/src/array/global_lock_atomic/rdma.rs index dbbd7fb6..9fc9110e 100644 --- a/src/array/global_lock_atomic/rdma.rs +++ b/src/array/global_lock_atomic/rdma.rs @@ -28,6 +28,7 @@ impl LamellarArrayInternalGet for GlobalLockArray { buf: buf.into(), }); ArrayRdmaHandle { + _array: self.as_lamellar_byte_array(), reqs: VecDeque::from([req.into()]), } } @@ -39,6 +40,7 @@ impl LamellarArrayInternalGet for GlobalLockArray { buf: buf.clone().into(), }); ArrayRdmaAtHandle { + _array: self.as_lamellar_byte_array(), req: Some(req), buf: buf, } @@ -54,6 +56,7 @@ impl LamellarArrayGet for GlobalLockArray { match buf.team_try_into(&self.array.team_rt()) { Ok(buf) => self.internal_get(index, buf), Err(_) => ArrayRdmaHandle { + _array: self.as_lamellar_byte_array(), reqs: VecDeque::new(), }, } @@ -75,6 +78,7 @@ impl LamellarArrayInternalPut for GlobalLockArray { buf: buf.into(), }); ArrayRdmaHandle { + _array: self.as_lamellar_byte_array(), reqs: VecDeque::from([req.into()]), } } @@ -89,6 +93,7 @@ impl LamellarArrayPut for GlobalLockArray { match buf.team_try_into(&self.array.team_rt()) { Ok(buf) => self.internal_put(index, buf), Err(_) => ArrayRdmaHandle { + _array: self.as_lamellar_byte_array(), reqs: VecDeque::new(), }, } diff --git a/src/array/handle.rs b/src/array/handle.rs index b2c24ecd..210058d1 100644 --- a/src/array/handle.rs +++ b/src/array/handle.rs @@ -9,12 +9,14 @@ use pin_project::pin_project; use crate::{ 
active_messaging::{AmHandle, LocalAmHandle}, + array::LamellarByteArray, lamellar_request::LamellarRequest, Dist, OneSidedMemoryRegion, RegisteredMemoryRegion, }; /// a task handle for an array rdma (put/get) operation pub struct ArrayRdmaHandle { + pub(crate) _array: LamellarByteArray, //prevents prematurely performing a local drop pub(crate) reqs: VecDeque>, } @@ -54,6 +56,7 @@ impl Future for ArrayRdmaHandle { /// a task handle for an array rdma 'at' operation #[pin_project] pub struct ArrayRdmaAtHandle { + pub(crate) _array: LamellarByteArray, //prevents prematurely performing a local drop pub(crate) req: Option>, pub(crate) buf: OneSidedMemoryRegion, } diff --git a/src/array/local_lock_atomic/rdma.rs b/src/array/local_lock_atomic/rdma.rs index cc5d7bc8..5c98dbf0 100644 --- a/src/array/local_lock_atomic/rdma.rs +++ b/src/array/local_lock_atomic/rdma.rs @@ -1,7 +1,7 @@ use std::collections::VecDeque; use crate::array::local_lock_atomic::*; -use crate::array::private::ArrayExecAm; +use crate::array::private::{ArrayExecAm, LamellarArrayPrivate}; use crate::array::LamellarWrite; use crate::array::*; use crate::memregion::{AsBase, Dist, RTMemoryRegionRDMA, RegisteredMemoryRegion}; @@ -21,6 +21,7 @@ impl LamellarArrayInternalGet for LocalLockArray { buf: buf.into(), }); ArrayRdmaHandle { + _array: self.as_lamellar_byte_array(), reqs: VecDeque::from([req.into()]), } } @@ -32,6 +33,7 @@ impl LamellarArrayInternalGet for LocalLockArray { buf: buf.clone().into(), }); ArrayRdmaAtHandle { + _array: self.as_lamellar_byte_array(), req: Some(req), buf: buf, } @@ -47,6 +49,7 @@ impl LamellarArrayGet for LocalLockArray { match buf.team_try_into(&self.array.team_rt()) { Ok(buf) => self.internal_get(index, buf), Err(_) => ArrayRdmaHandle { + _array: self.as_lamellar_byte_array(), reqs: VecDeque::new(), }, } @@ -68,6 +71,7 @@ impl LamellarArrayInternalPut for LocalLockArray { buf: buf.into(), }); ArrayRdmaHandle { + _array: self.as_lamellar_byte_array(), reqs: VecDeque::from([req.into()]), } } @@ -82,6 +86,7 @@ impl LamellarArrayPut for LocalLockArray { match buf.team_try_into(&self.array.team_rt()) { Ok(buf) => self.internal_put(index, buf), Err(_) => ArrayRdmaHandle { + _array: self.as_lamellar_byte_array(), reqs: VecDeque::new(), }, } diff --git a/src/array/native_atomic/rdma.rs b/src/array/native_atomic/rdma.rs index 3af2f45d..b9093c5d 100644 --- a/src/array/native_atomic/rdma.rs +++ b/src/array/native_atomic/rdma.rs @@ -1,7 +1,7 @@ use std::collections::VecDeque; use crate::array::native_atomic::*; -use crate::array::private::ArrayExecAm; +use crate::array::private::{ArrayExecAm, LamellarArrayPrivate}; use crate::array::LamellarWrite; use crate::array::*; use crate::memregion::{AsBase, Dist, RTMemoryRegionRDMA, RegisteredMemoryRegion}; @@ -18,6 +18,7 @@ impl LamellarArrayInternalGet for NativeAtomicArray { buf: buf.into(), }); ArrayRdmaHandle { + _array: self.as_lamellar_byte_array(), reqs: VecDeque::from([req.into()]), } } @@ -29,6 +30,7 @@ impl LamellarArrayInternalGet for NativeAtomicArray { buf: buf.clone().into(), }); ArrayRdmaAtHandle { + _array: self.as_lamellar_byte_array(), req: Some(req), buf: buf, } @@ -43,6 +45,7 @@ impl LamellarArrayGet for NativeAtomicArray { match buf.team_try_into(&self.array.team_rt()) { Ok(buf) => self.internal_get(index, buf), Err(_) => ArrayRdmaHandle { + _array: self.as_lamellar_byte_array(), reqs: VecDeque::new(), }, } @@ -64,6 +67,7 @@ impl LamellarArrayInternalPut for NativeAtomicArray { buf: buf.into(), }); ArrayRdmaHandle { + _array: 
self.as_lamellar_byte_array(), reqs: VecDeque::from([req.into()]), } } @@ -78,6 +82,7 @@ impl LamellarArrayPut for NativeAtomicArray { match buf.team_try_into(&self.array.team_rt()) { Ok(buf) => self.internal_put(index, buf), Err(_) => ArrayRdmaHandle { + _array: self.as_lamellar_byte_array(), reqs: VecDeque::new(), }, } diff --git a/src/array/unsafe/rdma.rs b/src/array/unsafe/rdma.rs index 3019602e..142a6149 100644 --- a/src/array/unsafe/rdma.rs +++ b/src/array/unsafe/rdma.rs @@ -1,6 +1,6 @@ use std::collections::VecDeque; -use crate::array::private::ArrayExecAm; +use crate::array::private::{ArrayExecAm, LamellarArrayPrivate}; use crate::array::r#unsafe::*; use crate::array::*; use crate::memregion::{ @@ -631,6 +631,7 @@ impl UnsafeArray { match buf.team_try_into(&self.team_rt()) { Ok(buf) => self.internal_get(index, buf), Err(_) => ArrayRdmaHandle { + _array: self.as_lamellar_byte_array(), reqs: VecDeque::new(), }, } @@ -640,6 +641,7 @@ impl UnsafeArray { let buf: OneSidedMemoryRegion = self.team_rt().alloc_one_sided_mem_region(1); self.blocking_get(index, &buf); ArrayRdmaAtHandle { + _array: self.as_lamellar_byte_array(), req: None, buf: buf, } @@ -731,7 +733,10 @@ impl LamellarArrayInternalGet for UnsafeArray { reqs.push_back(req.into()); reqs }; - ArrayRdmaHandle { reqs: reqs } + ArrayRdmaHandle { + _array: self.as_lamellar_byte_array(), + reqs: reqs, + } } unsafe fn internal_at(&self, index: usize) -> ArrayRdmaAtHandle { @@ -749,7 +754,10 @@ impl LamellarArrayInternalPut for UnsafeArray { Distribution::Block => self.block_op(ArrayRdmaCmd::PutAm, index, buf.into()), Distribution::Cyclic => self.cyclic_op(ArrayRdmaCmd::PutAm, index, buf.into()), }; - ArrayRdmaHandle { reqs: reqs } + ArrayRdmaHandle { + _array: self.as_lamellar_byte_array(), + reqs: reqs, + } } } @@ -762,6 +770,7 @@ impl LamellarArrayPut for UnsafeArray { match buf.team_try_into(&self.team_rt()) { Ok(buf) => self.internal_put(index, buf), Err(_) => ArrayRdmaHandle { + _array: self.as_lamellar_byte_array(), reqs: VecDeque::new(), }, } From c08eb3e19daac8e8ff1de0ed81e0a783234e5894 Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Wed, 17 Jul 2024 15:13:15 -0700 Subject: [PATCH 049/116] cleanup commented code --- src/array/operations.rs | 95 ----------------------------------------- 1 file changed, 95 deletions(-) diff --git a/src/array/operations.rs b/src/array/operations.rs index 202aaa10..585a3f6e 100644 --- a/src/array/operations.rs +++ b/src/array/operations.rs @@ -462,13 +462,6 @@ impl<'a, T: Dist> OpInput<'a, T> for &'a [T] { Some(n) => n, None => std::cmp::max(1, config().threads / 4), } - // match std::env::var("LAMELLAR_BATCH_OP_THREADS") { - // Ok(n) => n.parse::().unwrap(), - // Err(_) => match std::env::var("LAMELLAR_THREADS") { - // Ok(n) => std::cmp::max(1, (n.parse::().unwrap()) / 4), - // Err(_) => 4, - // }, - // } }; let num_per_batch = len / num; for i in 0..num { @@ -526,15 +519,6 @@ impl<'a, T: Dist> OpInput<'a, T> for &'a mut [T] { Some(n) => n, None => std::cmp::max(1, config().threads / 4), } - // match std::env::var("LAMELLAR_BATCH_OP_THREADS") { - // Ok(n) => n.parse::().unwrap(), - // Err(_) => { - // match std::env::var("LAMELLAR_THREADS") { - // Ok(n) => std::cmp::max(1, (n.parse::().unwrap()) / 4), - // Err(_) => 4, //+ 1 to account for main thread - // } - // } - // } }; let num_per_batch = len / num; for i in 0..num { @@ -566,33 +550,6 @@ impl<'a, T: Dist> OpInput<'a, T> for &'a mut Vec { } } -// impl<'a, T: Dist> OpInput<'a, T> for Vec { -// //#[tracing::instrument(skip_all)] -// fn as_op_input(mut self) -> (Vec>, usize) { -// let len = self.len(); -// let mut iters = vec![]; -// let num_per_batch = match std::env::var("LAMELLAR_OP_BATCH") { -// Ok(n) => n.parse::().unwrap(), -// Err(_) => 10000, -// }; -// let num = (len as f32 / num_per_batch as f32).ceil() as usize; -// println!("num: {}", num); -// for i in (1..num).rev() { -// let temp = self.split_off(i * num_per_batch); -// // println!("temp: {:?} {:?} {:?}", temp,i ,i * num_per_batch); -// iters.push(OpInputEnum::Vec(temp)); -// } -// let rem = len % num_per_batch; -// // println!("rem: {} {:?}", rem,self); -// // if rem > 0 || num == 1 { -// if self.len() > 0 { -// iters.push(OpInputEnum::Vec(self)); -// } -// iters.reverse(); //the indice slices get pushed in from the back, but we want to return in order -// (iters, len) -// } -// } - impl<'a, T: Dist> OpInput<'a, T> for Vec { //#[tracing::instrument(skip_all)] fn as_op_input(self) -> (Vec>, usize) { @@ -608,15 +565,6 @@ impl<'a, T: Dist> OpInput<'a, T> for Vec { Some(n) => n, None => std::cmp::max(1, config().threads / 4), } - // match std::env::var("LAMELLAR_BATCH_OP_THREADS") { - // Ok(n) => n.parse::().unwrap(), - // Err(_) => { - // match std::env::var("LAMELLAR_THREADS") { - // Ok(n) => std::cmp::max(1, (n.parse::().unwrap()) / 4), - // Err(_) => 4, //+ 1 to account for main thread - // } - // } - // } }; let num_per_batch = len / num; let iters = self @@ -629,13 +577,6 @@ impl<'a, T: Dist> OpInput<'a, T> for Vec { } } -// impl<'a, T: Dist, I: Iterator> OpInput<'a, T> for I { -// //#[tracing::instrument(skip_all)] -// fn as_op_input(self) -> (Vec>, usize) { -// self.collect::>().as_op_input() -// } -// } - // impl<'a, T: Dist> OpInput<'a, T> for &OneSidedMemoryRegion { // //#[tracing::instrument(skip_all)] // fn as_op_input(self) -> (Vec>, usize) { @@ -737,15 +678,6 @@ impl<'a, T: Dist> OpInput<'a, T> for &'a LocalLockLocalData { Some(n) => n, None => std::cmp::max(1, config().threads / 4), } - // match std::env::var("LAMELLAR_BATCH_OP_THREADS") { - // Ok(n) => n.parse::().unwrap(), - // Err(_) => { - // match std::env::var("LAMELLAR_THREADS") { - // 
Ok(n) => std::cmp::max(1, (n.parse::().unwrap()) / 4), //+ 1 to account for main thread - // Err(_) => 4, //+ 1 to account for main thread - // } - // } - // } }; let num_per_batch = len / num; // println!("num: {} len {:?} npb {:?}", num, len, num_per_batch); @@ -786,15 +718,6 @@ impl<'a, T: Dist> OpInput<'a, T> for &'a GlobalLockLocalData { Some(n) => n, None => std::cmp::max(1, config().threads / 4), } - // match std::env::var("LAMELLAR_BATCH_OP_THREADS") { - // Ok(n) => n.parse::().unwrap(), - // Err(_) => { - // match std::env::var("LAMELLAR_THREADS") { - // Ok(n) => std::cmp::max(1, (n.parse::().unwrap()) / 4), //+ 1 to account for main thread - // Err(_) => 4, //+ 1 to account for main thread - // } - // } - // } }; let num_per_batch = len / num; // println!("num: {} len {:?} npb {:?}", num, len, num_per_batch); @@ -869,15 +792,6 @@ impl<'a, T: Dist + ElementOps> OpInput<'a, T> for &GenericAtomicLocalData { Some(n) => n, None => std::cmp::max(1, config().threads / 4), } - // match std::env::var("LAMELLAR_BATCH_OP_THREADS") { - // Ok(n) => n.parse::().unwrap(), - // Err(_) => { - // match std::env::var("LAMELLAR_THREADS") { - // Ok(n) => std::cmp::max(1, (n.parse::().unwrap()) / 4), - // Err(_) => 4, //+ 1 to account for main thread - // } - // } - // } }; let num_per_batch = len / num; for i in 0..num { @@ -923,15 +837,6 @@ impl<'a, T: Dist + ElementOps> OpInput<'a, T> for &NativeAtomicLocalData { Some(n) => n, None => std::cmp::max(1, config().threads / 4), } - // match std::env::var("LAMELLAR_BATCH_OP_THREADS") { - // Ok(n) => n.parse::().unwrap(), - // Err(_) => { - // match std::env::var("LAMELLAR_THREADS") { - // Ok(n) => std::cmp::max(1, (n.parse::().unwrap()) / 4), - // Err(_) => 4, //+ 1 to account for main thread - // } - // } - // } }; let num_per_batch = len / num; // println!("num: {} len {:?} npb {:?}", num, len, num_per_batch); From c3fac9cb61cbc9d8f5e8df4ee40bd99a974783ac Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Thu, 18 Jul 2024 13:59:50 -0700 Subject: [PATCH 050/116] refactor DistIterLauncher to use default implementations so we dont have to reimplement for every array type --- README.md | 2 +- examples/array_examples/global_lock_array.rs | 2 +- examples/array_examples/onesided_iteration.rs | 2 +- src/array.rs | 6 + src/array/atomic.rs | 8 + src/array/atomic/iteration.rs | 16 +- src/array/generic_atomic.rs | 2 +- src/array/generic_atomic/iteration.rs | 325 ++++++++------- src/array/global_lock_atomic/iteration.rs | 390 +++++++++--------- src/array/iterator/distributed_iterator.rs | 106 ++++- src/array/iterator/mod.rs | 2 +- src/array/iterator/one_sided_iterator.rs | 4 +- src/array/local_lock_atomic/iteration.rs | 324 ++++++++------- src/array/native_atomic.rs | 2 +- src/array/native_atomic/iteration.rs | 327 ++++++++------- src/array/read_only/iteration.rs | 324 ++++++++------- src/array/unsafe.rs | 31 +- src/array/unsafe/iteration.rs | 2 +- src/array/unsafe/iteration/consumer.rs | 58 ++- src/array/unsafe/iteration/distributed.rs | 284 ++++++++++++- src/array/unsafe/iteration/local.rs | 60 +-- src/lib.rs | 2 +- tests/array/arithmetic_ops/add_test.rs | 29 +- tests/array/arithmetic_ops/div_test.rs | 15 +- tests/array/arithmetic_ops/fetch_add_test.rs | 34 +- tests/array/arithmetic_ops/fetch_div_test.rs | 15 +- tests/array/arithmetic_ops/fetch_mul_test.rs | 15 +- tests/array/arithmetic_ops/fetch_rem_test.rs | 15 +- tests/array/arithmetic_ops/fetch_sub_test.rs | 21 +- tests/array/arithmetic_ops/mul_test.rs | 15 +- tests/array/arithmetic_ops/rem_test.rs | 15 +- tests/array/arithmetic_ops/sub_test.rs | 21 +- .../array/atomic_ops/compare_exchange_test.rs | 2 + tests/array/atomic_ops/load_store_test.rs | 9 + tests/array/atomic_ops/swap_test.rs | 2 + tests/array/bitwise_ops/and_test.rs | 17 +- tests/array/bitwise_ops/fetch_and_test.rs | 15 +- tests/array/bitwise_ops/fetch_or_test.rs | 15 +- tests/array/bitwise_ops/fetch_xor_test.rs | 15 +- tests/array/bitwise_ops/or_test.rs | 15 +- tests/array/bitwise_ops/xor_test.rs | 16 +- tests/array/rdma/put_test.rs | 15 +- 42 files changed, 1583 insertions(+), 1012 deletions(-) diff --git a/README.md b/README.md index 2aae3b6d..780929bf 100644 --- a/README.md +++ b/README.md @@ -136,7 +136,7 @@ fn main(){ block_array.wait_all(); block_array.barrier(); if my_pe == 0{ - for (i,elem) in block_array.onesided_iter().into_iter().enumerate(){ //iterate through entire array on pe 0 (automatically transfering remote data) + for (i,elem) in block_onesided_iter!($array,array).into_iter().enumerate(){ //iterate through entire array on pe 0 (automatically transfering remote data) println!("i: {} = {})",i,elem); } } diff --git a/examples/array_examples/global_lock_array.rs b/examples/array_examples/global_lock_array.rs index 8b904396..6521432d 100644 --- a/examples/array_examples/global_lock_array.rs +++ b/examples/array_examples/global_lock_array.rs @@ -47,7 +47,7 @@ fn main() { array.print(); println!("PE{my_pe} time: {:?} done", s.elapsed().as_secs_f64()); - let task = array.dist_iter().enumerate().for_each(move |(i, elem)| { + let task = array.blocking_read_lock().dist_iter().enumerate().for_each(move |(i, elem)| { println!( "{my_pe}, {:?}: {i} {:?}", std::thread::current().id(), diff --git a/examples/array_examples/onesided_iteration.rs b/examples/array_examples/onesided_iteration.rs index eb5efbc3..23b471ca 100644 --- a/examples/array_examples/onesided_iteration.rs +++ b/examples/array_examples/onesided_iteration.rs @@ -104,7 +104,7 @@ fn main() { 
println!("--------------------------------------------------------"); // let block_array = UnsafeArray::::new(world.team(), ARRAY_LEN, Distribution::Block); - // for elem in block_array.onesided_iter().into_iter().step_by(4) {...} + // for elem in block_onesided_iter!($array,array).into_iter().step_by(4) {...} // for elem in block_array.buffered_onesided_iter(10) {...} // //rust step_by pseudo code diff --git a/src/array.rs b/src/array.rs index 06c8ff30..0c48ef39 100644 --- a/src/array.rs +++ b/src/array.rs @@ -809,6 +809,12 @@ impl crate::active_messaging::DarcSerde for LamellarWriteArra // } // } +// private sealed trait +#[doc(hidden)] +pub trait InnerArray: Sized { + fn as_inner(&self) -> &r#unsafe::private::UnsafeArrayInner; +} + pub(crate) mod private { use crate::active_messaging::*; use crate::array::{ diff --git a/src/array/atomic.rs b/src/array/atomic.rs index 89578d00..e58d6687 100644 --- a/src/array/atomic.rs +++ b/src/array/atomic.rs @@ -3,6 +3,7 @@ pub(crate) mod operations; pub(crate) mod rdma; use crate::array::generic_atomic::{GenericAtomicElement, LocalGenericAtomicElement}; + use crate::array::native_atomic::NativeAtomicElement; use crate::array::private::LamellarArrayPrivate; use crate::array::*; @@ -510,6 +511,7 @@ impl std::fmt::Debug for AtomicElement { /// /// Generally any operation on this array type will be performed via an internal runtime Active Message, i.e. direct RDMA operations are not allowed #[enum_dispatch(LamellarArray,LamellarEnv,LamellarArrayInternalGet,LamellarArrayInternalPut,ArrayExecAm,LamellarArrayPrivate,DistIteratorLauncher,LocalIteratorLauncher)] +// #[enum_dispatch(LamellarArray,LamellarEnv,LamellarArrayInternalGet,LamellarArrayInternalPut,ArrayExecAm,LamellarArrayPrivate)] #[derive(serde::Serialize, serde::Deserialize, Clone, Debug)] #[serde(bound = "T: Dist + serde::Serialize + serde::de::DeserializeOwned + 'static")] pub enum AtomicArray { @@ -519,6 +521,12 @@ pub enum AtomicArray { GenericAtomicArray(GenericAtomicArray), } +// impl DistIteratorLauncher for AtomicArray { +// // type Inner = Self; +// } + +// impl LocalIteratorLauncher for AtomicArray {} + impl crate::active_messaging::DarcSerde for AtomicArray { fn ser(&self, num_pes: usize, darcs: &mut Vec) { match self { diff --git a/src/array/atomic/iteration.rs b/src/array/atomic/iteration.rs index 222b5813..3dd68d87 100644 --- a/src/array/atomic/iteration.rs +++ b/src/array/atomic/iteration.rs @@ -1,11 +1,25 @@ use crate::array::atomic::*; + use crate::array::iterator::distributed_iterator::*; use crate::array::iterator::local_iterator::*; use crate::array::iterator::one_sided_iterator::OneSidedIter; -use crate::array::iterator::{private::*, LamellarArrayIterators, LamellarArrayMutIterators}; +use crate::array::iterator::{ + private::{IterClone, Sealed}, + LamellarArrayIterators, LamellarArrayMutIterators, +}; +use crate::array::r#unsafe::private::UnsafeArrayInner; use crate::array::*; use crate::memregion::Dist; +impl InnerArray for AtomicArray { + fn as_inner(&self) -> &UnsafeArrayInner { + match &self { + AtomicArray::NativeAtomicArray(a) => a.as_inner(), + AtomicArray::GenericAtomicArray(a) => a.as_inner(), + } + } +} + #[derive(Clone)] pub struct AtomicDistIter { //dont need a AtomicDistIterMut in this case as any updates to inner elements are atomic diff --git a/src/array/generic_atomic.rs b/src/array/generic_atomic.rs index 98e2bd3d..6ffa395d 100644 --- a/src/array/generic_atomic.rs +++ b/src/array/generic_atomic.rs @@ -2,7 +2,7 @@ pub(crate) mod iteration; pub(crate) mod 
operations; mod rdma; use crate::array::atomic::AtomicElement; -use crate::array::private::LamellarArrayPrivate; +// use crate::array::private::LamellarArrayPrivate; use crate::array::r#unsafe::{UnsafeByteArray, UnsafeByteArrayWeak}; use crate::array::*; use crate::darc::Darc; diff --git a/src/array/generic_atomic/iteration.rs b/src/array/generic_atomic/iteration.rs index 6cdf4910..3a3f2080 100644 --- a/src/array/generic_atomic/iteration.rs +++ b/src/array/generic_atomic/iteration.rs @@ -1,10 +1,12 @@ use crate::array::generic_atomic::*; + use crate::array::iterator::distributed_iterator::*; use crate::array::iterator::local_iterator::*; use crate::array::iterator::one_sided_iterator::OneSidedIter; use crate::array::iterator::{ private::*, LamellarArrayIterators, LamellarArrayMutIterators, Schedule, }; +use crate::array::r#unsafe::private::UnsafeArrayInner; use crate::array::*; use crate::memregion::Dist; // use parking_lot::{ @@ -12,6 +14,12 @@ use crate::memregion::Dist; // RawRwLock, // }; +impl InnerArray for GenericAtomicArray { + fn as_inner(&self) -> &UnsafeArrayInner { + &self.array.inner + } +} + //#[doc(hidden)] #[derive(Clone)] pub struct GenericAtomicDistIter { @@ -210,164 +218,165 @@ impl LamellarArrayMutIterators for GenericAtomicArray { } } -impl DistIteratorLauncher for GenericAtomicArray { - fn global_index_from_local(&self, index: usize, chunk_size: usize) -> Option { - self.array.global_index_from_local(index, chunk_size) - } - - fn subarray_index_from_local(&self, index: usize, chunk_size: usize) -> Option { - self.array.subarray_index_from_local(index, chunk_size) - } - - // fn subarray_pe_and_offset_for_global_index(&self, index: usize, chunk_size: usize) -> Option<(usize,usize)> { - // self.array.subarray_pe_and_offset_for_global_index(index, chunk_size) - // } - - fn for_each(&self, iter: &I, op: F) -> DistIterForEachHandle - where - I: DistributedIterator + 'static, - F: Fn(I::Item) + SyncSend + Clone + 'static, - { - DistIteratorLauncher::for_each(&self.array, iter, op) - } - fn for_each_with_schedule( - &self, - sched: Schedule, - iter: &I, - op: F, - ) -> DistIterForEachHandle - where - I: DistributedIterator + 'static, - F: Fn(I::Item) + SyncSend + Clone + 'static, - { - DistIteratorLauncher::for_each_with_schedule(&self.array, sched, iter, op) - } - fn for_each_async(&self, iter: &I, op: F) -> DistIterForEachHandle - where - I: DistributedIterator + 'static, - F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, - Fut: Future + Send + 'static, - { - DistIteratorLauncher::for_each_async(&self.array, iter, op) - } - fn for_each_async_with_schedule( - &self, - sched: Schedule, - iter: &I, - op: F, - ) -> DistIterForEachHandle - where - I: DistributedIterator + 'static, - F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, - Fut: Future + Send + 'static, - { - DistIteratorLauncher::for_each_async_with_schedule(&self.array, sched, iter, op) - } - - fn reduce(&self, iter: &I, op: F) -> DistIterReduceHandle - where - I: DistributedIterator + 'static, - I::Item: Dist + ArrayOps, - F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, - { - DistIteratorLauncher::reduce(&self.array, iter, op) - } - - fn reduce_with_schedule( - &self, - sched: Schedule, - iter: &I, - op: F, - ) -> DistIterReduceHandle - where - I: DistributedIterator + 'static, - I::Item: Dist + ArrayOps, - F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, - { - DistIteratorLauncher::reduce_with_schedule(&self.array, sched, iter, op) - } - - fn collect(&self, iter: &I, d: 
Distribution) -> DistIterCollectHandle - where - I: DistributedIterator + 'static, - I::Item: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - DistIteratorLauncher::collect(&self.array, iter, d) - } - - fn collect_with_schedule( - &self, - sched: Schedule, - iter: &I, - d: Distribution, - ) -> DistIterCollectHandle - where - I: DistributedIterator + 'static, - I::Item: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - DistIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) - } - fn collect_async(&self, iter: &I, d: Distribution) -> DistIterCollectHandle - where - I: DistributedIterator, - I::Item: Future + Send + 'static, - B: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - DistIteratorLauncher::collect_async(&self.array, iter, d) - } - - fn collect_async_with_schedule( - &self, - sched: Schedule, - iter: &I, - d: Distribution, - ) -> DistIterCollectHandle - where - I: DistributedIterator, - I::Item: Future + Send + 'static, - B: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - DistIteratorLauncher::collect_async_with_schedule(&self.array, sched, iter, d) - } - - fn count(&self, iter: &I) -> DistIterCountHandle - where - I: DistributedIterator + 'static, - { - DistIteratorLauncher::count(&self.array, iter) - } - - fn count_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterCountHandle - where - I: DistributedIterator + 'static, - { - DistIteratorLauncher::count_with_schedule(&self.array, sched, iter) - } - - fn sum(&self, iter: &I) -> DistIterSumHandle - where - I: DistributedIterator + 'static, - I::Item: Dist + ArrayOps + std::iter::Sum, - { - DistIteratorLauncher::sum(&self.array, iter) - } - - fn sum_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterSumHandle - where - I: DistributedIterator + 'static, - I::Item: Dist + ArrayOps + std::iter::Sum, - { - DistIteratorLauncher::sum_with_schedule(&self.array, sched, iter) - } - - fn team(&self) -> Pin> { - self.array.team_rt().clone() - } -} +impl DistIteratorLauncher for GenericAtomicArray {} +// // type Inner = Self; +// fn global_index_from_local(&self, index: usize, chunk_size: usize) -> Option { +// self.array.global_index_from_local(index, chunk_size) +// } + +// fn subarray_index_from_local(&self, index: usize, chunk_size: usize) -> Option { +// self.array.subarray_index_from_local(index, chunk_size) +// } + +// // fn subarray_pe_and_offset_for_global_index(&self, index: usize, chunk_size: usize) -> Option<(usize,usize)> { +// // self.array.subarray_pe_and_offset_for_global_index(index, chunk_size) +// // } + +// fn for_each(&self, iter: &I, op: F) -> DistIterForEachHandle +// where +// I: DistributedIterator + 'static, +// F: Fn(I::Item) + SyncSend + Clone + 'static, +// { +// DistIteratorLauncher::for_each(&self.array, iter, op) +// } +// fn for_each_with_schedule( +// &self, +// sched: Schedule, +// iter: &I, +// op: F, +// ) -> DistIterForEachHandle +// where +// I: DistributedIterator + 'static, +// F: Fn(I::Item) + SyncSend + Clone + 'static, +// { +// DistIteratorLauncher::for_each_with_schedule(&self.array, sched, iter, op) +// } +// fn for_each_async(&self, iter: &I, op: F) -> DistIterForEachHandle +// where +// I: DistributedIterator + 'static, +// F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, +// Fut: Future + Send + 'static, +// { +// DistIteratorLauncher::for_each_async(&self.array, iter, op) +// } +// fn 
for_each_async_with_schedule( +// &self, +// sched: Schedule, +// iter: &I, +// op: F, +// ) -> DistIterForEachHandle +// where +// I: DistributedIterator + 'static, +// F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, +// Fut: Future + Send + 'static, +// { +// DistIteratorLauncher::for_each_async_with_schedule(&self.array, sched, iter, op) +// } + +// fn reduce(&self, iter: &I, op: F) -> DistIterReduceHandle +// where +// I: DistributedIterator + 'static, +// I::Item: Dist + ArrayOps, +// F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, +// { +// DistIteratorLauncher::reduce(&self.array, iter, op) +// } + +// fn reduce_with_schedule( +// &self, +// sched: Schedule, +// iter: &I, +// op: F, +// ) -> DistIterReduceHandle +// where +// I: DistributedIterator + 'static, +// I::Item: Dist + ArrayOps, +// F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, +// { +// DistIteratorLauncher::reduce_with_schedule(&self.array, sched, iter, op) +// } + +// fn collect(&self, iter: &I, d: Distribution) -> DistIterCollectHandle +// where +// I: DistributedIterator + 'static, +// I::Item: Dist + ArrayOps, +// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, +// { +// DistIteratorLauncher::collect(&self.array, iter, d) +// } + +// fn collect_with_schedule( +// &self, +// sched: Schedule, +// iter: &I, +// d: Distribution, +// ) -> DistIterCollectHandle +// where +// I: DistributedIterator + 'static, +// I::Item: Dist + ArrayOps, +// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, +// { +// DistIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) +// } +// fn collect_async(&self, iter: &I, d: Distribution) -> DistIterCollectHandle +// where +// I: DistributedIterator, +// I::Item: Future + Send + 'static, +// B: Dist + ArrayOps, +// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, +// { +// DistIteratorLauncher::collect_async(&self.array, iter, d) +// } + +// fn collect_async_with_schedule( +// &self, +// sched: Schedule, +// iter: &I, +// d: Distribution, +// ) -> DistIterCollectHandle +// where +// I: DistributedIterator, +// I::Item: Future + Send + 'static, +// B: Dist + ArrayOps, +// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, +// { +// DistIteratorLauncher::collect_async_with_schedule(&self.array, sched, iter, d) +// } + +// fn count(&self, iter: &I) -> DistIterCountHandle +// where +// I: DistributedIterator + 'static, +// { +// DistIteratorLauncher::count(&self.array, iter) +// } + +// fn count_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterCountHandle +// where +// I: DistributedIterator + 'static, +// { +// DistIteratorLauncher::count_with_schedule(&self.array, sched, iter) +// } + +// fn sum(&self, iter: &I) -> DistIterSumHandle +// where +// I: DistributedIterator + 'static, +// I::Item: Dist + ArrayOps + std::iter::Sum, +// { +// DistIteratorLauncher::sum(&self.array, iter) +// } + +// fn sum_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterSumHandle +// where +// I: DistributedIterator + 'static, +// I::Item: Dist + ArrayOps + std::iter::Sum, +// { +// DistIteratorLauncher::sum_with_schedule(&self.array, sched, iter) +// } + +// fn team(&self) -> Pin> { +// self.array.team_rt().clone() +// } +// } impl LocalIteratorLauncher for GenericAtomicArray { fn local_global_index_from_local(&self, index: usize, chunk_size: usize) -> Option { diff --git a/src/array/global_lock_atomic/iteration.rs b/src/array/global_lock_atomic/iteration.rs index 
a5f642ff..8ca22109 100644 --- a/src/array/global_lock_atomic/iteration.rs +++ b/src/array/global_lock_atomic/iteration.rs @@ -1,20 +1,27 @@ use crate::array::global_lock_atomic::*; + use crate::array::iterator::distributed_iterator::*; use crate::array::iterator::local_iterator::*; use crate::array::iterator::one_sided_iterator::OneSidedIter; use crate::array::iterator::{ - private::*, LamellarArrayIterators, LamellarArrayMutIterators, Schedule, + private::{IterClone, Sealed}, + LamellarArrayIterators, LamellarArrayMutIterators, Schedule, }; use crate::array::private::LamellarArrayPrivate; +use crate::array::r#unsafe::private::UnsafeArrayInner; use crate::array::*; -use crate::darc::global_rw_darc::GlobalRwDarcReadGuard; use crate::memregion::Dist; +impl InnerArray for GlobalLockArray { + fn as_inner(&self) -> &UnsafeArrayInner { + &self.array.inner + } +} + //#[doc(hidden)] #[derive(Clone)] pub struct GlobalLockDistIter { - data: GlobalLockArray, - lock: GlobalRwDarcReadGuard<()>, + array_guard: GlobalLockReadGuard, cur_i: usize, end_i: usize, _marker: PhantomData<&'static T>, @@ -23,8 +30,7 @@ pub struct GlobalLockDistIter { impl IterClone for GlobalLockDistIter { fn iter_clone(&self, _: Sealed) -> Self { GlobalLockDistIter { - data: self.data.clone(), - lock: self.lock.clone(), + array_guard: self.array_guard.clone(), cur_i: self.cur_i, end_i: self.end_i, _marker: PhantomData, @@ -37,7 +43,7 @@ impl std::fmt::Debug for GlobalLockDistIter { write!( f, "GlobalLockDistIter{{ data.len: {:?}, cur_i: {:?}, end_i: {:?} }}", - self.data.len(), + self.array_guard.array.len(), self.cur_i, self.end_i ) @@ -47,8 +53,7 @@ impl std::fmt::Debug for GlobalLockDistIter { //#[doc(hidden)] #[derive(Clone)] pub struct GlobalLockLocalIter { - data: GlobalLockArray, - lock: GlobalRwDarcReadGuard<()>, + array_guard: GlobalLockReadGuard, cur_i: usize, end_i: usize, _marker: PhantomData<&'static T>, @@ -57,8 +62,7 @@ pub struct GlobalLockLocalIter { impl IterClone for GlobalLockLocalIter { fn iter_clone(&self, _: Sealed) -> Self { GlobalLockLocalIter { - data: self.data.clone(), - lock: self.lock.clone(), + array_guard: self.array_guard.clone(), cur_i: self.cur_i, end_i: self.end_i, _marker: PhantomData, @@ -71,7 +75,7 @@ impl std::fmt::Debug for GlobalLockLocalIter { write!( f, "GlobalLockLocalIter{{ data.len: {:?}, cur_i: {:?}, end_i: {:?} }}", - self.data.len(), + self.array_guard.array.len(), self.cur_i, self.end_i ) @@ -82,24 +86,24 @@ impl DistributedIterator for GlobalLockDistIter { type Item = &'static T; type Array = GlobalLockArray; fn init(&self, start_i: usize, cnt: usize) -> Self { - let max_i = self.data.num_elems_local(); + let max_i = self.array_guard.array.num_elems_local(); // println!("init dist iter start_i: {:?} cnt {:?} end_i: {:?} max_i: {:?}",start_i,cnt, start_i+cnt,max_i); GlobalLockDistIter { - data: self.data.clone(), - lock: self.lock.clone(), + array_guard: self.array_guard.clone(), cur_i: std::cmp::min(start_i, max_i), end_i: std::cmp::min(start_i + cnt, max_i), _marker: PhantomData, } } fn array(&self) -> Self::Array { - self.data.clone() + self.array_guard.array.clone() } fn next(&mut self) -> Option { if self.cur_i < self.end_i { self.cur_i += 1; unsafe { - self.data + self.array_guard + .array .array .local_as_ptr() .offset((self.cur_i - 1) as isize) @@ -118,7 +122,7 @@ impl DistributedIterator for GlobalLockDistIter { } impl IndexedDistributedIterator for GlobalLockDistIter { fn iterator_index(&self, index: usize) -> Option { - let g_index = 
self.data.subarray_index_from_local(index, 1); + let g_index = self.array_guard.array.subarray_index_from_local(index, 1); g_index } } @@ -127,24 +131,24 @@ impl LocalIterator for GlobalLockLocalIter { type Item = &'static T; type Array = GlobalLockArray; fn init(&self, start_i: usize, cnt: usize) -> Self { - let max_i = self.data.num_elems_local(); + let max_i = self.array_guard.array.num_elems_local(); // println!("init dist iter start_i: {:?} cnt {:?} end_i: {:?} max_i: {:?}",start_i,cnt, start_i+cnt,max_i); GlobalLockLocalIter { - data: self.data.clone(), - lock: self.lock.clone(), + array_guard: self.array_guard.clone(), cur_i: std::cmp::min(start_i, max_i), end_i: std::cmp::min(start_i + cnt, max_i), _marker: PhantomData, } } fn array(&self) -> Self::Array { - self.data.clone() + self.array_guard.array.clone() } fn next(&mut self) -> Option { if self.cur_i < self.end_i { self.cur_i += 1; unsafe { - self.data + self.array_guard + .array .array .local_as_ptr() .offset((self.cur_i - 1) as isize) @@ -165,7 +169,7 @@ impl LocalIterator for GlobalLockLocalIter { impl IndexedLocalIterator for GlobalLockLocalIter { fn iterator_index(&self, index: usize) -> Option { - if index < self.data.len() { + if index < self.array_guard.array.len() { Some(index) //everyone at this point as calculated the actual index (cause we are local only) so just return it } else { None @@ -338,19 +342,15 @@ impl IndexedLocalIterator for GlobalLockLocalIterMut { } } -impl LamellarArrayIterators for GlobalLockArray { +impl LamellarArrayIterators for GlobalLockReadGuard { // type Array = GlobalLockArray; type DistIter = GlobalLockDistIter; type LocalIter = GlobalLockLocalIter; - type OnesidedIter = OneSidedIter<'static, T, Self>; + type OnesidedIter = OneSidedIter<'static, T, GlobalLockArray>; fn dist_iter(&self) -> Self::DistIter { - let lock: GlobalRwDarc<()> = self.lock.clone(); - let lock = self.array.block_on(async move { lock.read().await }); - self.barrier(); GlobalLockDistIter { - data: self.clone(), - lock: lock, + array_guard: self.clone(), cur_i: 0, end_i: 0, _marker: PhantomData, @@ -358,11 +358,8 @@ impl LamellarArrayIterators for GlobalLockArray { } fn local_iter(&self) -> Self::LocalIter { - let lock: GlobalRwDarc<()> = self.lock.clone(); - let lock = self.array.block_on(async move { lock.read().await }); GlobalLockLocalIter { - data: self.clone(), - lock: lock, + array_guard: self.clone(), cur_i: 0, end_i: 0, _marker: PhantomData, @@ -370,14 +367,14 @@ impl LamellarArrayIterators for GlobalLockArray { } fn onesided_iter(&self) -> Self::OnesidedIter { - OneSidedIter::new(self.clone().into(), self.array.team_rt().clone(), 1) + OneSidedIter::new(self.array.clone().into(), self.array.team_rt().clone(), 1) } fn buffered_onesided_iter(&self, buf_size: usize) -> Self::OnesidedIter { OneSidedIter::new( - self.clone().into(), + self.array.clone().into(), self.array.team_rt().clone(), - std::cmp::min(buf_size, self.len()), + std::cmp::min(buf_size, self.array.len()), ) } } @@ -416,164 +413,165 @@ impl LamellarArrayMutIterators for GlobalLockArray { } } -impl DistIteratorLauncher for GlobalLockArray { - fn global_index_from_local(&self, index: usize, chunk_size: usize) -> Option { - self.array.global_index_from_local(index, chunk_size) - } - - fn subarray_index_from_local(&self, index: usize, chunk_size: usize) -> Option { - self.array.subarray_index_from_local(index, chunk_size) - } - - // fn subarray_pe_and_offset_for_global_index(&self, index: usize, chunk_size: usize) -> Option<(usize,usize)> { - // 
self.array.subarray_pe_and_offset_for_global_index(index, chunk_size) - // } - - fn for_each(&self, iter: &I, op: F) -> DistIterForEachHandle - where - I: DistributedIterator + 'static, - F: Fn(I::Item) + SyncSend + Clone + 'static, - { - DistIteratorLauncher::for_each(&self.array, iter, op) - } - fn for_each_with_schedule( - &self, - sched: Schedule, - iter: &I, - op: F, - ) -> DistIterForEachHandle - where - I: DistributedIterator + 'static, - F: Fn(I::Item) + SyncSend + Clone + 'static, - { - DistIteratorLauncher::for_each_with_schedule(&self.array, sched, iter, op) - } - fn for_each_async(&self, iter: &I, op: F) -> DistIterForEachHandle - where - I: DistributedIterator + 'static, - F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, - Fut: Future + Send + 'static, - { - DistIteratorLauncher::for_each_async(&self.array, iter, op) - } - fn for_each_async_with_schedule( - &self, - sched: Schedule, - iter: &I, - op: F, - ) -> DistIterForEachHandle - where - I: DistributedIterator + 'static, - F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, - Fut: Future + Send + 'static, - { - DistIteratorLauncher::for_each_async_with_schedule(&self.array, sched, iter, op) - } - - fn reduce(&self, iter: &I, op: F) -> DistIterReduceHandle - where - I: DistributedIterator + 'static, - I::Item: Dist + ArrayOps, - F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, - { - DistIteratorLauncher::reduce(&self.array, iter, op) - } - - fn reduce_with_schedule( - &self, - sched: Schedule, - iter: &I, - op: F, - ) -> DistIterReduceHandle - where - I: DistributedIterator + 'static, - I::Item: Dist + ArrayOps, - F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, - { - DistIteratorLauncher::reduce_with_schedule(&self.array, sched, iter, op) - } - - fn collect(&self, iter: &I, d: Distribution) -> DistIterCollectHandle - where - I: DistributedIterator + 'static, - I::Item: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - DistIteratorLauncher::collect(&self.array, iter, d) - } - - fn collect_with_schedule( - &self, - sched: Schedule, - iter: &I, - d: Distribution, - ) -> DistIterCollectHandle - where - I: DistributedIterator + 'static, - I::Item: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - DistIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) - } - fn collect_async(&self, iter: &I, d: Distribution) -> DistIterCollectHandle - where - I: DistributedIterator, - I::Item: Future + Send + 'static, - B: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - DistIteratorLauncher::collect_async(&self.array, iter, d) - } - - fn collect_async_with_schedule( - &self, - sched: Schedule, - iter: &I, - d: Distribution, - ) -> DistIterCollectHandle - where - I: DistributedIterator, - I::Item: Future + Send + 'static, - B: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - DistIteratorLauncher::collect_async_with_schedule(&self.array, sched, iter, d) - } - - fn count(&self, iter: &I) -> DistIterCountHandle - where - I: DistributedIterator + 'static, - { - DistIteratorLauncher::count(&self.array, iter) - } - - fn count_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterCountHandle - where - I: DistributedIterator + 'static, - { - DistIteratorLauncher::count_with_schedule(&self.array, sched, iter) - } - - fn sum(&self, iter: &I) -> DistIterSumHandle - where - I: DistributedIterator + 'static, - I::Item: Dist + 
ArrayOps + std::iter::Sum, - { - DistIteratorLauncher::sum(&self.array, iter) - } - - fn sum_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterSumHandle - where - I: DistributedIterator + 'static, - I::Item: Dist + ArrayOps + std::iter::Sum, - { - DistIteratorLauncher::sum_with_schedule(&self.array, sched, iter) - } - - fn team(&self) -> Pin> { - self.array.team_rt().clone() - } -} +impl DistIteratorLauncher for GlobalLockArray {} +// // type Inner = Self; +// fn global_index_from_local(&self, index: usize, chunk_size: usize) -> Option { +// self.array.global_index_from_local(index, chunk_size) +// } + +// fn subarray_index_from_local(&self, index: usize, chunk_size: usize) -> Option { +// self.array.subarray_index_from_local(index, chunk_size) +// } + +// // fn subarray_pe_and_offset_for_global_index(&self, index: usize, chunk_size: usize) -> Option<(usize,usize)> { +// // self.array.subarray_pe_and_offset_for_global_index(index, chunk_size) +// // } + +// fn for_each(&self, iter: &I, op: F) -> DistIterForEachHandle +// where +// I: DistributedIterator + 'static, +// F: Fn(I::Item) + SyncSend + Clone + 'static, +// { +// DistIteratorLauncher::for_each(&self.array, iter, op) +// } +// fn for_each_with_schedule( +// &self, +// sched: Schedule, +// iter: &I, +// op: F, +// ) -> DistIterForEachHandle +// where +// I: DistributedIterator + 'static, +// F: Fn(I::Item) + SyncSend + Clone + 'static, +// { +// DistIteratorLauncher::for_each_with_schedule(&self.array, sched, iter, op) +// } +// fn for_each_async(&self, iter: &I, op: F) -> DistIterForEachHandle +// where +// I: DistributedIterator + 'static, +// F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, +// Fut: Future + Send + 'static, +// { +// DistIteratorLauncher::for_each_async(&self.array, iter, op) +// } +// fn for_each_async_with_schedule( +// &self, +// sched: Schedule, +// iter: &I, +// op: F, +// ) -> DistIterForEachHandle +// where +// I: DistributedIterator + 'static, +// F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, +// Fut: Future + Send + 'static, +// { +// DistIteratorLauncher::for_each_async_with_schedule(&self.array, sched, iter, op) +// } + +// fn reduce(&self, iter: &I, op: F) -> DistIterReduceHandle +// where +// I: DistributedIterator + 'static, +// I::Item: Dist + ArrayOps, +// F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, +// { +// DistIteratorLauncher::reduce(&self.array, iter, op) +// } + +// fn reduce_with_schedule( +// &self, +// sched: Schedule, +// iter: &I, +// op: F, +// ) -> DistIterReduceHandle +// where +// I: DistributedIterator + 'static, +// I::Item: Dist + ArrayOps, +// F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, +// { +// DistIteratorLauncher::reduce_with_schedule(&self.array, sched, iter, op) +// } + +// fn collect(&self, iter: &I, d: Distribution) -> DistIterCollectHandle +// where +// I: DistributedIterator + 'static, +// I::Item: Dist + ArrayOps, +// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, +// { +// DistIteratorLauncher::collect(&self.array, iter, d) +// } + +// fn collect_with_schedule( +// &self, +// sched: Schedule, +// iter: &I, +// d: Distribution, +// ) -> DistIterCollectHandle +// where +// I: DistributedIterator + 'static, +// I::Item: Dist + ArrayOps, +// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, +// { +// DistIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) +// } +// fn collect_async(&self, iter: &I, d: Distribution) -> DistIterCollectHandle +// where +// I: 
DistributedIterator, +// I::Item: Future + Send + 'static, +// B: Dist + ArrayOps, +// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, +// { +// DistIteratorLauncher::collect_async(&self.array, iter, d) +// } + +// fn collect_async_with_schedule( +// &self, +// sched: Schedule, +// iter: &I, +// d: Distribution, +// ) -> DistIterCollectHandle +// where +// I: DistributedIterator, +// I::Item: Future + Send + 'static, +// B: Dist + ArrayOps, +// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, +// { +// DistIteratorLauncher::collect_async_with_schedule(&self.array, sched, iter, d) +// } + +// fn count(&self, iter: &I) -> DistIterCountHandle +// where +// I: DistributedIterator + 'static, +// { +// DistIteratorLauncher::count(&self.array, iter) +// } + +// fn count_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterCountHandle +// where +// I: DistributedIterator + 'static, +// { +// DistIteratorLauncher::count_with_schedule(&self.array, sched, iter) +// } + +// fn sum(&self, iter: &I) -> DistIterSumHandle +// where +// I: DistributedIterator + 'static, +// I::Item: Dist + ArrayOps + std::iter::Sum, +// { +// DistIteratorLauncher::sum(&self.array, iter) +// } + +// fn sum_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterSumHandle +// where +// I: DistributedIterator + 'static, +// I::Item: Dist + ArrayOps + std::iter::Sum, +// { +// DistIteratorLauncher::sum_with_schedule(&self.array, sched, iter) +// } + +// fn team(&self) -> Pin> { +// self.array.team_rt().clone() +// } +// } impl LocalIteratorLauncher for GlobalLockArray { fn local_global_index_from_local(&self, index: usize, chunk_size: usize) -> Option { diff --git a/src/array/iterator/distributed_iterator.rs b/src/array/iterator/distributed_iterator.rs index d1ce490b..339f722b 100644 --- a/src/array/iterator/distributed_iterator.rs +++ b/src/array/iterator/distributed_iterator.rs @@ -38,7 +38,7 @@ pub(crate) use consumer::*; use crate::array::iterator::{private::*, Schedule}; use crate::array::{ - operations::ArrayOps, AsyncTeamFrom, AtomicArray, Distribution, GenericAtomicArray, + operations::ArrayOps, AsyncTeamFrom, AtomicArray, Distribution, GenericAtomicArray, InnerArray, LamellarArray, NativeAtomicArray, }; use crate::memregion::Dist; @@ -147,11 +147,17 @@ use std::sync::Arc; #[doc(hidden)] #[enum_dispatch] -pub trait DistIteratorLauncher { +pub trait DistIteratorLauncher: InnerArray { + // type Inner: InnerArray; fn for_each(&self, iter: &I, op: F) -> DistIterForEachHandle where I: DistributedIterator + 'static, - F: Fn(I::Item) + SyncSend + Clone + 'static; + F: Fn(I::Item) + SyncSend + Clone + 'static, + Self: InnerArray, + { + // DistIteratorLauncher::for_each_with_schedule(self, Schedule::Static, iter, op) + self.as_inner().for_each(iter, op) + } fn for_each_with_schedule( &self, @@ -161,13 +167,19 @@ pub trait DistIteratorLauncher { ) -> DistIterForEachHandle where I: DistributedIterator + 'static, - F: Fn(I::Item) + SyncSend + Clone + 'static; + F: Fn(I::Item) + SyncSend + Clone + 'static, + { + self.as_inner().for_each_with_schedule(sched, iter, op) + } fn for_each_async(&self, iter: &I, op: F) -> DistIterForEachHandle where I: DistributedIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, - Fut: Future + Send + 'static; + Fut: Future + Send + 'static, + { + self.as_inner().for_each_async(iter, op) + } fn for_each_async_with_schedule( &self, @@ -178,13 +190,20 @@ pub trait DistIteratorLauncher { where I: DistributedIterator + 'static, F: 
Fn(I::Item) -> Fut + SyncSend + Clone + 'static, - Fut: Future + Send + 'static; + Fut: Future + Send + 'static, + { + self.as_inner() + .for_each_async_with_schedule(sched, iter, op) + } fn reduce(&self, iter: &I, op: F) -> DistIterReduceHandle where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static; + F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, + { + self.as_inner().reduce(iter, op) + } fn reduce_with_schedule( &self, @@ -195,13 +214,19 @@ pub trait DistIteratorLauncher { where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static; + F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, + { + self.as_inner().reduce_with_schedule(sched, iter, op) + } fn collect(&self, iter: &I, d: Distribution) -> DistIterCollectHandle where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static; + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, + { + self.as_inner().collect(iter, d) + } fn collect_with_schedule( &self, @@ -212,14 +237,20 @@ pub trait DistIteratorLauncher { where I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static; + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, + { + self.as_inner().collect_with_schedule(sched, iter, d) + } fn collect_async(&self, iter: &I, d: Distribution) -> DistIterCollectHandle where I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static; + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, + { + self.as_inner().collect_async(iter, d) + } fn collect_async_with_schedule( &self, @@ -231,37 +262,74 @@ pub trait DistIteratorLauncher { I: DistributedIterator, I::Item: Future + Send + 'static, B: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static; + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, + { + self.as_inner().collect_async_with_schedule(sched, iter, d) + } fn count(&self, iter: &I) -> DistIterCountHandle where - I: DistributedIterator + 'static; + I: DistributedIterator + 'static, + { + self.as_inner().count(iter) + } fn count_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterCountHandle where - I: DistributedIterator + 'static; + I: DistributedIterator + 'static, + { + self.as_inner().count_with_schedule(sched, iter) + } fn sum(&self, iter: &I) -> DistIterSumHandle where I: DistributedIterator + 'static, - I::Item: Dist + ArrayOps + std::iter::Sum; + I::Item: Dist + ArrayOps + std::iter::Sum, + { + self.as_inner().sum(iter) + } fn sum_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterSumHandle where I: DistributedIterator + 'static, - I::Item: Dist + ArrayOps + std::iter::Sum; + I::Item: Dist + ArrayOps + std::iter::Sum, + { + self.as_inner().sum_with_schedule(sched, iter) + } //#[doc(hidden)] - fn global_index_from_local(&self, index: usize, chunk_size: usize) -> Option; + fn global_index_from_local(&self, index: usize, chunk_size: usize) -> Option { + if chunk_size == 1 { + self.as_inner().global_index_from_local(index) + } else { + Some( + self.as_inner() + .global_index_from_local(index * chunk_size)? 
+ / chunk_size, + ) + } + } //#[doc(hidden)] - fn subarray_index_from_local(&self, index: usize, chunk_size: usize) -> Option; + fn subarray_index_from_local(&self, index: usize, chunk_size: usize) -> Option { + if chunk_size == 1 { + self.as_inner().subarray_index_from_local(index) + } else { + Some( + self.as_inner() + .subarray_index_from_local(index * chunk_size)? + / chunk_size, + ) + } + } // //#[doc(hidden)] // fn subarray_pe_and_offset_for_global_index(&self, index: usize, chunk_size: usize) -> Option<(usize,usize)>; //#[doc(hidden)] - fn team(&self) -> Pin>; + fn team(&self) -> Pin> { + self.as_inner().team() + } } /// An interface for dealing with distributed iterators (intended as a parallel and distributed version of the standard iterator trait) diff --git a/src/array/iterator/mod.rs b/src/array/iterator/mod.rs index 57d59234..389abc9d 100644 --- a/src/array/iterator/mod.rs +++ b/src/array/iterator/mod.rs @@ -115,7 +115,7 @@ pub trait LamellarArrayIterators { /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic); /// /// if my_pe == 0 { - /// for elem in array.onesided_iter().into_iter() { //"into_iter()" converts into a standard Rust Iterator + /// for elem in onesided_iter!($array,array).into_iter() { //"into_iter()" converts into a standard Rust Iterator /// println!("PE{my_pe} elem {elem}"); /// } /// } diff --git a/src/array/iterator/one_sided_iterator.rs b/src/array/iterator/one_sided_iterator.rs index 633f4949..51b04e57 100644 --- a/src/array/iterator/one_sided_iterator.rs +++ b/src/array/iterator/one_sided_iterator.rs @@ -255,7 +255,7 @@ pub trait OneSidedIterator: private::OneSidedIteratorInner { /// array.dist_iter_mut().for_each(move|e| *e = my_pe); //initialize array using a distributed iterator /// array.wait_all(); /// if my_pe == 0 { - /// let sum = array.onesided_iter().into_iter().take(4).map(|elem| *elem as f64).sum::(); + /// let sum = onesided_iter!($array,array).into_iter().take(4).map(|elem| *elem as f64).sum::(); /// println!("Sum: {sum}") /// } /// ``` @@ -324,7 +324,7 @@ pub trait OneSidedIterator: private::OneSidedIteratorInner { /// let world = LamellarWorldBuilder::new().build(); /// let array = AtomicArray::::new(&world,100,Distribution::Block); /// -/// let std_iter = array.onesided_iter().into_iter(); +/// let std_iter = onesided_iter!($array,array).into_iter(); /// for e in std_iter { /// println!("{e}"); /// } diff --git a/src/array/local_lock_atomic/iteration.rs b/src/array/local_lock_atomic/iteration.rs index 23b249e7..9f41303d 100644 --- a/src/array/local_lock_atomic/iteration.rs +++ b/src/array/local_lock_atomic/iteration.rs @@ -6,6 +6,7 @@ use crate::array::iterator::{ }; use crate::array::local_lock_atomic::*; use crate::array::private::LamellarArrayPrivate; +use crate::array::r#unsafe::private::UnsafeArrayInner; use crate::array::*; use crate::memregion::Dist; // use parking_lot::{ @@ -14,6 +15,12 @@ use crate::memregion::Dist; // }; use async_lock::{RwLockReadGuardArc, RwLockWriteGuardArc}; +impl InnerArray for LocalLockArray { + fn as_inner(&self) -> &UnsafeArrayInner { + &self.array.inner + } +} + //#[doc(hidden)] #[derive(Clone)] pub struct LocalLockDistIter<'a, T: Dist> { @@ -420,164 +427,165 @@ impl LamellarArrayMutIterators for LocalLockArray { } } -impl DistIteratorLauncher for LocalLockArray { - fn global_index_from_local(&self, index: usize, chunk_size: usize) -> Option { - self.array.global_index_from_local(index, chunk_size) - } - - fn subarray_index_from_local(&self, index: usize, chunk_size: 
usize) -> Option { - self.array.subarray_index_from_local(index, chunk_size) - } - - // fn subarray_pe_and_offset_for_global_index(&self, index: usize, chunk_size: usize) -> Option<(usize,usize)> { - // self.array.subarray_pe_and_offset_for_global_index(index, chunk_size) - // } - - fn for_each(&self, iter: &I, op: F) -> DistIterForEachHandle - where - I: DistributedIterator + 'static, - F: Fn(I::Item) + SyncSend + Clone + 'static, - { - DistIteratorLauncher::for_each(&self.array, iter, op) - } - fn for_each_with_schedule( - &self, - sched: Schedule, - iter: &I, - op: F, - ) -> DistIterForEachHandle - where - I: DistributedIterator + 'static, - F: Fn(I::Item) + SyncSend + Clone + 'static, - { - DistIteratorLauncher::for_each_with_schedule(&self.array, sched, iter, op) - } - fn for_each_async(&self, iter: &I, op: F) -> DistIterForEachHandle - where - I: DistributedIterator + 'static, - F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, - Fut: Future + Send + 'static, - { - DistIteratorLauncher::for_each_async(&self.array, iter, op) - } - fn for_each_async_with_schedule( - &self, - sched: Schedule, - iter: &I, - op: F, - ) -> DistIterForEachHandle - where - I: DistributedIterator + 'static, - F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, - Fut: Future + Send + 'static, - { - DistIteratorLauncher::for_each_async_with_schedule(&self.array, sched, iter, op) - } - - fn reduce(&self, iter: &I, op: F) -> DistIterReduceHandle - where - I: DistributedIterator + 'static, - I::Item: Dist + ArrayOps, - F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, - { - DistIteratorLauncher::reduce(&self.array, iter, op) - } - - fn reduce_with_schedule( - &self, - sched: Schedule, - iter: &I, - op: F, - ) -> DistIterReduceHandle - where - I: DistributedIterator + 'static, - I::Item: Dist + ArrayOps, - F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, - { - DistIteratorLauncher::reduce_with_schedule(&self.array, sched, iter, op) - } - - fn collect(&self, iter: &I, d: Distribution) -> DistIterCollectHandle - where - I: DistributedIterator + 'static, - I::Item: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - DistIteratorLauncher::collect(&self.array, iter, d) - } - - fn collect_with_schedule( - &self, - sched: Schedule, - iter: &I, - d: Distribution, - ) -> DistIterCollectHandle - where - I: DistributedIterator + 'static, - I::Item: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - DistIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) - } - fn collect_async(&self, iter: &I, d: Distribution) -> DistIterCollectHandle - where - I: DistributedIterator, - I::Item: Future + Send + 'static, - B: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - DistIteratorLauncher::collect_async(&self.array, iter, d) - } - - fn collect_async_with_schedule( - &self, - sched: Schedule, - iter: &I, - d: Distribution, - ) -> DistIterCollectHandle - where - I: DistributedIterator, - I::Item: Future + Send + 'static, - B: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - DistIteratorLauncher::collect_async_with_schedule(&self.array, sched, iter, d) - } - - fn count(&self, iter: &I) -> DistIterCountHandle - where - I: DistributedIterator + 'static, - { - DistIteratorLauncher::count(&self.array, iter) - } - - fn count_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterCountHandle - where - I: DistributedIterator + 
'static, - { - DistIteratorLauncher::count_with_schedule(&self.array, sched, iter) - } - - fn sum(&self, iter: &I) -> DistIterSumHandle - where - I: DistributedIterator + 'static, - I::Item: Dist + ArrayOps + std::iter::Sum, - { - DistIteratorLauncher::sum(&self.array, iter) - } - - fn sum_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterSumHandle - where - I: DistributedIterator + 'static, - I::Item: Dist + ArrayOps + std::iter::Sum, - { - DistIteratorLauncher::sum_with_schedule(&self.array, sched, iter) - } - - fn team(&self) -> Pin> { - self.array.team_rt().clone() - } -} +impl DistIteratorLauncher for LocalLockArray {} +// // type Inner = Self; +// fn global_index_from_local(&self, index: usize, chunk_size: usize) -> Option { +// self.array.global_index_from_local(index, chunk_size) +// } + +// fn subarray_index_from_local(&self, index: usize, chunk_size: usize) -> Option { +// self.array.subarray_index_from_local(index, chunk_size) +// } + +// // fn subarray_pe_and_offset_for_global_index(&self, index: usize, chunk_size: usize) -> Option<(usize,usize)> { +// // self.array.subarray_pe_and_offset_for_global_index(index, chunk_size) +// // } + +// fn for_each(&self, iter: &I, op: F) -> DistIterForEachHandle +// where +// I: DistributedIterator + 'static, +// F: Fn(I::Item) + SyncSend + Clone + 'static, +// { +// DistIteratorLauncher::for_each(&self.array, iter, op) +// } +// fn for_each_with_schedule( +// &self, +// sched: Schedule, +// iter: &I, +// op: F, +// ) -> DistIterForEachHandle +// where +// I: DistributedIterator + 'static, +// F: Fn(I::Item) + SyncSend + Clone + 'static, +// { +// DistIteratorLauncher::for_each_with_schedule(&self.array, sched, iter, op) +// } +// fn for_each_async(&self, iter: &I, op: F) -> DistIterForEachHandle +// where +// I: DistributedIterator + 'static, +// F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, +// Fut: Future + Send + 'static, +// { +// DistIteratorLauncher::for_each_async(&self.array, iter, op) +// } +// fn for_each_async_with_schedule( +// &self, +// sched: Schedule, +// iter: &I, +// op: F, +// ) -> DistIterForEachHandle +// where +// I: DistributedIterator + 'static, +// F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, +// Fut: Future + Send + 'static, +// { +// DistIteratorLauncher::for_each_async_with_schedule(&self.array, sched, iter, op) +// } + +// fn reduce(&self, iter: &I, op: F) -> DistIterReduceHandle +// where +// I: DistributedIterator + 'static, +// I::Item: Dist + ArrayOps, +// F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, +// { +// DistIteratorLauncher::reduce(&self.array, iter, op) +// } + +// fn reduce_with_schedule( +// &self, +// sched: Schedule, +// iter: &I, +// op: F, +// ) -> DistIterReduceHandle +// where +// I: DistributedIterator + 'static, +// I::Item: Dist + ArrayOps, +// F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, +// { +// DistIteratorLauncher::reduce_with_schedule(&self.array, sched, iter, op) +// } + +// fn collect(&self, iter: &I, d: Distribution) -> DistIterCollectHandle +// where +// I: DistributedIterator + 'static, +// I::Item: Dist + ArrayOps, +// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, +// { +// DistIteratorLauncher::collect(&self.array, iter, d) +// } + +// fn collect_with_schedule( +// &self, +// sched: Schedule, +// iter: &I, +// d: Distribution, +// ) -> DistIterCollectHandle +// where +// I: DistributedIterator + 'static, +// I::Item: Dist + ArrayOps, +// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone 
+ 'static, +// { +// DistIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) +// } +// fn collect_async(&self, iter: &I, d: Distribution) -> DistIterCollectHandle +// where +// I: DistributedIterator, +// I::Item: Future + Send + 'static, +// B: Dist + ArrayOps, +// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, +// { +// DistIteratorLauncher::collect_async(&self.array, iter, d) +// } + +// fn collect_async_with_schedule( +// &self, +// sched: Schedule, +// iter: &I, +// d: Distribution, +// ) -> DistIterCollectHandle +// where +// I: DistributedIterator, +// I::Item: Future + Send + 'static, +// B: Dist + ArrayOps, +// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, +// { +// DistIteratorLauncher::collect_async_with_schedule(&self.array, sched, iter, d) +// } + +// fn count(&self, iter: &I) -> DistIterCountHandle +// where +// I: DistributedIterator + 'static, +// { +// DistIteratorLauncher::count(&self.array, iter) +// } + +// fn count_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterCountHandle +// where +// I: DistributedIterator + 'static, +// { +// DistIteratorLauncher::count_with_schedule(&self.array, sched, iter) +// } + +// fn sum(&self, iter: &I) -> DistIterSumHandle +// where +// I: DistributedIterator + 'static, +// I::Item: Dist + ArrayOps + std::iter::Sum, +// { +// DistIteratorLauncher::sum(&self.array, iter) +// } + +// fn sum_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterSumHandle +// where +// I: DistributedIterator + 'static, +// I::Item: Dist + ArrayOps + std::iter::Sum, +// { +// DistIteratorLauncher::sum_with_schedule(&self.array, sched, iter) +// } + +// fn team(&self) -> Pin> { +// self.array.team_rt().clone() +// } +// } impl LocalIteratorLauncher for LocalLockArray { fn local_global_index_from_local(&self, index: usize, chunk_size: usize) -> Option { diff --git a/src/array/native_atomic.rs b/src/array/native_atomic.rs index 065fa127..83946563 100644 --- a/src/array/native_atomic.rs +++ b/src/array/native_atomic.rs @@ -2,7 +2,7 @@ pub(crate) mod iteration; pub(crate) mod operations; mod rdma; use crate::array::atomic::AtomicElement; -use crate::array::private::LamellarArrayPrivate; +// use crate::array::private::LamellarArrayPrivate; use crate::array::r#unsafe::{UnsafeByteArray, UnsafeByteArrayWeak}; use crate::array::*; // use crate::darc::Darc; diff --git a/src/array/native_atomic/iteration.rs b/src/array/native_atomic/iteration.rs index d76bab0d..9411b110 100644 --- a/src/array/native_atomic/iteration.rs +++ b/src/array/native_atomic/iteration.rs @@ -2,9 +2,11 @@ use crate::array::iterator::distributed_iterator::*; use crate::array::iterator::local_iterator::*; use crate::array::iterator::one_sided_iterator::OneSidedIter; use crate::array::iterator::{ - private::*, LamellarArrayIterators, LamellarArrayMutIterators, Schedule, + private::{IterClone, Sealed}, + LamellarArrayIterators, LamellarArrayMutIterators, Schedule, }; use crate::array::native_atomic::*; +use crate::array::r#unsafe::private::UnsafeArrayInner; use crate::array::*; use crate::memregion::Dist; // use parking_lot::{ @@ -12,6 +14,12 @@ use crate::memregion::Dist; // RawRwLock, // }; +impl InnerArray for NativeAtomicArray { + fn as_inner(&self) -> &UnsafeArrayInner { + &self.array.inner + } +} + //#[doc(hidden)] #[derive(Clone)] pub struct NativeAtomicDistIter { @@ -211,164 +219,165 @@ impl LamellarArrayMutIterators for NativeAtomicArray { } } -impl DistIteratorLauncher for NativeAtomicArray { - fn 
global_index_from_local(&self, index: usize, chunk_size: usize) -> Option { - self.array.global_index_from_local(index, chunk_size) - } - - fn subarray_index_from_local(&self, index: usize, chunk_size: usize) -> Option { - self.array.subarray_index_from_local(index, chunk_size) - } - - // fn subarray_pe_and_offset_for_global_index(&self, index: usize, chunk_size: usize) -> Option<(usize,usize)> { - // self.array.subarray_pe_and_offset_for_global_index(index, chunk_size) - // } - - fn for_each(&self, iter: &I, op: F) -> DistIterForEachHandle - where - I: DistributedIterator + 'static, - F: Fn(I::Item) + SyncSend + Clone + 'static, - { - DistIteratorLauncher::for_each(&self.array, iter, op) - } - fn for_each_with_schedule( - &self, - sched: Schedule, - iter: &I, - op: F, - ) -> DistIterForEachHandle - where - I: DistributedIterator + 'static, - F: Fn(I::Item) + SyncSend + Clone + 'static, - { - DistIteratorLauncher::for_each_with_schedule(&self.array, sched, iter, op) - } - fn for_each_async(&self, iter: &I, op: F) -> DistIterForEachHandle - where - I: DistributedIterator + 'static, - F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, - Fut: Future + Send + 'static, - { - DistIteratorLauncher::for_each_async(&self.array, iter, op) - } - fn for_each_async_with_schedule( - &self, - sched: Schedule, - iter: &I, - op: F, - ) -> DistIterForEachHandle - where - I: DistributedIterator + 'static, - F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, - Fut: Future + Send + 'static, - { - DistIteratorLauncher::for_each_async_with_schedule(&self.array, sched, iter, op) - } - - fn reduce(&self, iter: &I, op: F) -> DistIterReduceHandle - where - I: DistributedIterator + 'static, - I::Item: Dist + ArrayOps, - F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, - { - DistIteratorLauncher::reduce(&self.array, iter, op) - } - - fn reduce_with_schedule( - &self, - sched: Schedule, - iter: &I, - op: F, - ) -> DistIterReduceHandle - where - I: DistributedIterator + 'static, - I::Item: Dist + ArrayOps, - F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, - { - DistIteratorLauncher::reduce_with_schedule(&self.array, sched, iter, op) - } - - fn collect(&self, iter: &I, d: Distribution) -> DistIterCollectHandle - where - I: DistributedIterator + 'static, - I::Item: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - DistIteratorLauncher::collect(&self.array, iter, d) - } - - fn collect_with_schedule( - &self, - sched: Schedule, - iter: &I, - d: Distribution, - ) -> DistIterCollectHandle - where - I: DistributedIterator + 'static, - I::Item: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - DistIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) - } - fn collect_async(&self, iter: &I, d: Distribution) -> DistIterCollectHandle - where - I: DistributedIterator, - I::Item: Future + Send + 'static, - B: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - DistIteratorLauncher::collect_async(&self.array, iter, d) - } - - fn collect_async_with_schedule( - &self, - sched: Schedule, - iter: &I, - d: Distribution, - ) -> DistIterCollectHandle - where - I: DistributedIterator, - I::Item: Future + Send + 'static, - B: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - DistIteratorLauncher::collect_async_with_schedule(&self.array, sched, iter, d) - } - - fn count(&self, iter: &I) -> DistIterCountHandle - where - I: 
DistributedIterator + 'static, - { - DistIteratorLauncher::count(&self.array, iter) - } - - fn count_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterCountHandle - where - I: DistributedIterator + 'static, - { - DistIteratorLauncher::count_with_schedule(&self.array, sched, iter) - } - - fn sum(&self, iter: &I) -> DistIterSumHandle - where - I: DistributedIterator + 'static, - I::Item: Dist + ArrayOps + std::iter::Sum, - { - DistIteratorLauncher::sum(&self.array, iter) - } - - fn sum_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterSumHandle - where - I: DistributedIterator + 'static, - I::Item: Dist + ArrayOps + std::iter::Sum, - { - DistIteratorLauncher::sum_with_schedule(&self.array, sched, iter) - } - - fn team(&self) -> Pin> { - self.array.team_rt().clone() - } -} +impl DistIteratorLauncher for NativeAtomicArray {} +// // type Inner = Self; +// fn global_index_from_local(&self, index: usize, chunk_size: usize) -> Option { +// self.array.global_index_from_local(index, chunk_size) +// } + +// fn subarray_index_from_local(&self, index: usize, chunk_size: usize) -> Option { +// self.array.subarray_index_from_local(index, chunk_size) +// } + +// // fn subarray_pe_and_offset_for_global_index(&self, index: usize, chunk_size: usize) -> Option<(usize,usize)> { +// // self.array.subarray_pe_and_offset_for_global_index(index, chunk_size) +// // } + +// fn for_each(&self, iter: &I, op: F) -> DistIterForEachHandle +// where +// I: DistributedIterator + 'static, +// F: Fn(I::Item) + SyncSend + Clone + 'static, +// { +// DistIteratorLauncher::for_each(&self.array, iter, op) +// } +// fn for_each_with_schedule( +// &self, +// sched: Schedule, +// iter: &I, +// op: F, +// ) -> DistIterForEachHandle +// where +// I: DistributedIterator + 'static, +// F: Fn(I::Item) + SyncSend + Clone + 'static, +// { +// DistIteratorLauncher::for_each_with_schedule(&self.array, sched, iter, op) +// } +// fn for_each_async(&self, iter: &I, op: F) -> DistIterForEachHandle +// where +// I: DistributedIterator + 'static, +// F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, +// Fut: Future + Send + 'static, +// { +// DistIteratorLauncher::for_each_async(&self.array, iter, op) +// } +// fn for_each_async_with_schedule( +// &self, +// sched: Schedule, +// iter: &I, +// op: F, +// ) -> DistIterForEachHandle +// where +// I: DistributedIterator + 'static, +// F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, +// Fut: Future + Send + 'static, +// { +// DistIteratorLauncher::for_each_async_with_schedule(&self.array, sched, iter, op) +// } + +// fn reduce(&self, iter: &I, op: F) -> DistIterReduceHandle +// where +// I: DistributedIterator + 'static, +// I::Item: Dist + ArrayOps, +// F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, +// { +// DistIteratorLauncher::reduce(&self.array, iter, op) +// } + +// fn reduce_with_schedule( +// &self, +// sched: Schedule, +// iter: &I, +// op: F, +// ) -> DistIterReduceHandle +// where +// I: DistributedIterator + 'static, +// I::Item: Dist + ArrayOps, +// F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, +// { +// DistIteratorLauncher::reduce_with_schedule(&self.array, sched, iter, op) +// } + +// fn collect(&self, iter: &I, d: Distribution) -> DistIterCollectHandle +// where +// I: DistributedIterator + 'static, +// I::Item: Dist + ArrayOps, +// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, +// { +// DistIteratorLauncher::collect(&self.array, iter, d) +// } + +// fn collect_with_schedule( +// &self, +// sched: 
Schedule, +// iter: &I, +// d: Distribution, +// ) -> DistIterCollectHandle +// where +// I: DistributedIterator + 'static, +// I::Item: Dist + ArrayOps, +// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, +// { +// DistIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) +// } +// fn collect_async(&self, iter: &I, d: Distribution) -> DistIterCollectHandle +// where +// I: DistributedIterator, +// I::Item: Future + Send + 'static, +// B: Dist + ArrayOps, +// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, +// { +// DistIteratorLauncher::collect_async(&self.array, iter, d) +// } + +// fn collect_async_with_schedule( +// &self, +// sched: Schedule, +// iter: &I, +// d: Distribution, +// ) -> DistIterCollectHandle +// where +// I: DistributedIterator, +// I::Item: Future + Send + 'static, +// B: Dist + ArrayOps, +// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, +// { +// DistIteratorLauncher::collect_async_with_schedule(&self.array, sched, iter, d) +// } + +// fn count(&self, iter: &I) -> DistIterCountHandle +// where +// I: DistributedIterator + 'static, +// { +// DistIteratorLauncher::count(&self.array, iter) +// } + +// fn count_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterCountHandle +// where +// I: DistributedIterator + 'static, +// { +// DistIteratorLauncher::count_with_schedule(&self.array, sched, iter) +// } + +// fn sum(&self, iter: &I) -> DistIterSumHandle +// where +// I: DistributedIterator + 'static, +// I::Item: Dist + ArrayOps + std::iter::Sum, +// { +// DistIteratorLauncher::sum(&self.array, iter) +// } + +// fn sum_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterSumHandle +// where +// I: DistributedIterator + 'static, +// I::Item: Dist + ArrayOps + std::iter::Sum, +// { +// DistIteratorLauncher::sum_with_schedule(&self.array, sched, iter) +// } + +// fn team(&self) -> Pin> { +// self.array.team_rt().clone() +// } +// } impl LocalIteratorLauncher for NativeAtomicArray { fn local_global_index_from_local(&self, index: usize, chunk_size: usize) -> Option { diff --git a/src/array/read_only/iteration.rs b/src/array/read_only/iteration.rs index 8d015740..cbcd3615 100644 --- a/src/array/read_only/iteration.rs +++ b/src/array/read_only/iteration.rs @@ -2,10 +2,17 @@ use crate::array::iterator::distributed_iterator::*; use crate::array::iterator::local_iterator::*; use crate::array::iterator::one_sided_iterator::OneSidedIter; use crate::array::iterator::{LamellarArrayIterators, Schedule}; +use crate::array::r#unsafe::private::UnsafeArrayInner; use crate::array::read_only::*; use crate::array::*; use crate::memregion::Dist; +impl InnerArray for ReadOnlyArray { + fn as_inner(&self) -> &UnsafeArrayInner { + &self.array.inner + } +} + impl LamellarArrayIterators for ReadOnlyArray { // type Array = ReadOnlyArray; type DistIter = DistIter<'static, T, Self>; @@ -32,164 +39,165 @@ impl LamellarArrayIterators for ReadOnlyArray { } } -impl DistIteratorLauncher for ReadOnlyArray { - fn global_index_from_local(&self, index: usize, chunk_size: usize) -> Option { - self.array.global_index_from_local(index, chunk_size) - } - - fn subarray_index_from_local(&self, index: usize, chunk_size: usize) -> Option { - self.array.subarray_index_from_local(index, chunk_size) - } - - // fn subarray_pe_and_offset_for_global_index(&self, index: usize, chunk_size: usize) -> Option<(usize,usize)> { - // self.array.subarray_pe_and_offset_for_global_index(index, chunk_size) - // } - - fn for_each(&self, iter: &I, 
op: F) -> DistIterForEachHandle - where - I: DistributedIterator + 'static, - F: Fn(I::Item) + SyncSend + Clone + 'static, - { - DistIteratorLauncher::for_each(&self.array, iter, op) - } - fn for_each_with_schedule( - &self, - sched: Schedule, - iter: &I, - op: F, - ) -> DistIterForEachHandle - where - I: DistributedIterator + 'static, - F: Fn(I::Item) + SyncSend + Clone + 'static, - { - DistIteratorLauncher::for_each_with_schedule(&self.array, sched, iter, op) - } - fn for_each_async(&self, iter: &I, op: F) -> DistIterForEachHandle - where - I: DistributedIterator + 'static, - F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, - Fut: Future + Send + 'static, - { - DistIteratorLauncher::for_each_async(&self.array, iter, op) - } - fn for_each_async_with_schedule( - &self, - sched: Schedule, - iter: &I, - op: F, - ) -> DistIterForEachHandle - where - I: DistributedIterator + 'static, - F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, - Fut: Future + Send + 'static, - { - DistIteratorLauncher::for_each_async_with_schedule(&self.array, sched, iter, op) - } - - fn reduce(&self, iter: &I, op: F) -> DistIterReduceHandle - where - I: DistributedIterator + 'static, - I::Item: Dist + ArrayOps, - F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, - { - DistIteratorLauncher::reduce(&self.array, iter, op) - } - - fn reduce_with_schedule( - &self, - sched: Schedule, - iter: &I, - op: F, - ) -> DistIterReduceHandle - where - I: DistributedIterator + 'static, - I::Item: Dist + ArrayOps, - F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, - { - DistIteratorLauncher::reduce_with_schedule(&self.array, sched, iter, op) - } - - fn collect(&self, iter: &I, d: Distribution) -> DistIterCollectHandle - where - I: DistributedIterator + 'static, - I::Item: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - DistIteratorLauncher::collect(&self.array, iter, d) - } - - fn collect_with_schedule( - &self, - sched: Schedule, - iter: &I, - d: Distribution, - ) -> DistIterCollectHandle - where - I: DistributedIterator + 'static, - I::Item: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - DistIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) - } - fn collect_async(&self, iter: &I, d: Distribution) -> DistIterCollectHandle - where - I: DistributedIterator, - I::Item: Future + Send + 'static, - B: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - DistIteratorLauncher::collect_async(&self.array, iter, d) - } - - fn collect_async_with_schedule( - &self, - sched: Schedule, - iter: &I, - d: Distribution, - ) -> DistIterCollectHandle - where - I: DistributedIterator, - I::Item: Future + Send + 'static, - B: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - DistIteratorLauncher::collect_async_with_schedule(&self.array, sched, iter, d) - } - - fn count(&self, iter: &I) -> DistIterCountHandle - where - I: DistributedIterator + 'static, - { - DistIteratorLauncher::count(&self.array, iter) - } - - fn count_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterCountHandle - where - I: DistributedIterator + 'static, - { - DistIteratorLauncher::count_with_schedule(&self.array, sched, iter) - } - - fn sum(&self, iter: &I) -> DistIterSumHandle - where - I: DistributedIterator + 'static, - I::Item: Dist + ArrayOps + std::iter::Sum, - { - DistIteratorLauncher::sum(&self.array, iter) - } - - fn sum_with_schedule(&self, 
sched: Schedule, iter: &I) -> DistIterSumHandle - where - I: DistributedIterator + 'static, - I::Item: Dist + ArrayOps + std::iter::Sum, - { - DistIteratorLauncher::sum_with_schedule(&self.array, sched, iter) - } - - fn team(&self) -> Pin> { - self.array.team_rt().clone() - } -} +impl DistIteratorLauncher for ReadOnlyArray {} +// // type Inner = Self; +// fn global_index_from_local(&self, index: usize, chunk_size: usize) -> Option { +// self.array.global_index_from_local(index, chunk_size) +// } + +// fn subarray_index_from_local(&self, index: usize, chunk_size: usize) -> Option { +// self.array.subarray_index_from_local(index, chunk_size) +// } + +// // fn subarray_pe_and_offset_for_global_index(&self, index: usize, chunk_size: usize) -> Option<(usize,usize)> { +// // self.array.subarray_pe_and_offset_for_global_index(index, chunk_size) +// // } + +// fn for_each(&self, iter: &I, op: F) -> DistIterForEachHandle +// where +// I: DistributedIterator + 'static, +// F: Fn(I::Item) + SyncSend + Clone + 'static, +// { +// DistIteratorLauncher::for_each(&self.array, iter, op) +// } +// fn for_each_with_schedule( +// &self, +// sched: Schedule, +// iter: &I, +// op: F, +// ) -> DistIterForEachHandle +// where +// I: DistributedIterator + 'static, +// F: Fn(I::Item) + SyncSend + Clone + 'static, +// { +// DistIteratorLauncher::for_each_with_schedule(&self.array, sched, iter, op) +// } +// fn for_each_async(&self, iter: &I, op: F) -> DistIterForEachHandle +// where +// I: DistributedIterator + 'static, +// F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, +// Fut: Future + Send + 'static, +// { +// DistIteratorLauncher::for_each_async(&self.array, iter, op) +// } +// fn for_each_async_with_schedule( +// &self, +// sched: Schedule, +// iter: &I, +// op: F, +// ) -> DistIterForEachHandle +// where +// I: DistributedIterator + 'static, +// F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, +// Fut: Future + Send + 'static, +// { +// DistIteratorLauncher::for_each_async_with_schedule(&self.array, sched, iter, op) +// } + +// fn reduce(&self, iter: &I, op: F) -> DistIterReduceHandle +// where +// I: DistributedIterator + 'static, +// I::Item: Dist + ArrayOps, +// F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, +// { +// DistIteratorLauncher::reduce(&self.array, iter, op) +// } + +// fn reduce_with_schedule( +// &self, +// sched: Schedule, +// iter: &I, +// op: F, +// ) -> DistIterReduceHandle +// where +// I: DistributedIterator + 'static, +// I::Item: Dist + ArrayOps, +// F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, +// { +// DistIteratorLauncher::reduce_with_schedule(&self.array, sched, iter, op) +// } + +// fn collect(&self, iter: &I, d: Distribution) -> DistIterCollectHandle +// where +// I: DistributedIterator + 'static, +// I::Item: Dist + ArrayOps, +// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, +// { +// DistIteratorLauncher::collect(&self.array, iter, d) +// } + +// fn collect_with_schedule( +// &self, +// sched: Schedule, +// iter: &I, +// d: Distribution, +// ) -> DistIterCollectHandle +// where +// I: DistributedIterator + 'static, +// I::Item: Dist + ArrayOps, +// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, +// { +// DistIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) +// } +// fn collect_async(&self, iter: &I, d: Distribution) -> DistIterCollectHandle +// where +// I: DistributedIterator, +// I::Item: Future + Send + 'static, +// B: Dist + ArrayOps, +// A: AsyncTeamFrom<(Vec, 
Distribution)> + SyncSend + Clone + 'static, +// { +// DistIteratorLauncher::collect_async(&self.array, iter, d) +// } + +// fn collect_async_with_schedule( +// &self, +// sched: Schedule, +// iter: &I, +// d: Distribution, +// ) -> DistIterCollectHandle +// where +// I: DistributedIterator, +// I::Item: Future + Send + 'static, +// B: Dist + ArrayOps, +// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, +// { +// DistIteratorLauncher::collect_async_with_schedule(&self.array, sched, iter, d) +// } + +// fn count(&self, iter: &I) -> DistIterCountHandle +// where +// I: DistributedIterator + 'static, +// { +// DistIteratorLauncher::count(&self.array, iter) +// } + +// fn count_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterCountHandle +// where +// I: DistributedIterator + 'static, +// { +// DistIteratorLauncher::count_with_schedule(&self.array, sched, iter) +// } + +// fn sum(&self, iter: &I) -> DistIterSumHandle +// where +// I: DistributedIterator + 'static, +// I::Item: Dist + ArrayOps + std::iter::Sum, +// { +// DistIteratorLauncher::sum(&self.array, iter) +// } + +// fn sum_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterSumHandle +// where +// I: DistributedIterator + 'static, +// I::Item: Dist + ArrayOps + std::iter::Sum, +// { +// DistIteratorLauncher::sum_with_schedule(&self.array, sched, iter) +// } + +// fn team(&self) -> Pin> { +// self.array.team_rt().clone() +// } +// } impl LocalIteratorLauncher for ReadOnlyArray { fn local_global_index_from_local(&self, index: usize, chunk_size: usize) -> Option { diff --git a/src/array/unsafe.rs b/src/array/unsafe.rs index 28e07ab1..f50b12a4 100644 --- a/src/array/unsafe.rs +++ b/src/array/unsafe.rs @@ -7,6 +7,7 @@ mod rdma; use crate::active_messaging::*; // use crate::array::r#unsafe::operations::BUFOPS; +use crate::array::private::{ArrayExecAm, LamellarArrayPrivate}; use crate::array::*; use crate::array::{LamellarRead, LamellarWrite}; use crate::darc::{Darc, DarcMode, WeakDarc}; @@ -89,17 +90,23 @@ impl UnsafeByteArrayWeak { } } -#[lamellar_impl::AmDataRT(Clone, Debug)] -pub(crate) struct UnsafeArrayInner { - pub(crate) data: Darc, - pub(crate) distribution: Distribution, - orig_elem_per_pe: usize, - orig_remaining_elems: usize, - elem_size: usize, //for bytes array will be size of T, for T array will be 1 - offset: usize, //relative to size of T - pub(crate) size: usize, //relative to size of T - sub: bool, +pub(crate) mod private { + use super::UnsafeArrayData; + use crate::array::Distribution; + use crate::darc::Darc; + #[lamellar_impl::AmDataRT(Clone, Debug)] + pub struct UnsafeArrayInner { + pub(crate) data: Darc, + pub(crate) distribution: Distribution, + pub(crate) orig_elem_per_pe: usize, + pub(crate) orig_remaining_elems: usize, + pub(crate) elem_size: usize, //for bytes array will be size of T, for T array will be 1 + pub(crate) offset: usize, //relative to size of T + pub(crate) size: usize, //relative to size of T + pub(crate) sub: bool, + } } +use private::UnsafeArrayInner; #[lamellar_impl::AmLocalDataRT(Clone, Debug)] pub(crate) struct UnsafeArrayInnerWeak { @@ -944,7 +951,7 @@ impl From for UnsafeArray { } } -impl private::ArrayExecAm for UnsafeArray { +impl ArrayExecAm for UnsafeArray { fn team(&self) -> Pin> { self.team_rt().clone() } @@ -952,7 +959,7 @@ impl private::ArrayExecAm for UnsafeArray { self.inner.data.array_counters.clone() } } -impl private::LamellarArrayPrivate for UnsafeArray { +impl LamellarArrayPrivate for UnsafeArray { fn inner_array(&self) -> &UnsafeArray { 
self } diff --git a/src/array/unsafe/iteration.rs b/src/array/unsafe/iteration.rs index 05011453..42dffa05 100644 --- a/src/array/unsafe/iteration.rs +++ b/src/array/unsafe/iteration.rs @@ -140,7 +140,7 @@ impl UnsafeArray { /// /// unsafe { /// if my_pe == 0 { - /// for elem in array.onesided_iter().into_iter() { //"into_iter()" converts into a standard Rust Iterator + /// for elem in onesided_iter!($array,array).into_iter() { //"into_iter()" converts into a standard Rust Iterator /// println!("PE{my_pe} elem {elem}"); /// } /// } diff --git a/src/array/unsafe/iteration/consumer.rs b/src/array/unsafe/iteration/consumer.rs index e9f1f04d..3fa4332f 100644 --- a/src/array/unsafe/iteration/consumer.rs +++ b/src/array/unsafe/iteration/consumer.rs @@ -1,15 +1,15 @@ use crate::active_messaging::SyncSend; use crate::array::iterator::consumer::*; -use crate::array::r#unsafe::UnsafeArray; -use crate::array::LamellarArray; -use crate::memregion::Dist; +use crate::array::r#unsafe::private::UnsafeArrayInner; +// use crate::array::LamellarArray; +// use crate::memregion::Dist; use parking_lot::Mutex; use std::collections::VecDeque; use std::sync::atomic::AtomicUsize; use std::sync::Arc; -impl UnsafeArray { +impl UnsafeArrayInner { pub(crate) fn sched_static(&self, cons: C) -> C::Handle where C: IterConsumer, @@ -18,8 +18,8 @@ impl UnsafeArray { // I: SyncSend + 'static, { let mut reqs = VecDeque::new(); - if let Ok(_my_pe) = self.inner.data.team.team_pe_id() { - let num_workers = self.inner.data.team.num_threads(); + if let Ok(_my_pe) = self.data.team.team_pe_id() { + let num_workers = self.data.team.num_threads(); let num_elems_local = cons.max_elems(self.num_elems_local()); let elems_per_thread = 1.0f64.max(num_elems_local as f64 / num_workers as f64); // println!("static num_workers {:?} num_elems_local {:?} elems_per_thread {:?}", num_workers, num_elems_local, elems_per_thread); @@ -28,7 +28,7 @@ impl UnsafeArray { let start_i = (worker as f64 * elems_per_thread).round() as usize; let end_i = ((worker + 1) as f64 * elems_per_thread).round() as usize; reqs.push_back( - self.inner.data.task_group.exec_arc_am_local_inner( + self.data.task_group.exec_arc_am_local_inner( cons.into_am(IterSchedule::Static(start_i, end_i)), ), ); @@ -36,7 +36,7 @@ impl UnsafeArray { worker += 1; } } - cons.create_handle(self.inner.data.team.clone(), reqs) + cons.create_handle(self.data.team.clone(), reqs) } pub(crate) fn sched_dynamic(&self, cons: C) -> C::Handle @@ -47,20 +47,20 @@ impl UnsafeArray { // I: SyncSend + 'static, { let mut reqs = VecDeque::new(); - if let Ok(_my_pe) = self.inner.data.team.team_pe_id() { - let num_workers = self.inner.data.team.num_threads(); + if let Ok(_my_pe) = self.data.team.team_pe_id() { + let num_workers = self.data.team.num_threads(); let num_elems_local = cons.max_elems(self.num_elems_local()); // println!("dynamic num_workers {:?} num_elems_local {:?}", num_workers, num_elems_local); let cur_i = Arc::new(AtomicUsize::new(0)); // println!("ranges {:?}", ranges); for _ in 0..std::cmp::min(num_workers, num_elems_local) { - reqs.push_back(self.inner.data.task_group.exec_arc_am_local_inner( + reqs.push_back(self.data.task_group.exec_arc_am_local_inner( cons.into_am(IterSchedule::Dynamic(cur_i.clone(), num_elems_local)), )); } } - cons.create_handle(self.inner.data.team.clone(), reqs) + cons.create_handle(self.data.team.clone(), reqs) } pub(crate) fn sched_work_stealing(&self, cons: C) -> C::Handle @@ -71,8 +71,8 @@ impl UnsafeArray { // I: SyncSend + 'static, { let mut reqs = 
VecDeque::new(); - if let Ok(_my_pe) = self.inner.data.team.team_pe_id() { - let num_workers = self.inner.data.team.num_threads(); + if let Ok(_my_pe) = self.data.team.team_pe_id() { + let num_workers = self.data.team.num_threads(); let num_elems_local = cons.max_elems(self.num_elems_local()); let elems_per_thread = 1.0f64.max(num_elems_local as f64 / num_workers as f64); // println!("work stealing num_workers {:?} num_elems_local {:?} elems_per_thread {:?}", num_workers, num_elems_local, elems_per_thread); @@ -87,18 +87,12 @@ impl UnsafeArray { worker += 1; } for sibling in &siblings { - reqs.push_back( - self.inner - .data - .task_group - .exec_arc_am_local_inner(cons.into_am(IterSchedule::WorkStealing( - sibling.clone(), - siblings.clone(), - ))), - ) + reqs.push_back(self.data.task_group.exec_arc_am_local_inner(cons.into_am( + IterSchedule::WorkStealing(sibling.clone(), siblings.clone()), + ))) } } - cons.create_handle(self.inner.data.team.clone(), reqs) + cons.create_handle(self.data.team.clone(), reqs) } pub(crate) fn sched_guided(&self, cons: C) -> C::Handle @@ -109,8 +103,8 @@ impl UnsafeArray { // I: SyncSend + 'static, { let mut reqs = VecDeque::new(); - if let Ok(_my_pe) = self.inner.data.team.team_pe_id() { - let num_workers = self.inner.data.team.num_threads(); + if let Ok(_my_pe) = self.data.team.team_pe_id() { + let num_workers = self.data.team.num_threads(); let num_elems_local_orig = cons.max_elems(self.num_elems_local()); let mut num_elems_local = num_elems_local_orig as f64; let mut elems_per_thread = num_elems_local / num_workers as f64; @@ -152,12 +146,12 @@ impl UnsafeArray { let range_i = Arc::new(AtomicUsize::new(0)); // println!("ranges {:?}", ranges); for _ in 0..std::cmp::min(num_workers, num_elems_local_orig) { - reqs.push_back(self.inner.data.task_group.exec_arc_am_local_inner( + reqs.push_back(self.data.task_group.exec_arc_am_local_inner( cons.into_am(IterSchedule::Chunk(ranges.clone(), range_i.clone())), )); } } - cons.create_handle(self.inner.data.team.clone(), reqs) + cons.create_handle(self.data.team.clone(), reqs) } pub(crate) fn sched_chunk(&self, cons: C, chunk_size: usize) -> C::Handle @@ -168,8 +162,8 @@ impl UnsafeArray { // I: SyncSend + 'static, { let mut reqs = VecDeque::new(); - if let Ok(_my_pe) = self.inner.data.team.team_pe_id() { - let num_workers = self.inner.data.team.num_threads(); + if let Ok(_my_pe) = self.data.team.team_pe_id() { + let num_workers = self.data.team.num_threads(); let num_elems_local = cons.max_elems(self.num_elems_local()); let mut ranges = Vec::new(); let mut cur_i = 0; @@ -185,11 +179,11 @@ impl UnsafeArray { let range_i = Arc::new(AtomicUsize::new(0)); // println!("ranges {:?}", ranges); for _ in 0..std::cmp::min(num_workers, num_chunks) { - reqs.push_back(self.inner.data.task_group.exec_arc_am_local_inner( + reqs.push_back(self.data.task_group.exec_arc_am_local_inner( cons.into_am(IterSchedule::Chunk(ranges.clone(), range_i.clone())), )); } } - cons.create_handle(self.inner.data.team.clone(), reqs) + cons.create_handle(self.data.team.clone(), reqs) } } diff --git a/src/array/unsafe/iteration/distributed.rs b/src/array/unsafe/iteration/distributed.rs index 1eba52ae..2bb48c05 100644 --- a/src/array/unsafe/iteration/distributed.rs +++ b/src/array/unsafe/iteration/distributed.rs @@ -1,8 +1,8 @@ use crate::active_messaging::SyncSend; use crate::array::iterator::distributed_iterator::*; -use crate::array::iterator::private::*; -use crate::array::r#unsafe::UnsafeArray; -use crate::array::{ArrayOps, AsyncTeamFrom, 
Distribution, LamellarArray}; +use crate::array::iterator::private::Sealed; +use crate::array::r#unsafe::{UnsafeArray, UnsafeArrayInner}; +use crate::array::{ArrayOps, AsyncTeamFrom, Distribution, InnerArray}; use crate::array::iterator::Schedule; use crate::lamellar_team::LamellarTeamRT; @@ -13,21 +13,283 @@ use futures_util::Future; use std::pin::Pin; use std::sync::Arc; -impl DistIteratorLauncher for UnsafeArray { +impl InnerArray for UnsafeArray { + fn as_inner(&self) -> &UnsafeArrayInner { + &self.inner + } +} + +impl InnerArray for UnsafeArrayInner { + fn as_inner(&self) -> &UnsafeArrayInner { + &self + } +} + +impl DistIteratorLauncher for UnsafeArray {} +// // type Inner = Self; +// fn global_index_from_local(&self, index: usize, chunk_size: usize) -> Option { +// // println!("global index cs:{:?}",chunk_size); +// if chunk_size == 1 { +// self.inner.global_index_from_local(index) +// } else { +// Some(self.inner.global_index_from_local(index * chunk_size)? / chunk_size) +// } +// } + +// fn subarray_index_from_local(&self, index: usize, chunk_size: usize) -> Option { +// if chunk_size == 1 { +// self.inner.subarray_index_from_local(index) +// } else { +// Some(self.inner.subarray_index_from_local(index * chunk_size)? / chunk_size) +// } +// } + +// // fn subarray_pe_and_offset_for_global_index(&self, index: usize, chunk_size: usize) -> Option<(usize,usize)> { +// // if chunk_size == 1 { +// // Some(self.calc_pe_and_offset(index)) +// // } else { +// // Some(self.calc_pe_and_offset(index * chunk_size)? / chunk_size) +// // } +// // } + +// fn for_each(&self, iter: &I, op: F) -> DistIterForEachHandle +// where +// I: DistributedIterator + 'static, +// F: Fn(I::Item) + SyncSend + Clone + 'static, +// { +// self.for_each_with_schedule(Schedule::Static, iter, op) +// } + +// fn for_each_with_schedule( +// &self, +// sched: Schedule, +// iter: &I, +// op: F, +// ) -> DistIterForEachHandle +// where +// I: DistributedIterator + 'static, +// F: Fn(I::Item) + SyncSend + Clone + 'static, +// { +// let for_each = ForEach { +// iter: iter.iter_clone(Sealed), +// op, +// }; +// self.barrier(); +// match sched { +// Schedule::Static => self.inner.sched_static(for_each), +// Schedule::Dynamic => self.inner.sched_dynamic(for_each), +// Schedule::Chunk(size) => self.inner.sched_chunk(for_each, size), +// Schedule::Guided => self.inner.sched_guided(for_each), +// Schedule::WorkStealing => self.inner.sched_work_stealing(for_each), +// } +// } + +// fn for_each_async(&self, iter: &I, op: F) -> DistIterForEachHandle +// where +// I: DistributedIterator + 'static, +// F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, +// Fut: Future + Send + 'static, +// { +// self.for_each_async_with_schedule(Schedule::Static, iter, op) +// } + +// fn for_each_async_with_schedule( +// &self, +// sched: Schedule, +// iter: &I, +// op: F, +// ) -> DistIterForEachHandle +// where +// I: DistributedIterator + 'static, +// F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, +// Fut: Future + Send + 'static, +// { +// let for_each = ForEachAsync { +// iter: iter.iter_clone(Sealed), +// op, +// }; +// self.barrier(); +// match sched { +// Schedule::Static => self.inner.sched_static(for_each), +// Schedule::Dynamic => self.inner.sched_dynamic(for_each), +// Schedule::Chunk(size) => self.inner.sched_chunk(for_each, size), +// Schedule::Guided => self.inner.sched_guided(for_each), +// Schedule::WorkStealing => self.inner.sched_work_stealing(for_each), +// } +// } + +// fn reduce(&self, iter: &I, op: F) -> DistIterReduceHandle 
+// where +// I: DistributedIterator + 'static, +// I::Item: Dist + ArrayOps, +// F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, +// { +// self.reduce_with_schedule(Schedule::Static, iter, op) +// } + +// fn reduce_with_schedule( +// &self, +// sched: Schedule, +// iter: &I, +// op: F, +// ) -> DistIterReduceHandle +// where +// I: DistributedIterator + 'static, +// I::Item: Dist + ArrayOps, +// F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, +// { +// let reduce = Reduce { +// iter: iter.iter_clone(Sealed), +// op, +// }; +// match sched { +// Schedule::Static => self.inner.sched_static(reduce), +// Schedule::Dynamic => self.inner.sched_dynamic(reduce), +// Schedule::Chunk(size) => self.inner.sched_chunk(reduce, size), +// Schedule::Guided => self.inner.sched_guided(reduce), +// Schedule::WorkStealing => self.inner.sched_work_stealing(reduce), +// } +// } + +// fn collect(&self, iter: &I, d: Distribution) -> DistIterCollectHandle +// where +// I: DistributedIterator + 'static, +// I::Item: Dist + ArrayOps, +// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, +// { +// self.collect_with_schedule(Schedule::Static, iter, d) +// } + +// fn collect_with_schedule( +// &self, +// sched: Schedule, +// iter: &I, +// d: Distribution, +// ) -> DistIterCollectHandle +// where +// I: DistributedIterator + 'static, +// I::Item: Dist + ArrayOps, +// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, +// { +// let collect = Collect { +// iter: iter.iter_clone(Sealed).monotonic(), +// distribution: d, +// _phantom: PhantomData, +// }; +// match sched { +// Schedule::Static => self.inner.sched_static(collect), +// Schedule::Dynamic => self.inner.sched_dynamic(collect), +// Schedule::Chunk(size) => self.inner.sched_chunk(collect, size), +// Schedule::Guided => self.inner.sched_guided(collect), +// Schedule::WorkStealing => self.inner.sched_work_stealing(collect), +// } +// } + +// fn collect_async(&self, iter: &I, d: Distribution) -> DistIterCollectHandle +// where +// I: DistributedIterator, +// I::Item: Future + Send + 'static, +// B: Dist + ArrayOps, +// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, +// { +// self.collect_async_with_schedule(Schedule::Static, iter, d) +// } + +// fn collect_async_with_schedule( +// &self, +// sched: Schedule, +// iter: &I, +// d: Distribution, +// ) -> DistIterCollectHandle +// where +// I: DistributedIterator, +// I::Item: Future + Send + 'static, +// B: Dist + ArrayOps, +// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, +// { +// let collect = CollectAsync { +// iter: iter.iter_clone(Sealed).monotonic(), +// distribution: d, +// _phantom: PhantomData, +// }; +// match sched { +// Schedule::Static => self.inner.sched_static(collect), +// Schedule::Dynamic => self.inner.sched_dynamic(collect), +// Schedule::Chunk(size) => self.inner.sched_chunk(collect, size), +// Schedule::Guided => self.inner.sched_guided(collect), +// Schedule::WorkStealing => self.inner.sched_work_stealing(collect), +// } +// } + +// fn count(&self, iter: &I) -> DistIterCountHandle +// where +// I: DistributedIterator + 'static, +// { +// self.count_with_schedule(Schedule::Static, iter) +// } + +// fn count_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterCountHandle +// where +// I: DistributedIterator + 'static, +// { +// let count = Count { +// iter: iter.iter_clone(Sealed), +// }; +// match sched { +// Schedule::Static => self.inner.sched_static(count), +// Schedule::Dynamic 
=> self.inner.sched_dynamic(count), +// Schedule::Chunk(size) => self.inner.sched_chunk(count, size), +// Schedule::Guided => self.inner.sched_guided(count), +// Schedule::WorkStealing => self.inner.sched_work_stealing(count), +// } +// } + +// fn sum(&self, iter: &I) -> DistIterSumHandle +// where +// I: DistributedIterator + 'static, +// I::Item: Dist + ArrayOps + std::iter::Sum, +// { +// self.sum_with_schedule(Schedule::Static, iter) +// } + +// fn sum_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterSumHandle +// where +// I: DistributedIterator + 'static, +// I::Item: Dist + ArrayOps + std::iter::Sum, +// { +// let sum = Sum { +// iter: iter.iter_clone(Sealed), +// }; +// match sched { +// Schedule::Static => self.inner.sched_static(sum), +// Schedule::Dynamic => self.inner.sched_dynamic(sum), +// Schedule::Chunk(size) => self.inner.sched_chunk(sum, size), +// Schedule::Guided => self.inner.sched_guided(sum), +// Schedule::WorkStealing => self.inner.sched_work_stealing(sum), +// } +// } + +// fn team(&self) -> Pin> { +// self.inner.data.team.clone() +// } +// } + +impl DistIteratorLauncher for UnsafeArrayInner { + // type Inner = Self; fn global_index_from_local(&self, index: usize, chunk_size: usize) -> Option { // println!("global index cs:{:?}",chunk_size); if chunk_size == 1 { - self.inner.global_index_from_local(index) + self.global_index_from_local(index) } else { - Some(self.inner.global_index_from_local(index * chunk_size)? / chunk_size) + Some(self.global_index_from_local(index * chunk_size)? / chunk_size) } } fn subarray_index_from_local(&self, index: usize, chunk_size: usize) -> Option { if chunk_size == 1 { - self.inner.subarray_index_from_local(index) + self.subarray_index_from_local(index) } else { - Some(self.inner.subarray_index_from_local(index * chunk_size)? / chunk_size) + Some(self.subarray_index_from_local(index * chunk_size)? 
/ chunk_size) } } @@ -61,7 +323,7 @@ impl DistIteratorLauncher for UnsafeArray { iter: iter.iter_clone(Sealed), op, }; - self.barrier(); + self.team().barrier(); match sched { Schedule::Static => self.sched_static(for_each), Schedule::Dynamic => self.sched_dynamic(for_each), @@ -95,7 +357,7 @@ impl DistIteratorLauncher for UnsafeArray { iter: iter.iter_clone(Sealed), op, }; - self.barrier(); + self.team().barrier(); match sched { Schedule::Static => self.sched_static(for_each), Schedule::Dynamic => self.sched_dynamic(for_each), @@ -257,6 +519,6 @@ impl DistIteratorLauncher for UnsafeArray { } fn team(&self) -> Pin> { - self.inner.data.team.clone() + self.data.team.clone() } } diff --git a/src/array/unsafe/iteration/local.rs b/src/array/unsafe/iteration/local.rs index 9da52276..1b433afc 100644 --- a/src/array/unsafe/iteration/local.rs +++ b/src/array/unsafe/iteration/local.rs @@ -54,11 +54,11 @@ impl LocalIteratorLauncher for UnsafeArray { op, }; match sched { - Schedule::Static => self.sched_static(for_each), - Schedule::Dynamic => self.sched_dynamic(for_each), - Schedule::Chunk(size) => self.sched_chunk(for_each, size), - Schedule::Guided => self.sched_guided(for_each), - Schedule::WorkStealing => self.sched_work_stealing(for_each), + Schedule::Static => self.inner.sched_static(for_each), + Schedule::Dynamic => self.inner.sched_dynamic(for_each), + Schedule::Chunk(size) => self.inner.sched_chunk(for_each, size), + Schedule::Guided => self.inner.sched_guided(for_each), + Schedule::WorkStealing => self.inner.sched_work_stealing(for_each), } } @@ -87,11 +87,11 @@ impl LocalIteratorLauncher for UnsafeArray { op: op.clone(), }; match sched { - Schedule::Static => self.sched_static(for_each), - Schedule::Dynamic => self.sched_dynamic(for_each), - Schedule::Chunk(size) => self.sched_chunk(for_each, size), - Schedule::Guided => self.sched_guided(for_each), - Schedule::WorkStealing => self.sched_work_stealing(for_each), + Schedule::Static => self.inner.sched_static(for_each), + Schedule::Dynamic => self.inner.sched_dynamic(for_each), + Schedule::Chunk(size) => self.inner.sched_chunk(for_each, size), + Schedule::Guided => self.inner.sched_guided(for_each), + Schedule::WorkStealing => self.inner.sched_work_stealing(for_each), } } @@ -120,11 +120,11 @@ impl LocalIteratorLauncher for UnsafeArray { op, }; match sched { - Schedule::Static => self.sched_static(reduce), - Schedule::Dynamic => self.sched_dynamic(reduce), - Schedule::Chunk(size) => self.sched_chunk(reduce, size), - Schedule::Guided => self.sched_guided(reduce), - Schedule::WorkStealing => self.sched_work_stealing(reduce), + Schedule::Static => self.inner.sched_static(reduce), + Schedule::Dynamic => self.inner.sched_dynamic(reduce), + Schedule::Chunk(size) => self.inner.sched_chunk(reduce, size), + Schedule::Guided => self.inner.sched_guided(reduce), + Schedule::WorkStealing => self.inner.sched_work_stealing(reduce), } } @@ -154,11 +154,11 @@ impl LocalIteratorLauncher for UnsafeArray { _phantom: PhantomData, }; match sched { - Schedule::Static => self.sched_static(collect), - Schedule::Dynamic => self.sched_dynamic(collect), - Schedule::Chunk(size) => self.sched_chunk(collect, size), - Schedule::Guided => self.sched_guided(collect), - Schedule::WorkStealing => self.sched_work_stealing(collect), + Schedule::Static => self.inner.sched_static(collect), + Schedule::Dynamic => self.inner.sched_dynamic(collect), + Schedule::Chunk(size) => self.inner.sched_chunk(collect, size), + Schedule::Guided => self.inner.sched_guided(collect), + 
Schedule::WorkStealing => self.inner.sched_work_stealing(collect), } } @@ -177,11 +177,11 @@ impl LocalIteratorLauncher for UnsafeArray { iter: iter.iter_clone(Sealed), }; match sched { - Schedule::Static => self.sched_static(count), - Schedule::Dynamic => self.sched_dynamic(count), - Schedule::Chunk(size) => self.sched_chunk(count, size), - Schedule::Guided => self.sched_guided(count), - Schedule::WorkStealing => self.sched_work_stealing(count), + Schedule::Static => self.inner.sched_static(count), + Schedule::Dynamic => self.inner.sched_dynamic(count), + Schedule::Chunk(size) => self.inner.sched_chunk(count, size), + Schedule::Guided => self.inner.sched_guided(count), + Schedule::WorkStealing => self.inner.sched_work_stealing(count), } } @@ -202,11 +202,11 @@ impl LocalIteratorLauncher for UnsafeArray { iter: iter.iter_clone(Sealed), }; match sched { - Schedule::Static => self.sched_static(sum), - Schedule::Dynamic => self.sched_dynamic(sum), - Schedule::Chunk(size) => self.sched_chunk(sum, size), - Schedule::Guided => self.sched_guided(sum), - Schedule::WorkStealing => self.sched_work_stealing(sum), + Schedule::Static => self.inner.sched_static(sum), + Schedule::Dynamic => self.inner.sched_dynamic(sum), + Schedule::Chunk(size) => self.inner.sched_chunk(sum, size), + Schedule::Guided => self.inner.sched_guided(sum), + Schedule::WorkStealing => self.inner.sched_work_stealing(sum), } } diff --git a/src/lib.rs b/src/lib.rs index 318f8119..eac1a680 100755 --- a/src/lib.rs +++ b/src/lib.rs @@ -139,7 +139,7 @@ //! block_array.wait_all(); //! block_array.barrier(); //! if my_pe == 0{ -//! for (i,elem) in block_array.onesided_iter().into_iter().enumerate(){ //iterate through entire array on pe 0 (automatically transfering remote data) +//! for (i,elem) in block_onesided_iter!($array,array).into_iter().enumerate(){ //iterate through entire array on pe 0 (automatically transfering remote data) //! println!("i: {} = {})",i,elem); //! } //! } diff --git a/tests/array/arithmetic_ops/add_test.rs b/tests/array/arithmetic_ops/add_test.rs index a7e58549..cef0d7f1 100644 --- a/tests/array/arithmetic_ops/add_test.rs +++ b/tests/array/arithmetic_ops/add_test.rs @@ -56,6 +56,15 @@ macro_rules! check_val { }; } +macro_rules! onesided_iter{ + (GlobalLockArray,$array:ident) => { + $array.blocking_read_lock().onesided_iter() + }; + ($arraytype:ident,$array:ident) => { + $array.onesided_iter() + }; +} + macro_rules! add_test{ ($array:ident, $t:ty, $len:expr, $dist:ident) =>{ { @@ -96,7 +105,7 @@ macro_rules! add_test{ array.wait_all(); array.barrier(); #[allow(unused_unsafe)] - for (i,elem) in unsafe {array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe {onesided_iter!($array,array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,max_val,success); if !success{ @@ -124,7 +133,7 @@ macro_rules! add_test{ array.wait_all(); array.barrier(); #[allow(unused_unsafe)] - for (i,elem) in unsafe{ array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe{ onesided_iter!($array,array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,max_val,success); if !success{ @@ -134,7 +143,7 @@ macro_rules! add_test{ if !success{ array.print() } - // let sum = array.onesided_iter().into_iter().fold(0,|acc,x| acc+ *x as usize); + // let sum = onesided_iter!($array,array).into_iter().fold(0,|acc,x| acc+ *x as usize); // let tot_updates = indices.len()/10 * max_val as usize; // check_val!($array,sum,tot_updates,success); // if !success{ @@ -160,7 +169,7 @@ macro_rules! 
add_test{ sub_array.wait_all(); sub_array.barrier(); #[allow(unused_unsafe)] - for (i,elem) in unsafe { sub_array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe { onesided_iter!($array,sub_array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,max_val,success); if !success{ @@ -185,7 +194,7 @@ macro_rules! add_test{ sub_array.wait_all(); sub_array.barrier(); #[allow(unused_unsafe)] - for (i,elem) in unsafe{sub_array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe{onesided_iter!($array,sub_array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,max_val,success); if !success{ @@ -195,7 +204,7 @@ macro_rules! add_test{ if !success{ array.print() } - // let sum = sub_array.onesided_iter().into_iter().fold(0,|acc,x| acc+ *x as usize); + // let sum = onesided_iter!($array,sub_array).into_iter().fold(0,|acc,x| acc+ *x as usize); // let tot_updates = indices.len()/10 * max_val as usize; // check_val!($array,sum,tot_updates,success); // if !success{ @@ -222,7 +231,7 @@ macro_rules! add_test{ sub_array.wait_all(); sub_array.barrier(); #[allow(unused_unsafe)] - for (i,elem) in unsafe{sub_array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe{onesided_iter!($array,sub_array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,max_val,success); if !success{ @@ -247,7 +256,7 @@ macro_rules! add_test{ sub_array.wait_all(); sub_array.barrier(); #[allow(unused_unsafe)] - for (i,elem) in unsafe{sub_array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe{onesided_iter!($array,sub_array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,max_val,success); if !success{ @@ -257,7 +266,7 @@ macro_rules! add_test{ if !success{ array.print() } - // let sum = sub_array.onesided_iter().into_iter().fold(0,|acc,x| acc+ *x as usize); + // let sum = onesided_iter!($array,sub_array).into_iter().fold(0,|acc,x| acc+ *x as usize); // let tot_updates = indices.len()/10 * max_val as usize; // check_val!($array,sum,tot_updates,success); // if !success{ @@ -283,7 +292,7 @@ macro_rules! check_results { $array.wait_all(); $array.barrier(); #[allow(unused_unsafe)] - for (i, elem) in unsafe { $array.onesided_iter().into_iter().enumerate() } { + for (i, elem) in unsafe {onesided_iter!($array_ty,$array).into_iter().enumerate() } { let val = *elem; check_val!($array_ty, val, $num_pes, success); if !success { diff --git a/tests/array/arithmetic_ops/div_test.rs b/tests/array/arithmetic_ops/div_test.rs index 04968867..da85201c 100644 --- a/tests/array/arithmetic_ops/div_test.rs +++ b/tests/array/arithmetic_ops/div_test.rs @@ -58,6 +58,15 @@ macro_rules! max_updates { }; } +macro_rules! onesided_iter{ + (GlobalLockArray,$array:ident) => { + $array.blocking_read_lock().onesided_iter() + }; + ($arraytype:ident,$array:ident) => { + $array.onesided_iter() + }; +} + macro_rules! div_test{ ($array:ident, $t:ty, $len:expr, $dist:ident) =>{ { @@ -86,7 +95,7 @@ macro_rules! div_test{ array.barrier(); // array.print(); #[allow(unused_unsafe)] - for (i,elem) in unsafe {array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe {onesided_iter!($array,array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,one,success); if !success{ @@ -112,7 +121,7 @@ macro_rules! 
div_test{ sub_array.wait_all(); sub_array.barrier(); #[allow(unused_unsafe)] - for (i,elem) in unsafe {sub_array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe {onesided_iter!($array,sub_array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,one,success); if !success{ @@ -138,7 +147,7 @@ macro_rules! div_test{ sub_array.wait_all(); sub_array.barrier(); #[allow(unused_unsafe)] - for (i,elem) in unsafe {sub_array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe {onesided_iter!($array,sub_array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,one,success); if !success{ diff --git a/tests/array/arithmetic_ops/fetch_add_test.rs b/tests/array/arithmetic_ops/fetch_add_test.rs index 930ef56f..04ec3ce8 100644 --- a/tests/array/arithmetic_ops/fetch_add_test.rs +++ b/tests/array/arithmetic_ops/fetch_add_test.rs @@ -82,6 +82,26 @@ macro_rules! max_updates { }; } +macro_rules! onesided_iter { + (GlobalLockArray,$array:ident) => { + $array.blocking_read_lock().onesided_iter() + }; + ($arraytype:ident,$array:ident) => { + $array.onesided_iter() + }; +} + +macro_rules! buffered_onesided_iter { + (GlobalLockArray,$array:ident) => { + $array + .blocking_read_lock() + .buffered_onesided_iter($array.len()) + }; + ($arraytype:ident,$array:ident) => { + $array.buffered_onesided_iter($array.len()) + }; +} + macro_rules! fetch_add_test{ ($array:ident, $t:ty, $len:expr, $dist:ident) =>{ { @@ -120,7 +140,7 @@ macro_rules! fetch_add_test{ // array.wait_all(); array.barrier(); #[allow(unused_unsafe)] - for (i,elem) in unsafe{ array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe{ onesided_iter!($array,array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,max_val,success); if !success{ @@ -144,7 +164,7 @@ macro_rules! fetch_add_test{ } array.barrier(); #[allow(unused_unsafe)] - let sum = unsafe{array.onesided_iter().into_iter().fold(0,|acc,x| acc+ *x as usize)}; + let sum = unsafe{onesided_iter!($array,array).into_iter().fold(0,|acc,x| acc+ *x as usize)}; let tot_updates = num_updates * num_pes; check_val!($array,sum,tot_updates,success); if !success{ @@ -181,7 +201,7 @@ macro_rules! fetch_add_test{ } array.barrier(); #[allow(unused_unsafe)] - for (i,elem) in unsafe{ sub_array.onesided_iter().into_iter().enumerate()} { + for (i,elem) in unsafe{ onesided_iter!($array,sub_array).into_iter().enumerate()} { let val = *elem; check_val!($array,val,max_val,success); if !success{ @@ -204,7 +224,7 @@ macro_rules! fetch_add_test{ } array.barrier(); #[allow(unused_unsafe)] - let sum = unsafe {sub_array.onesided_iter().into_iter().fold(0,|acc,x| acc+ *x as usize)}; + let sum = unsafe {onesided_iter!($array,sub_array).into_iter().fold(0,|acc,x| acc+ *x as usize)}; let tot_updates = num_updates * num_pes; check_val!($array,sum,tot_updates,success); if !success{ @@ -244,7 +264,7 @@ macro_rules! fetch_add_test{ } sub_array.barrier(); #[allow(unused_unsafe)] - for (i,elem) in unsafe{sub_array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe{onesided_iter!($array,sub_array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,max_val,success); if !success{ @@ -267,7 +287,7 @@ macro_rules! 
fetch_add_test{ } sub_array.barrier(); #[allow(unused_unsafe)] - let sum = unsafe{sub_array.onesided_iter().into_iter().fold(0,|acc,x| acc+ *x as usize)}; + let sum = unsafe{onesided_iter!($array,sub_array).into_iter().fold(0,|acc,x| acc+ *x as usize)}; let tot_updates = num_updates * num_pes; check_val!($array,sum,tot_updates,success); if !success{ @@ -358,7 +378,7 @@ macro_rules! check_results { } // println!("here"); #[allow(unused_unsafe)] - for (i, elem) in unsafe { $array.buffered_onesided_iter($array.len()).into_iter().enumerate() }{ + for (i, elem) in unsafe { buffered_onesided_iter!($array_ty,$array).into_iter().enumerate() }{ let val = *elem; let real_val = if $real_val == 0 { i + $num_pes diff --git a/tests/array/arithmetic_ops/fetch_div_test.rs b/tests/array/arithmetic_ops/fetch_div_test.rs index bf3e379a..57b46924 100644 --- a/tests/array/arithmetic_ops/fetch_div_test.rs +++ b/tests/array/arithmetic_ops/fetch_div_test.rs @@ -84,6 +84,15 @@ macro_rules! max_updates { }; } +macro_rules! onesided_iter{ + (GlobalLockArray,$array:ident) => { + $array.blocking_read_lock().onesided_iter() + }; + ($arraytype:ident,$array:ident) => { + $array.onesided_iter() + }; +} + macro_rules! fetch_div_test{ ($array:ident, $t:ty, $len:expr, $dist:ident) =>{ { @@ -123,7 +132,7 @@ macro_rules! fetch_div_test{ array.barrier(); // array.print(); #[allow(unused_unsafe)] - for (i,elem) in unsafe{array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe{onesided_iter!($array,array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,one,success); if !success{ @@ -161,7 +170,7 @@ macro_rules! fetch_div_test{ sub_array.wait_all(); sub_array.barrier(); #[allow(unused_unsafe)] - for (i,elem) in unsafe {sub_array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe {onesided_iter!($array,sub_array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,one,success); if !success{ @@ -199,7 +208,7 @@ macro_rules! fetch_div_test{ sub_array.wait_all(); sub_array.barrier(); #[allow(unused_unsafe)] - for (i,elem) in unsafe {sub_array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe {onesided_iter!($array,sub_array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,one,success); if !success{ diff --git a/tests/array/arithmetic_ops/fetch_mul_test.rs b/tests/array/arithmetic_ops/fetch_mul_test.rs index c789d97c..900c0d2f 100644 --- a/tests/array/arithmetic_ops/fetch_mul_test.rs +++ b/tests/array/arithmetic_ops/fetch_mul_test.rs @@ -81,6 +81,15 @@ macro_rules! max_updates { }; } +macro_rules! onesided_iter{ + (GlobalLockArray,$array:ident) => { + $array.blocking_read_lock().onesided_iter() + }; + ($arraytype:ident,$array:ident) => { + $array.onesided_iter() + }; +} + macro_rules! fetch_mul_test{ ($array:ident, $t:ty, $len:expr, $dist:ident) =>{ { @@ -119,7 +128,7 @@ macro_rules! fetch_mul_test{ array.barrier(); // array.print(); #[allow(unused_unsafe)] - for (i,elem) in unsafe{array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe{onesided_iter!($array,array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,max_val,success); if !success{ @@ -156,7 +165,7 @@ macro_rules! 
fetch_mul_test{ sub_array.wait_all(); sub_array.barrier(); #[allow(unused_unsafe)] - for (i,elem) in unsafe{sub_array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe{onesided_iter!($array,sub_array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,max_val,success); if !success{ @@ -192,7 +201,7 @@ macro_rules! fetch_mul_test{ sub_array.wait_all(); sub_array.barrier(); #[allow(unused_unsafe)] - for (i,elem) in unsafe{sub_array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe{onesided_iter!($array,sub_array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,max_val,success); if !success{ diff --git a/tests/array/arithmetic_ops/fetch_rem_test.rs b/tests/array/arithmetic_ops/fetch_rem_test.rs index 32d6d4c1..1f0801c7 100644 --- a/tests/array/arithmetic_ops/fetch_rem_test.rs +++ b/tests/array/arithmetic_ops/fetch_rem_test.rs @@ -84,6 +84,15 @@ macro_rules! max_updates { }; } +macro_rules! onesided_iter{ + (GlobalLockArray,$array:ident) => { + $array.blocking_read_lock().onesided_iter() + }; + ($arraytype:ident,$array:ident) => { + $array.onesided_iter() + }; +} + macro_rules! fetch_rem_test{ ($array:ident, $t:ty, $len:expr, $dist:ident) =>{ { @@ -123,7 +132,7 @@ macro_rules! fetch_rem_test{ array.barrier(); // array.print(); #[allow(unused_unsafe)] - for (i,elem) in unsafe{array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe{onesided_iter!($array,array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,one,success); if !success{ @@ -161,7 +170,7 @@ macro_rules! fetch_rem_test{ sub_array.wait_all(); sub_array.barrier(); #[allow(unused_unsafe)] - for (i,elem) in unsafe {sub_array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe {onesided_iter!($array,sub_array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,one,success); if !success{ @@ -199,7 +208,7 @@ macro_rules! fetch_rem_test{ sub_array.wait_all(); sub_array.barrier(); #[allow(unused_unsafe)] - for (i,elem) in unsafe {sub_array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe {onesided_iter!($array,sub_array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,one,success); if !success{ diff --git a/tests/array/arithmetic_ops/fetch_sub_test.rs b/tests/array/arithmetic_ops/fetch_sub_test.rs index a1615ce1..da690674 100644 --- a/tests/array/arithmetic_ops/fetch_sub_test.rs +++ b/tests/array/arithmetic_ops/fetch_sub_test.rs @@ -78,6 +78,15 @@ macro_rules! max_updates { }; } +macro_rules! onesided_iter{ + (GlobalLockArray,$array:ident) => { + $array.blocking_read_lock().onesided_iter() + }; + ($arraytype:ident,$array:ident) => { + $array.onesided_iter() + }; +} + macro_rules! fetch_sub_test{ ($array:ident, $t:ty, $len:expr, $dist:ident) =>{ { @@ -117,7 +126,7 @@ macro_rules! fetch_sub_test{ } array.barrier(); #[allow(unused_unsafe)] - for (i,elem) in unsafe{array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe{onesided_iter!($array,array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,zero,success); if !success{ @@ -144,7 +153,7 @@ macro_rules! fetch_sub_test{ array.barrier(); #[allow(unused_unsafe)] - let sum = unsafe {array.onesided_iter().into_iter().fold(0,|acc,x| acc+ *x as usize)}; + let sum = unsafe {onesided_iter!($array,array).into_iter().fold(0,|acc,x| acc+ *x as usize)}; let calced_sum = tot_updates as usize * (array.len()-1); check_val!($array,sum,calced_sum,success); if !success{ @@ -180,7 +189,7 @@ macro_rules! 
fetch_sub_test{ } sub_array.barrier(); #[allow(unused_unsafe)] - for (i,elem) in unsafe{sub_array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe{onesided_iter!($array,sub_array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,zero,success); if !success{ @@ -205,7 +214,7 @@ macro_rules! fetch_sub_test{ } sub_array.barrier(); #[allow(unused_unsafe)] - let sum = unsafe{sub_array.onesided_iter().into_iter().fold(0,|acc,x| acc+ *x as usize)}; + let sum = unsafe{onesided_iter!($array,sub_array).into_iter().fold(0,|acc,x| acc+ *x as usize)}; let calced_sum = tot_updates as usize * (sub_array.len()-1); check_val!($array,sum,calced_sum,success); if !success{ @@ -242,7 +251,7 @@ macro_rules! fetch_sub_test{ } sub_array.barrier(); #[allow(unused_unsafe)] - for (i,elem) in unsafe{sub_array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe{onesided_iter!($array,sub_array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,zero,success); if !success{ @@ -267,7 +276,7 @@ macro_rules! fetch_sub_test{ sub_array.barrier(); #[allow(unused_unsafe)] - let sum = unsafe{sub_array.onesided_iter().into_iter().fold(0,|acc,x| acc+ *x as usize)}; + let sum = unsafe{onesided_iter!($array,sub_array).into_iter().fold(0,|acc,x| acc+ *x as usize)}; let calced_sum = tot_updates as usize * (sub_array.len()-1); check_val!($array,sum,calced_sum,success); if !success{ diff --git a/tests/array/arithmetic_ops/mul_test.rs b/tests/array/arithmetic_ops/mul_test.rs index 1f62f176..4ae33385 100644 --- a/tests/array/arithmetic_ops/mul_test.rs +++ b/tests/array/arithmetic_ops/mul_test.rs @@ -66,6 +66,15 @@ macro_rules! max_updates { }; } +macro_rules! onesided_iter{ + (GlobalLockArray,$array:ident) => { + $array.blocking_read_lock().onesided_iter() + }; + ($arraytype:ident,$array:ident) => { + $array.onesided_iter() + }; +} + macro_rules! mul_test{ ($array:ident, $t:ty, $len:expr, $dist:ident) =>{ { @@ -93,7 +102,7 @@ macro_rules! mul_test{ array.barrier(); // array.print(); #[allow(unused_unsafe)] - for (i,elem) in unsafe{array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe{onesided_iter!($array,array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,max_val,success); if !success{ @@ -119,7 +128,7 @@ macro_rules! mul_test{ sub_array.wait_all(); sub_array.barrier(); #[allow(unused_unsafe)] - for (i,elem) in unsafe{sub_array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe{onesided_iter!($array,sub_array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,max_val,success); if !success{ @@ -145,7 +154,7 @@ macro_rules! mul_test{ sub_array.wait_all(); sub_array.barrier(); #[allow(unused_unsafe)] - for (i,elem) in unsafe{sub_array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe{onesided_iter!($array,sub_array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,max_val,success); if !success{ diff --git a/tests/array/arithmetic_ops/rem_test.rs b/tests/array/arithmetic_ops/rem_test.rs index 62faabc8..5eb1a9b1 100644 --- a/tests/array/arithmetic_ops/rem_test.rs +++ b/tests/array/arithmetic_ops/rem_test.rs @@ -58,6 +58,15 @@ macro_rules! max_updates { }; } +macro_rules! onesided_iter{ + (GlobalLockArray,$array:ident) => { + $array.blocking_read_lock().onesided_iter() + }; + ($arraytype:ident,$array:ident) => { + $array.onesided_iter() + }; +} + macro_rules! rem_test{ ($array:ident, $t:ty, $len:expr, $dist:ident) =>{ { @@ -86,7 +95,7 @@ macro_rules! 
rem_test{ array.barrier(); // array.print(); #[allow(unused_unsafe)] - for (i,elem) in unsafe {array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe {onesided_iter!($array,array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,one,success); if !success{ @@ -112,7 +121,7 @@ macro_rules! rem_test{ sub_array.wait_all(); sub_array.barrier(); #[allow(unused_unsafe)] - for (i,elem) in unsafe {sub_array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe {onesided_iter!($array,sub_array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,one,success); if !success{ @@ -138,7 +147,7 @@ macro_rules! rem_test{ sub_array.wait_all(); sub_array.barrier(); #[allow(unused_unsafe)] - for (i,elem) in unsafe {sub_array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe {onesided_iter!($array,sub_array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,one,success); if !success{ diff --git a/tests/array/arithmetic_ops/sub_test.rs b/tests/array/arithmetic_ops/sub_test.rs index 8bb3b8bc..899d1b68 100644 --- a/tests/array/arithmetic_ops/sub_test.rs +++ b/tests/array/arithmetic_ops/sub_test.rs @@ -60,6 +60,15 @@ macro_rules! max_updates { }; } +macro_rules! onesided_iter{ + (GlobalLockArray,$array:ident) => { + $array.blocking_read_lock().onesided_iter() + }; + ($arraytype:ident,$array:ident) => { + $array.onesided_iter() + }; +} + macro_rules! sub_test{ ($array:ident, $t:ty, $len:expr, $dist:ident) =>{ { @@ -91,7 +100,7 @@ macro_rules! sub_test{ array.wait_all(); array.barrier(); #[allow(unused_unsafe)] - for (i,elem) in unsafe{array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe{onesided_iter!($array,array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,zero,success); if !success{ @@ -112,7 +121,7 @@ macro_rules! sub_test{ array.wait_all(); array.barrier(); #[allow(unused_unsafe)] - let sum = unsafe{array.onesided_iter().into_iter().fold(0,|acc,x| acc+ *x as usize)}; + let sum = unsafe{onesided_iter!($array,array).into_iter().fold(0,|acc,x| acc+ *x as usize)}; let calced_sum = tot_updates as usize * (array.len()-1); check_val!($array,sum,calced_sum,success); if !success{ @@ -139,7 +148,7 @@ macro_rules! sub_test{ sub_array.wait_all(); sub_array.barrier(); #[allow(unused_unsafe)] - for (i,elem) in unsafe{sub_array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe{onesided_iter!($array,sub_array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,zero,success); if !success{ @@ -160,7 +169,7 @@ macro_rules! sub_test{ sub_array.wait_all(); sub_array.barrier(); #[allow(unused_unsafe)] - let sum = unsafe {sub_array.onesided_iter().into_iter().fold(0,|acc,x| acc+ *x as usize)}; + let sum = unsafe {onesided_iter!($array,sub_array).into_iter().fold(0,|acc,x| acc+ *x as usize)}; let calced_sum = tot_updates as usize * (sub_array.len()-1); check_val!($array,sum,calced_sum,success); if !success{ @@ -187,7 +196,7 @@ macro_rules! sub_test{ sub_array.wait_all(); sub_array.barrier(); #[allow(unused_unsafe)] - for (i,elem) in unsafe{sub_array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe{onesided_iter!($array,sub_array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,zero,success); if !success{ @@ -208,7 +217,7 @@ macro_rules! 
sub_test{ sub_array.wait_all(); sub_array.barrier(); #[allow(unused_unsafe)] - let sum = unsafe{sub_array.onesided_iter().into_iter().fold(0,|acc,x| acc+ *x as usize)}; + let sum = unsafe{onesided_iter!($array,sub_array).into_iter().fold(0,|acc,x| acc+ *x as usize)}; let calced_sum = tot_updates as usize * (sub_array.len()-1); check_val!($array,sum,calced_sum,success); if !success{ diff --git a/tests/array/atomic_ops/compare_exchange_test.rs b/tests/array/atomic_ops/compare_exchange_test.rs index d3047250..d2173a17 100644 --- a/tests/array/atomic_ops/compare_exchange_test.rs +++ b/tests/array/atomic_ops/compare_exchange_test.rs @@ -209,6 +209,8 @@ macro_rules! compare_exchange_test{ } } + + macro_rules! compare_exchange_epsilon_test{ ($array:ident, $t:ty, $len:expr, $dist:ident) =>{ { diff --git a/tests/array/atomic_ops/load_store_test.rs b/tests/array/atomic_ops/load_store_test.rs index 1bdbfd9c..8d300a18 100644 --- a/tests/array/atomic_ops/load_store_test.rs +++ b/tests/array/atomic_ops/load_store_test.rs @@ -47,6 +47,15 @@ macro_rules! check_val { }; } +// macro_rules! onesided_iter{ +// (GlobalLockArray,$array:ident) => { +// $array.blocking_read_lock().onesided_iter() +// }; +// ($arraytype:ident,$array:ident) => { +// $array.onesided_iter() +// }; +// } + macro_rules! load_store_test{ ($array:ident, $t:ty, $len:expr, $dist:ident) =>{ { diff --git a/tests/array/atomic_ops/swap_test.rs b/tests/array/atomic_ops/swap_test.rs index 969f30a6..bef4220b 100644 --- a/tests/array/atomic_ops/swap_test.rs +++ b/tests/array/atomic_ops/swap_test.rs @@ -47,6 +47,8 @@ macro_rules! check_val { }; } + + macro_rules! swap{ ($array:ident, $t:ty, $len:expr, $dist:ident) =>{ { diff --git a/tests/array/bitwise_ops/and_test.rs b/tests/array/bitwise_ops/and_test.rs index 6b3bb5b8..68e740dc 100644 --- a/tests/array/bitwise_ops/and_test.rs +++ b/tests/array/bitwise_ops/and_test.rs @@ -49,6 +49,17 @@ macro_rules! check_val { }; } +macro_rules! onesided_iter{ + (GlobalLockArray,$array:ident) => { + $array.blocking_read_lock().onesided_iter() + }; + ($arraytype:ident,$array:ident) => { + $array.onesided_iter() + }; +} + + + macro_rules! and_test{ ($array:ident, $t:ty, $len:expr, $dist:ident) =>{ { @@ -74,7 +85,7 @@ macro_rules! and_test{ array.barrier(); // array.print(); #[allow(unused_unsafe)] - for (i,elem) in unsafe{array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe{onesided_iter!($array,array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,final_val,success); if !success{ @@ -101,7 +112,7 @@ macro_rules! and_test{ sub_array.barrier(); // sub_array.print(); #[allow(unused_unsafe)] - for (i,elem) in unsafe{sub_array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe{onesided_iter!($array,sub_array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,final_val,success); if !success{ @@ -129,7 +140,7 @@ macro_rules! and_test{ sub_array.barrier(); // sub_array.print(); #[allow(unused_unsafe)] - for (i,elem) in unsafe{sub_array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe{onesided_iter!($array,sub_array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,final_val,success); if !success{ diff --git a/tests/array/bitwise_ops/fetch_and_test.rs b/tests/array/bitwise_ops/fetch_and_test.rs index 41b3faad..c41007bb 100644 --- a/tests/array/bitwise_ops/fetch_and_test.rs +++ b/tests/array/bitwise_ops/fetch_and_test.rs @@ -49,6 +49,15 @@ macro_rules! check_val { }; } +macro_rules! 
onesided_iter{ + (GlobalLockArray,$array:ident) => { + $array.blocking_read_lock().onesided_iter() + }; + ($arraytype:ident,$array:ident) => { + $array.onesided_iter() + }; +} + macro_rules! fetch_and_test{ ($array:ident, $t:ty, $len:expr, $dist:ident) =>{ { @@ -83,7 +92,7 @@ macro_rules! fetch_and_test{ array.barrier(); // array.print(); #[allow(unused_unsafe)] - for (i,elem) in unsafe{array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe{onesided_iter!($array,array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,final_val,success); if !success{ @@ -120,7 +129,7 @@ macro_rules! fetch_and_test{ sub_array.barrier(); // sub_array.print(); #[allow(unused_unsafe)] - for (i,elem) in unsafe{sub_array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe{onesided_iter!($array,sub_array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,final_val,success); if !success{ @@ -158,7 +167,7 @@ macro_rules! fetch_and_test{ sub_array.barrier(); // sub_array.print(); #[allow(unused_unsafe)] - for (i,elem) in unsafe{sub_array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe{onesided_iter!($array,sub_array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,final_val,success); if !success{ diff --git a/tests/array/bitwise_ops/fetch_or_test.rs b/tests/array/bitwise_ops/fetch_or_test.rs index 5b648f42..183ab086 100644 --- a/tests/array/bitwise_ops/fetch_or_test.rs +++ b/tests/array/bitwise_ops/fetch_or_test.rs @@ -49,6 +49,15 @@ macro_rules! check_val { }; } +macro_rules! onesided_iter{ + (GlobalLockArray,$array:ident) => { + $array.blocking_read_lock().onesided_iter() + }; + ($arraytype:ident,$array:ident) => { + $array.onesided_iter() + }; +} + macro_rules! fetch_or_test{ ($array:ident, $t:ty, $len:expr, $dist:ident) =>{ { @@ -83,7 +92,7 @@ macro_rules! fetch_or_test{ array.barrier(); // array.print(); #[allow(unused_unsafe)] - for (i,elem) in unsafe {array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe {onesided_iter!($array,array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,final_val,success); if !success{ @@ -120,7 +129,7 @@ macro_rules! fetch_or_test{ sub_array.barrier(); // sub_array.print(); #[allow(unused_unsafe)] - for (i,elem) in unsafe {sub_array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe {onesided_iter!($array,sub_array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,final_val,success); if !success{ @@ -158,7 +167,7 @@ macro_rules! fetch_or_test{ sub_array.barrier(); // sub_array.print(); #[allow(unused_unsafe)] - for (i,elem) in unsafe {sub_array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe {onesided_iter!($array,sub_array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,final_val,success); if !success{ diff --git a/tests/array/bitwise_ops/fetch_xor_test.rs b/tests/array/bitwise_ops/fetch_xor_test.rs index edcfe642..1c5bcfb6 100644 --- a/tests/array/bitwise_ops/fetch_xor_test.rs +++ b/tests/array/bitwise_ops/fetch_xor_test.rs @@ -49,6 +49,15 @@ macro_rules! check_val { }; } +macro_rules! onesided_iter{ + (GlobalLockArray,$array:ident) => { + $array.blocking_read_lock().onesided_iter() + }; + ($arraytype:ident,$array:ident) => { + $array.onesided_iter() + }; +} + macro_rules! fetch_xor_test{ ($array:ident, $t:ty, $len:expr, $dist:ident) =>{ { @@ -83,7 +92,7 @@ macro_rules! 
fetch_xor_test{ array.barrier(); // array.print(); #[allow(unused_unsafe)] - for (i,elem) in unsafe {array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe {onesided_iter!($array,array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,final_val,success); if !success{ @@ -120,7 +129,7 @@ macro_rules! fetch_xor_test{ sub_array.barrier(); // sub_array.print(); #[allow(unused_unsafe)] - for (i,elem) in unsafe {sub_array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe {onesided_iter!($array,sub_array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,final_val,success); if !success{ @@ -158,7 +167,7 @@ macro_rules! fetch_xor_test{ sub_array.barrier(); // sub_array.print(); #[allow(unused_unsafe)] - for (i,elem) in unsafe {sub_array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe {onesided_iter!($array,sub_array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,final_val,success); if !success{ diff --git a/tests/array/bitwise_ops/or_test.rs b/tests/array/bitwise_ops/or_test.rs index 5d8a8f25..83e19d61 100644 --- a/tests/array/bitwise_ops/or_test.rs +++ b/tests/array/bitwise_ops/or_test.rs @@ -49,6 +49,15 @@ macro_rules! check_val { }; } +macro_rules! onesided_iter{ + (GlobalLockArray,$array:ident) => { + $array.blocking_read_lock().onesided_iter() + }; + ($arraytype:ident,$array:ident) => { + $array.onesided_iter() + }; +} + macro_rules! or_test{ ($array:ident, $t:ty, $len:expr, $dist:ident) =>{ { @@ -74,7 +83,7 @@ macro_rules! or_test{ array.barrier(); // array.print(); #[allow(unused_unsafe)] - for (i,elem) in unsafe{array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe{onesided_iter!($array,array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,final_val,success); if !success{ @@ -101,7 +110,7 @@ macro_rules! or_test{ sub_array.barrier(); // sub_array.print(); #[allow(unused_unsafe)] - for (i,elem) in unsafe{sub_array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe{onesided_iter!($array,sub_array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,final_val,success); if !success{ @@ -129,7 +138,7 @@ macro_rules! or_test{ sub_array.barrier(); // sub_array.print(); #[allow(unused_unsafe)] - for (i,elem) in unsafe {sub_array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe {onesided_iter!($array,sub_array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,final_val,success); if !success{ diff --git a/tests/array/bitwise_ops/xor_test.rs b/tests/array/bitwise_ops/xor_test.rs index 6d93284a..2a10eee8 100644 --- a/tests/array/bitwise_ops/xor_test.rs +++ b/tests/array/bitwise_ops/xor_test.rs @@ -49,6 +49,16 @@ macro_rules! check_val { }; } + +macro_rules! onesided_iter{ + (GlobalLockArray,$array:ident) => { + $array.blocking_read_lock().onesided_iter() + }; + ($arraytype:ident,$array:ident) => { + $array.onesided_iter() + }; +} + macro_rules! xor_test{ ($array:ident, $t:ty, $len:expr, $dist:ident) =>{ { @@ -74,7 +84,7 @@ macro_rules! xor_test{ array.barrier(); // array.print(); #[allow(unused_unsafe)] - for (i,elem) in unsafe{array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe{onesided_iter!($array,array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,final_val,success); if !success{ @@ -101,7 +111,7 @@ macro_rules! 
xor_test{ sub_array.barrier(); // sub_array.print(); #[allow(unused_unsafe)] - for (i,elem) in unsafe{sub_array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe{onesided_iter!($array,sub_array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,final_val,success); if !success{ @@ -129,7 +139,7 @@ macro_rules! xor_test{ sub_array.barrier(); // sub_array.print(); #[allow(unused_unsafe)] - for (i,elem) in unsafe {sub_array.onesided_iter().into_iter().enumerate()}{ + for (i,elem) in unsafe {onesided_iter!($array,sub_array).into_iter().enumerate()}{ let val = *elem; check_val!($array,val,final_val,success); if !success{ diff --git a/tests/array/rdma/put_test.rs b/tests/array/rdma/put_test.rs index 22aa8185..204a8d81 100644 --- a/tests/array/rdma/put_test.rs +++ b/tests/array/rdma/put_test.rs @@ -32,6 +32,15 @@ macro_rules! initialize_array { }; } +macro_rules! onesided_iter { + (GlobalLockArray,$array:ident) => { + $array.blocking_read_lock().onesided_iter() + }; + ($arraytype:ident,$array:ident) => { + $array.onesided_iter() + }; +} + macro_rules! put_test{ ($array:ident, $t:ty, $len:expr, $dist:ident) =>{ { @@ -62,7 +71,7 @@ macro_rules! put_test{ array.wait_all(); array.barrier(); #[allow(unused_unsafe)] - for (i,elem) in unsafe { array.onesided_iter().into_iter().enumerate().take( num_txs * tx_size) }{ + for (i,elem) in unsafe { onesided_iter!($array,array).into_iter().enumerate().take( num_txs * tx_size) }{ if ((i as $t - *elem) as f32).abs() > 0.0001 { eprintln!("{:?} {:?} {:?}",i as $t,*elem,((i as $t - *elem) as f32).abs()); success = false; @@ -96,7 +105,7 @@ macro_rules! put_test{ array.wait_all(); sub_array.barrier(); #[allow(unused_unsafe)] - for (i,elem) in unsafe {sub_array.onesided_iter().into_iter().enumerate().take( num_txs * tx_size)}{ + for (i,elem) in unsafe {onesided_iter!($array,sub_array).into_iter().enumerate().take( num_txs * tx_size)}{ if ((i as $t - *elem) as f32).abs() > 0.0001 { eprintln!("{:?} {:?} {:?}",i as $t,*elem,((i as $t - *elem) as f32).abs()); success = false; @@ -133,7 +142,7 @@ macro_rules! put_test{ array.wait_all(); sub_array.barrier(); #[allow(unused_unsafe)] - for (i,elem) in unsafe {sub_array.onesided_iter().into_iter().enumerate().take( num_txs * tx_size)}{ + for (i,elem) in unsafe {onesided_iter!($array,sub_array).into_iter().enumerate().take( num_txs * tx_size)}{ if ((i as $t - *elem) as f32).abs() > 0.0001 { eprintln!("{:?} {:?} {:?}",i as $t,*elem,((i as $t - *elem) as f32).abs()); success = false; From b7b3ac0b9e766ee0259ab18f690e796a6136ce59 Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Mon, 22 Jul 2024 14:45:17 -0700 Subject: [PATCH 051/116] refactor distributed iterators to expose blocking iterator api --- .../array_examples/distributed_iteration.rs | 71 +- src/array.rs | 37 +- src/array/atomic.rs | 11 +- src/array/generic_atomic/iteration.rs | 343 +------ src/array/global_lock_atomic/iteration.rs | 343 +------ src/array/iterator/consumer.rs | 20 - src/array/iterator/distributed_iterator.rs | 858 ++++++++++++------ .../iterator/distributed_iterator/consumer.rs | 265 ------ .../distributed_iterator/consumer/collect.rs | 149 ++- .../distributed_iterator/consumer/count.rs | 113 ++- .../distributed_iterator/consumer/for_each.rs | 429 ++------- .../distributed_iterator/consumer/reduce.rs | 265 +++++- .../distributed_iterator/consumer/sum.rs | 122 ++- src/array/iterator/local_iterator.rs | 95 +- src/array/local_lock_atomic/iteration.rs | 345 +------ src/array/native_atomic/iteration.rs | 341 +------ src/array/read_only/iteration.rs | 342 +------ src/array/unsafe.rs | 5 + src/array/unsafe/iteration/distributed.rs | 820 ++++++++--------- src/array/unsafe/iteration/local.rs | 76 +- src/barrier.rs | 421 +++++++-- src/lamellar_team.rs | 2 +- tests/array/arithmetic_ops/add_test.rs | 32 +- 23 files changed, 2180 insertions(+), 3325 deletions(-) diff --git a/examples/array_examples/distributed_iteration.rs b/examples/array_examples/distributed_iteration.rs index f75b1217..65fb6af1 100644 --- a/examples/array_examples/distributed_iteration.rs +++ b/examples/array_examples/distributed_iteration.rs @@ -21,16 +21,10 @@ fn main() { // we currently provide the "for_each" driver which will execute a closure on every element in the distributed array (concurrently) //for example lets initialize our arrays, where we store the value of my_pe to each local element a pe owns - let _ = block_dist_iter + block_dist_iter .enumerate() - .for_each(move |(i, elem)| elem.store(i)); - let _ = cyclic_dist_iter.for_each(move |elem| elem.store(my_pe)); - //for_each is asynchronous so we must wait on the array for the operations to complete - // we are working on providing a request handle which can be used to check for completion - block_array.wait_all(); - block_array.barrier(); - cyclic_array.wait_all(); - cyclic_array.barrier(); + .blocking_for_each(move |(i, elem)| elem.store(i)); + cyclic_dist_iter.blocking_for_each(move |elem| elem.store(my_pe)); // let block_array = block_array.into_read_only(); block_array.print(); @@ -54,12 +48,12 @@ fn main() { println!("--------------------------------------------------------"); println!("block skip enumerate step_by"); - let _ = block_array + block_array .dist_iter() .skip(2) .enumerate() .step_by(3) - .for_each(move |(i, elem)| { + .blocking_for_each(move |(i, elem)| { println!( "[pe({:?})-{:?}] i: {:?} {:?}", my_pe, @@ -68,8 +62,6 @@ fn main() { elem ) }); - block_array.wait_all(); - block_array.barrier(); // println!("zip "); // block_array @@ -91,11 +83,11 @@ fn main() { println!("--------------------------------------------------------"); println!("cyclic skip enumerate"); - let _ = cyclic_array + cyclic_array .dist_iter() .enumerate() .skip(2) - .for_each(move |(i, elem)| { + .blocking_for_each(move |(i, elem)| { println!( "[pe({:?})-{:?}] i: {:?} {:?}", my_pe, @@ -104,8 +96,6 @@ fn main() { elem ) }); - cyclic_array.wait_all(); - cyclic_array.barrier(); println!("--------------------------------------------------------"); @@ -130,7 +120,7 @@ fn main() { println!("cyclic enumerate map async for each"); cyclic_array.print(); let barray = 
block_array.clone(); - let _ = cyclic_array + cyclic_array .dist_iter() .enumerate() .map(move |(i, elem)| { @@ -144,7 +134,7 @@ fn main() { ); async move { (i, elem, barray.load(i).await) } }) - .for_each_async(move |i| async move { + .blocking_for_each_async(move |i| async move { println!( "[pe({:?})-{:?}] for each {:?}", my_pe, @@ -152,8 +142,6 @@ fn main() { i.await ); }); - cyclic_array.wait_all(); - cyclic_array.barrier(); block_array.print(); println!("--------------------------------------------------------"); @@ -172,17 +160,16 @@ fn main() { }) .collect_async::, _>(Distribution::Block), ); - cyclic_array.barrier(); new_array.print(); block_array.print(); println!("--------------------------------------------------------"); println!("block enumerate filter"); - let _ = block_array + block_array .dist_iter() .enumerate() .filter(|(_, elem)| elem.load() % 4 == 0) - .for_each(move |(i, elem)| { + .blocking_for_each(move |(i, elem)| { println!( "[pe({:?})-{:?}] i: {:?} {:?}", my_pe, @@ -191,12 +178,10 @@ fn main() { elem ) }); - block_array.wait_all(); - block_array.barrier(); println!("--------------------------------------------------------"); println!("block enumerate filter_map"); - let _ = block_array + block_array .dist_iter() .enumerate() .filter_map(|(i, elem)| { @@ -206,7 +191,7 @@ fn main() { None } }) - .for_each(move |(i, elem)| { + .blocking_for_each(move |(i, elem)| { println!( "[pe({:?})-{:?}] i: {:?} {:?}", my_pe, @@ -215,8 +200,6 @@ fn main() { elem ) }); - block_array.wait_all(); - block_array.barrier(); println!("--------------------------------------------------------"); println!("filter_map collect"); let new_block_array = block_array.block_on( @@ -238,11 +221,11 @@ fn main() { println!("--------------------------------------------------------"); println!("block skip enumerate"); - let _ = block_array + block_array .dist_iter() .skip(10) .enumerate() - .for_each(move |(i, elem)| { + .blocking_for_each(move |(i, elem)| { println!( "[pe({:?})-{:?}] i: {:?} {:?}", my_pe, @@ -252,17 +235,14 @@ fn main() { ) }); - block_array.wait_all(); - block_array.barrier(); - println!("--------------------------------------------------------"); println!("block skip step_by enumerate"); - let _ = block_array + block_array .dist_iter() .skip(10) .step_by(3) .enumerate() - .for_each(move |(i, elem)| { + .blocking_for_each(move |(i, elem)| { println!( "[pe({:?})-{:?}] i: {:?} {:?}", my_pe, @@ -272,17 +252,14 @@ fn main() { ) }); - block_array.wait_all(); - block_array.barrier(); - println!("--------------------------------------------------------"); println!("block take skip enumerate"); - let _ = block_array + block_array .dist_iter() .take(60) .skip(10) .enumerate() - .for_each(move |(i, elem)| { + .blocking_for_each(move |(i, elem)| { println!( "[pe({:?})-{:?}] i: {:?} {:?}", my_pe, @@ -292,18 +269,15 @@ fn main() { ) }); - block_array.wait_all(); - block_array.barrier(); - println!("--------------------------------------------------------"); println!("block take skip take enumerate"); - let _ = block_array + block_array .dist_iter() .take(60) .skip(10) .take(30) .enumerate() - .for_each(move |(i, elem)| { + .blocking_for_each(move |(i, elem)| { println!( "[pe({:?})-{:?}] i: {:?} {:?}", my_pe, @@ -313,9 +287,6 @@ fn main() { ) }); - block_array.wait_all(); - block_array.barrier(); - println!("--------------------------------------------------------"); println!("block filter count"); let count = block_array.block_on( diff --git a/src/array.rs b/src/array.rs index 0c48ef39..143b45a6 
100644 --- a/src/array.rs +++ b/src/array.rs @@ -187,28 +187,28 @@ pub struct ReduceKey { } crate::inventory::collect!(ReduceKey); -// impl Dist for bool {} -// lamellar_impl::generate_reductions_for_type_rt!(true, u8, usize); -// lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, usize); -// lamellar_impl::generate_reductions_for_type_rt!(false, f32); -// lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32); +impl Dist for bool {} +lamellar_impl::generate_reductions_for_type_rt!(true, u8, usize); +lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, usize); +lamellar_impl::generate_reductions_for_type_rt!(false, f32); +lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32); // lamellar_impl::generate_reductions_for_type_rt!(false, u128); // lamellar_impl::generate_ops_for_type_rt!(true, false, true, u128); -lamellar_impl::generate_reductions_for_type_rt!(true, u8, u16, u32, u64, usize); -lamellar_impl::generate_reductions_for_type_rt!(false, u128); -lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, u16, u32, u64, usize); -lamellar_impl::generate_ops_for_type_rt!(true, false, true, u128); +// lamellar_impl::generate_reductions_for_type_rt!(true, u8, u16, u32, u64, usize); +// lamellar_impl::generate_reductions_for_type_rt!(false, u128); +// lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, u16, u32, u64, usize); +// lamellar_impl::generate_ops_for_type_rt!(true, false, true, u128); -lamellar_impl::generate_reductions_for_type_rt!(true, i8, i16, i32, i64, isize); -lamellar_impl::generate_reductions_for_type_rt!(false, i128); -lamellar_impl::generate_ops_for_type_rt!(true, true, true, i8, i16, i32, i64, isize); -lamellar_impl::generate_ops_for_type_rt!(true, false, true, i128); +// lamellar_impl::generate_reductions_for_type_rt!(true, i8, i16, i32, i64, isize); +// lamellar_impl::generate_reductions_for_type_rt!(false, i128); +// lamellar_impl::generate_ops_for_type_rt!(true, true, true, i8, i16, i32, i64, isize); +// lamellar_impl::generate_ops_for_type_rt!(true, false, true, i128); -lamellar_impl::generate_reductions_for_type_rt!(false, f32, f64); -lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32, f64); +// lamellar_impl::generate_reductions_for_type_rt!(false, f32, f64); +// lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32, f64); -lamellar_impl::generate_ops_for_bool_rt!(); +// lamellar_impl::generate_ops_for_bool_rt!(); impl Dist for Option {} impl ArrayOps for Option {} @@ -818,9 +818,8 @@ pub trait InnerArray: Sized { pub(crate) mod private { use crate::active_messaging::*; use crate::array::{ - AtomicArray, GlobalLockArray, LamellarByteArray, - /*NativeAtomicArray, GenericAtomicArray,*/ LamellarReadArray, LamellarWriteArray, - LocalLockArray, ReadOnlyArray, UnsafeArray, + AtomicArray, GenericAtomicArray, GlobalLockArray, LamellarByteArray, LamellarReadArray, + LamellarWriteArray, LocalLockArray, NativeAtomicArray, ReadOnlyArray, UnsafeArray, }; use crate::memregion::Dist; use crate::LamellarTeamRT; diff --git a/src/array/atomic.rs b/src/array/atomic.rs index e58d6687..e8ff5ef6 100644 --- a/src/array/atomic.rs +++ b/src/array/atomic.rs @@ -3,7 +3,8 @@ pub(crate) mod operations; pub(crate) mod rdma; use crate::array::generic_atomic::{GenericAtomicElement, LocalGenericAtomicElement}; - +use crate::array::iterator::distributed_iterator::DistIteratorLauncher; +use crate::array::iterator::local_iterator::LocalIteratorLauncher; use crate::array::native_atomic::NativeAtomicElement; use 
crate::array::private::LamellarArrayPrivate; use crate::array::*; @@ -510,7 +511,7 @@ impl std::fmt::Debug for AtomicElement { /// as such there can be many concurrent threads modifying the array at any given time. /// /// Generally any operation on this array type will be performed via an internal runtime Active Message, i.e. direct RDMA operations are not allowed -#[enum_dispatch(LamellarArray,LamellarEnv,LamellarArrayInternalGet,LamellarArrayInternalPut,ArrayExecAm,LamellarArrayPrivate,DistIteratorLauncher,LocalIteratorLauncher)] +#[enum_dispatch(LamellarArray,LamellarEnv,LamellarArrayInternalGet,LamellarArrayInternalPut,ArrayExecAm,LamellarArrayPrivate)] // #[enum_dispatch(LamellarArray,LamellarEnv,LamellarArrayInternalGet,LamellarArrayInternalPut,ArrayExecAm,LamellarArrayPrivate)] #[derive(serde::Serialize, serde::Deserialize, Clone, Debug)] #[serde(bound = "T: Dist + serde::Serialize + serde::de::DeserializeOwned + 'static")] @@ -521,11 +522,9 @@ pub enum AtomicArray { GenericAtomicArray(GenericAtomicArray), } -// impl DistIteratorLauncher for AtomicArray { -// // type Inner = Self; -// } +impl DistIteratorLauncher for AtomicArray {} -// impl LocalIteratorLauncher for AtomicArray {} +impl LocalIteratorLauncher for AtomicArray {} impl crate::active_messaging::DarcSerde for AtomicArray { fn ser(&self, num_pes: usize, darcs: &mut Vec) { diff --git a/src/array/generic_atomic/iteration.rs b/src/array/generic_atomic/iteration.rs index 3a3f2080..e6fbab98 100644 --- a/src/array/generic_atomic/iteration.rs +++ b/src/array/generic_atomic/iteration.rs @@ -3,9 +3,7 @@ use crate::array::generic_atomic::*; use crate::array::iterator::distributed_iterator::*; use crate::array::iterator::local_iterator::*; use crate::array::iterator::one_sided_iterator::OneSidedIter; -use crate::array::iterator::{ - private::*, LamellarArrayIterators, LamellarArrayMutIterators, Schedule, -}; +use crate::array::iterator::{private::*, LamellarArrayIterators, LamellarArrayMutIterators}; use crate::array::r#unsafe::private::UnsafeArrayInner; use crate::array::*; use crate::memregion::Dist; @@ -219,342 +217,5 @@ impl LamellarArrayMutIterators for GenericAtomicArray { } impl DistIteratorLauncher for GenericAtomicArray {} -// // type Inner = Self; -// fn global_index_from_local(&self, index: usize, chunk_size: usize) -> Option { -// self.array.global_index_from_local(index, chunk_size) -// } -// fn subarray_index_from_local(&self, index: usize, chunk_size: usize) -> Option { -// self.array.subarray_index_from_local(index, chunk_size) -// } - -// // fn subarray_pe_and_offset_for_global_index(&self, index: usize, chunk_size: usize) -> Option<(usize,usize)> { -// // self.array.subarray_pe_and_offset_for_global_index(index, chunk_size) -// // } - -// fn for_each(&self, iter: &I, op: F) -> DistIterForEachHandle -// where -// I: DistributedIterator + 'static, -// F: Fn(I::Item) + SyncSend + Clone + 'static, -// { -// DistIteratorLauncher::for_each(&self.array, iter, op) -// } -// fn for_each_with_schedule( -// &self, -// sched: Schedule, -// iter: &I, -// op: F, -// ) -> DistIterForEachHandle -// where -// I: DistributedIterator + 'static, -// F: Fn(I::Item) + SyncSend + Clone + 'static, -// { -// DistIteratorLauncher::for_each_with_schedule(&self.array, sched, iter, op) -// } -// fn for_each_async(&self, iter: &I, op: F) -> DistIterForEachHandle -// where -// I: DistributedIterator + 'static, -// F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, -// Fut: Future + Send + 'static, -// { -// 
DistIteratorLauncher::for_each_async(&self.array, iter, op) -// } -// fn for_each_async_with_schedule( -// &self, -// sched: Schedule, -// iter: &I, -// op: F, -// ) -> DistIterForEachHandle -// where -// I: DistributedIterator + 'static, -// F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, -// Fut: Future + Send + 'static, -// { -// DistIteratorLauncher::for_each_async_with_schedule(&self.array, sched, iter, op) -// } - -// fn reduce(&self, iter: &I, op: F) -> DistIterReduceHandle -// where -// I: DistributedIterator + 'static, -// I::Item: Dist + ArrayOps, -// F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, -// { -// DistIteratorLauncher::reduce(&self.array, iter, op) -// } - -// fn reduce_with_schedule( -// &self, -// sched: Schedule, -// iter: &I, -// op: F, -// ) -> DistIterReduceHandle -// where -// I: DistributedIterator + 'static, -// I::Item: Dist + ArrayOps, -// F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, -// { -// DistIteratorLauncher::reduce_with_schedule(&self.array, sched, iter, op) -// } - -// fn collect(&self, iter: &I, d: Distribution) -> DistIterCollectHandle -// where -// I: DistributedIterator + 'static, -// I::Item: Dist + ArrayOps, -// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, -// { -// DistIteratorLauncher::collect(&self.array, iter, d) -// } - -// fn collect_with_schedule( -// &self, -// sched: Schedule, -// iter: &I, -// d: Distribution, -// ) -> DistIterCollectHandle -// where -// I: DistributedIterator + 'static, -// I::Item: Dist + ArrayOps, -// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, -// { -// DistIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) -// } -// fn collect_async(&self, iter: &I, d: Distribution) -> DistIterCollectHandle -// where -// I: DistributedIterator, -// I::Item: Future + Send + 'static, -// B: Dist + ArrayOps, -// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, -// { -// DistIteratorLauncher::collect_async(&self.array, iter, d) -// } - -// fn collect_async_with_schedule( -// &self, -// sched: Schedule, -// iter: &I, -// d: Distribution, -// ) -> DistIterCollectHandle -// where -// I: DistributedIterator, -// I::Item: Future + Send + 'static, -// B: Dist + ArrayOps, -// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, -// { -// DistIteratorLauncher::collect_async_with_schedule(&self.array, sched, iter, d) -// } - -// fn count(&self, iter: &I) -> DistIterCountHandle -// where -// I: DistributedIterator + 'static, -// { -// DistIteratorLauncher::count(&self.array, iter) -// } - -// fn count_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterCountHandle -// where -// I: DistributedIterator + 'static, -// { -// DistIteratorLauncher::count_with_schedule(&self.array, sched, iter) -// } - -// fn sum(&self, iter: &I) -> DistIterSumHandle -// where -// I: DistributedIterator + 'static, -// I::Item: Dist + ArrayOps + std::iter::Sum, -// { -// DistIteratorLauncher::sum(&self.array, iter) -// } - -// fn sum_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterSumHandle -// where -// I: DistributedIterator + 'static, -// I::Item: Dist + ArrayOps + std::iter::Sum, -// { -// DistIteratorLauncher::sum_with_schedule(&self.array, sched, iter) -// } - -// fn team(&self) -> Pin> { -// self.array.team_rt().clone() -// } -// } - -impl LocalIteratorLauncher for GenericAtomicArray { - fn local_global_index_from_local(&self, index: usize, chunk_size: usize) -> Option { - 
self.array.local_global_index_from_local(index, chunk_size) - } - - fn local_subarray_index_from_local(&self, index: usize, chunk_size: usize) -> Option { - self.array - .local_subarray_index_from_local(index, chunk_size) - } - - fn for_each(&self, iter: &I, op: F) -> LocalIterForEachHandle - where - I: LocalIterator + 'static, - F: Fn(I::Item) + SyncSend + Clone + 'static, - { - LocalIteratorLauncher::for_each(&self.array, iter, op) - } - fn for_each_with_schedule( - &self, - sched: Schedule, - iter: &I, - op: F, - ) -> LocalIterForEachHandle - where - I: LocalIterator + 'static, - F: Fn(I::Item) + SyncSend + Clone + 'static, - { - LocalIteratorLauncher::for_each_with_schedule(&self.array, sched, iter, op) - } - fn for_each_async(&self, iter: &I, op: F) -> LocalIterForEachHandle - where - I: LocalIterator + 'static, - F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, - Fut: Future + Send + 'static, - { - LocalIteratorLauncher::for_each_async(&self.array, iter, op) - } - fn for_each_async_with_schedule( - &self, - sched: Schedule, - iter: &I, - op: F, - ) -> LocalIterForEachHandle - where - I: LocalIterator + 'static, - F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, - Fut: Future + Send + 'static, - { - LocalIteratorLauncher::for_each_async_with_schedule(&self.array, sched, iter, op) - } - - fn reduce(&self, iter: &I, op: F) -> LocalIterReduceHandle - where - I: LocalIterator + 'static, - I::Item: SyncSend, - F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, - { - LocalIteratorLauncher::reduce(&self.array, iter, op) - } - - fn reduce_with_schedule( - &self, - sched: Schedule, - iter: &I, - op: F, - ) -> LocalIterReduceHandle - where - I: LocalIterator + 'static, - I::Item: SyncSend, - F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, - { - LocalIteratorLauncher::reduce_with_schedule(&self.array, sched, iter, op) - } - - // fn reduce_async(&self, iter: &I, op: F) -> Pin> + Send>> - // where - // I: LocalIterator + 'static, - // I::Item: SyncSend, - // F: Fn(I::Item, I::Item) -> Fut + SyncSend + Clone + 'static, - // Fut: Future + SyncSend + Clone + 'static - // { - // self.array.reduce_async(iter, op) - // } - - // fn reduce_async_with_schedule(&self, sched: Schedule, iter: &I, op: F) -> Pin> + Send>> - // where - // I: LocalIterator + 'static, - // I::Item: SyncSend, - // F: Fn(I::Item, I::Item) -> Fut + SyncSend + Clone + 'static, - // Fut: Future + SyncSend + Clone + 'static - // { - // self.array.reduce_async_with_schedule(sched, iter, op) - // } - - fn collect(&self, iter: &I, d: Distribution) -> LocalIterCollectHandle - where - I: LocalIterator + 'static, - I::Item: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - LocalIteratorLauncher::collect(&self.array, iter, d) - } - - fn collect_with_schedule( - &self, - sched: Schedule, - iter: &I, - d: Distribution, - ) -> LocalIterCollectHandle - where - I: LocalIterator + 'static, - I::Item: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - LocalIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) - } - - // fn collect_async( - // &self, - // iter: &I, - // d: Distribution, - // ) -> Pin + Send>> - // where - // I: LocalIterator + 'static, - // I::Item: Future + Send + 'static, - // B: Dist + ArrayOps, - // A: From> + SyncSend + Clone + 'static, - // { - // self.array.collect_async(iter, d) - // } - - // fn collect_async_with_schedule( - // &self, - // sched: Schedule, - // iter: &I, - // d: 
Distribution, - // ) -> Pin + Send>> - // where - // I: LocalIterator + 'static, - // I::Item: Future + Send + 'static, - // B: Dist + ArrayOps, - // A: From> + SyncSend + Clone + 'static, - // { - // self.array.collect_async_with_schedule(sched, iter, d) - // } - - fn count(&self, iter: &I) -> LocalIterCountHandle - where - I: LocalIterator + 'static, - { - LocalIteratorLauncher::count(&self.array, iter) - } - - fn count_with_schedule(&self, sched: Schedule, iter: &I) -> LocalIterCountHandle - where - I: LocalIterator + 'static, - { - LocalIteratorLauncher::count_with_schedule(&self.array, sched, iter) - } - - fn sum(&self, iter: &I) -> LocalIterSumHandle - where - I: LocalIterator + 'static, - I::Item: SyncSend + std::iter::Sum, - { - LocalIteratorLauncher::sum(&self.array, iter) - } - - fn sum_with_schedule(&self, sched: Schedule, iter: &I) -> LocalIterSumHandle - where - I: LocalIterator + 'static, - I::Item: SyncSend + std::iter::Sum, - { - LocalIteratorLauncher::sum_with_schedule(&self.array, sched, iter) - } - - fn team(&self) -> Pin> { - self.array.team_rt().clone() - } -} +impl LocalIteratorLauncher for GenericAtomicArray {} diff --git a/src/array/global_lock_atomic/iteration.rs b/src/array/global_lock_atomic/iteration.rs index 8ca22109..193984f6 100644 --- a/src/array/global_lock_atomic/iteration.rs +++ b/src/array/global_lock_atomic/iteration.rs @@ -5,7 +5,7 @@ use crate::array::iterator::local_iterator::*; use crate::array::iterator::one_sided_iterator::OneSidedIter; use crate::array::iterator::{ private::{IterClone, Sealed}, - LamellarArrayIterators, LamellarArrayMutIterators, Schedule, + LamellarArrayIterators, LamellarArrayMutIterators, }; use crate::array::private::LamellarArrayPrivate; use crate::array::r#unsafe::private::UnsafeArrayInner; @@ -414,342 +414,5 @@ impl LamellarArrayMutIterators for GlobalLockArray { } impl DistIteratorLauncher for GlobalLockArray {} -// // type Inner = Self; -// fn global_index_from_local(&self, index: usize, chunk_size: usize) -> Option { -// self.array.global_index_from_local(index, chunk_size) -// } - -// fn subarray_index_from_local(&self, index: usize, chunk_size: usize) -> Option { -// self.array.subarray_index_from_local(index, chunk_size) -// } - -// // fn subarray_pe_and_offset_for_global_index(&self, index: usize, chunk_size: usize) -> Option<(usize,usize)> { -// // self.array.subarray_pe_and_offset_for_global_index(index, chunk_size) -// // } - -// fn for_each(&self, iter: &I, op: F) -> DistIterForEachHandle -// where -// I: DistributedIterator + 'static, -// F: Fn(I::Item) + SyncSend + Clone + 'static, -// { -// DistIteratorLauncher::for_each(&self.array, iter, op) -// } -// fn for_each_with_schedule( -// &self, -// sched: Schedule, -// iter: &I, -// op: F, -// ) -> DistIterForEachHandle -// where -// I: DistributedIterator + 'static, -// F: Fn(I::Item) + SyncSend + Clone + 'static, -// { -// DistIteratorLauncher::for_each_with_schedule(&self.array, sched, iter, op) -// } -// fn for_each_async(&self, iter: &I, op: F) -> DistIterForEachHandle -// where -// I: DistributedIterator + 'static, -// F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, -// Fut: Future + Send + 'static, -// { -// DistIteratorLauncher::for_each_async(&self.array, iter, op) -// } -// fn for_each_async_with_schedule( -// &self, -// sched: Schedule, -// iter: &I, -// op: F, -// ) -> DistIterForEachHandle -// where -// I: DistributedIterator + 'static, -// F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, -// Fut: Future + Send + 'static, -// { -// 
DistIteratorLauncher::for_each_async_with_schedule(&self.array, sched, iter, op) -// } - -// fn reduce(&self, iter: &I, op: F) -> DistIterReduceHandle -// where -// I: DistributedIterator + 'static, -// I::Item: Dist + ArrayOps, -// F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, -// { -// DistIteratorLauncher::reduce(&self.array, iter, op) -// } - -// fn reduce_with_schedule( -// &self, -// sched: Schedule, -// iter: &I, -// op: F, -// ) -> DistIterReduceHandle -// where -// I: DistributedIterator + 'static, -// I::Item: Dist + ArrayOps, -// F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, -// { -// DistIteratorLauncher::reduce_with_schedule(&self.array, sched, iter, op) -// } - -// fn collect(&self, iter: &I, d: Distribution) -> DistIterCollectHandle -// where -// I: DistributedIterator + 'static, -// I::Item: Dist + ArrayOps, -// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, -// { -// DistIteratorLauncher::collect(&self.array, iter, d) -// } - -// fn collect_with_schedule( -// &self, -// sched: Schedule, -// iter: &I, -// d: Distribution, -// ) -> DistIterCollectHandle -// where -// I: DistributedIterator + 'static, -// I::Item: Dist + ArrayOps, -// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, -// { -// DistIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) -// } -// fn collect_async(&self, iter: &I, d: Distribution) -> DistIterCollectHandle -// where -// I: DistributedIterator, -// I::Item: Future + Send + 'static, -// B: Dist + ArrayOps, -// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, -// { -// DistIteratorLauncher::collect_async(&self.array, iter, d) -// } - -// fn collect_async_with_schedule( -// &self, -// sched: Schedule, -// iter: &I, -// d: Distribution, -// ) -> DistIterCollectHandle -// where -// I: DistributedIterator, -// I::Item: Future + Send + 'static, -// B: Dist + ArrayOps, -// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, -// { -// DistIteratorLauncher::collect_async_with_schedule(&self.array, sched, iter, d) -// } - -// fn count(&self, iter: &I) -> DistIterCountHandle -// where -// I: DistributedIterator + 'static, -// { -// DistIteratorLauncher::count(&self.array, iter) -// } - -// fn count_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterCountHandle -// where -// I: DistributedIterator + 'static, -// { -// DistIteratorLauncher::count_with_schedule(&self.array, sched, iter) -// } - -// fn sum(&self, iter: &I) -> DistIterSumHandle -// where -// I: DistributedIterator + 'static, -// I::Item: Dist + ArrayOps + std::iter::Sum, -// { -// DistIteratorLauncher::sum(&self.array, iter) -// } - -// fn sum_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterSumHandle -// where -// I: DistributedIterator + 'static, -// I::Item: Dist + ArrayOps + std::iter::Sum, -// { -// DistIteratorLauncher::sum_with_schedule(&self.array, sched, iter) -// } - -// fn team(&self) -> Pin> { -// self.array.team_rt().clone() -// } -// } - -impl LocalIteratorLauncher for GlobalLockArray { - fn local_global_index_from_local(&self, index: usize, chunk_size: usize) -> Option { - self.array.local_global_index_from_local(index, chunk_size) - } - - fn local_subarray_index_from_local(&self, index: usize, chunk_size: usize) -> Option { - self.array - .local_subarray_index_from_local(index, chunk_size) - } - - fn for_each(&self, iter: &I, op: F) -> LocalIterForEachHandle - where - I: LocalIterator + 'static, - F: Fn(I::Item) + SyncSend + Clone + 
'static, - { - LocalIteratorLauncher::for_each(&self.array, iter, op) - } - fn for_each_with_schedule( - &self, - sched: Schedule, - iter: &I, - op: F, - ) -> LocalIterForEachHandle - where - I: LocalIterator + 'static, - F: Fn(I::Item) + SyncSend + Clone + 'static, - { - LocalIteratorLauncher::for_each_with_schedule(&self.array, sched, iter, op) - } - fn for_each_async(&self, iter: &I, op: F) -> LocalIterForEachHandle - where - I: LocalIterator + 'static, - F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, - Fut: Future + Send + 'static, - { - LocalIteratorLauncher::for_each_async(&self.array, iter, op) - } - fn for_each_async_with_schedule( - &self, - sched: Schedule, - iter: &I, - op: F, - ) -> LocalIterForEachHandle - where - I: LocalIterator + 'static, - F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, - Fut: Future + Send + 'static, - { - LocalIteratorLauncher::for_each_async_with_schedule(&self.array, sched, iter, op) - } - - fn reduce(&self, iter: &I, op: F) -> LocalIterReduceHandle - where - I: LocalIterator + 'static, - I::Item: SyncSend, - F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, - { - LocalIteratorLauncher::reduce(&self.array, iter, op) - } - - fn reduce_with_schedule( - &self, - sched: Schedule, - iter: &I, - op: F, - ) -> LocalIterReduceHandle - where - I: LocalIterator + 'static, - I::Item: SyncSend, - F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, - { - LocalIteratorLauncher::reduce_with_schedule(&self.array, sched, iter, op) - } - - // fn reduce_async(&self, iter: &I, op: F) -> Pin> + Send>> - // where - // I: LocalIterator + 'static, - // I::Item: SyncSend, - // F: Fn(I::Item, I::Item) -> Fut + SyncSend + Clone + 'static, - // Fut: Future + SyncSend + Clone + 'static - // { - // self.array.reduce_async(iter, op) - // } - - // fn reduce_async_with_schedule(&self, sched: Schedule, iter: &I, op: F) -> Pin> + Send>> - // where - // I: LocalIterator + 'static, - // I::Item: SyncSend, - // F: Fn(I::Item, I::Item) -> Fut + SyncSend + Clone + 'static, - // Fut: Future + SyncSend + Clone + 'static - // { - // self.array.reduce_async_with_schedule(sched, iter, op) - // } - - fn collect(&self, iter: &I, d: Distribution) -> LocalIterCollectHandle - where - I: LocalIterator + 'static, - I::Item: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - LocalIteratorLauncher::collect(&self.array, iter, d) - } - - fn collect_with_schedule( - &self, - sched: Schedule, - iter: &I, - d: Distribution, - ) -> LocalIterCollectHandle - where - I: LocalIterator + 'static, - I::Item: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - LocalIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) - } - - // fn collect_async( - // &self, - // iter: &I, - // d: Distribution, - // ) -> Pin + Send>> - // where - // I: LocalIterator + 'static, - // I::Item: Future + Send + 'static, - // B: Dist + ArrayOps, - // A: From> + SyncSend + Clone + 'static, - // { - // self.array.collect_async(iter, d) - // } - - // fn collect_async_with_schedule( - // &self, - // sched: Schedule, - // iter: &I, - // d: Distribution, - // ) -> Pin + Send>> - // where - // I: LocalIterator + 'static, - // I::Item: Future + Send + 'static, - // B: Dist + ArrayOps, - // A: From> + SyncSend + Clone + 'static, - // { - // self.array.collect_async_with_schedule(sched, iter, d) - // } - - fn count(&self, iter: &I) -> LocalIterCountHandle - where - I: LocalIterator + 'static, - { - 
LocalIteratorLauncher::count(&self.array, iter) - } - - fn count_with_schedule(&self, sched: Schedule, iter: &I) -> LocalIterCountHandle - where - I: LocalIterator + 'static, - { - LocalIteratorLauncher::count_with_schedule(&self.array, sched, iter) - } - - fn sum(&self, iter: &I) -> LocalIterSumHandle - where - I: LocalIterator + 'static, - I::Item: SyncSend + std::iter::Sum, - { - LocalIteratorLauncher::sum(&self.array, iter) - } - - fn sum_with_schedule(&self, sched: Schedule, iter: &I) -> LocalIterSumHandle - where - I: LocalIterator + 'static, - I::Item: SyncSend + std::iter::Sum, - { - LocalIteratorLauncher::sum_with_schedule(&self.array, sched, iter) - } - - fn team(&self) -> Pin> { - self.array.team_rt().clone() - } -} + +impl LocalIteratorLauncher for GlobalLockArray {} diff --git a/src/array/iterator/consumer.rs b/src/array/iterator/consumer.rs index 35589d04..83150174 100644 --- a/src/array/iterator/consumer.rs +++ b/src/array/iterator/consumer.rs @@ -92,22 +92,6 @@ impl IterSchedule { } } } - // pub(crate) fn monotonic_iter(&self, iter: I) -> IterScheduleIter { - // match self { - // IterSchedule::Static(start, end) => { - // IterScheduleIter::Static(iter.monotonic::().init(*start,end-start)) - // } - // IterSchedule::Dynamic(cur_i, max_i) => { - // IterScheduleIter::Dynamic(iter.monotonic::(), cur_i.clone(), *max_i) - // } - // IterSchedule::Chunk(ranges, range_i) => { - // IterScheduleIter::Chunk(iter.monotonic::().init(0,0), ranges.clone(),range_i.clone()) - // } - // IterSchedule::WorkStealing(range, siblings) => { - // let (start, end) = *range.range.lock(); - // IterScheduleIter::WorkStealing(iter.monotonic::().init(start, end-start), range.clone(), siblings.clone()) } - // } - // } } pub(crate) enum IterScheduleIter { @@ -194,7 +178,3 @@ pub(crate) trait IterConsumer: SyncSend { ) -> Self::Handle; fn max_elems(&self, in_elems: usize) -> usize; } - -// pub(crate) trait MonotonicIterConsumer: IterConsumer{ -// fn monotonic(&self) -> I; -// } diff --git a/src/array/iterator/distributed_iterator.rs b/src/array/iterator/distributed_iterator.rs index 339f722b..01caf5cd 100644 --- a/src/array/iterator/distributed_iterator.rs +++ b/src/array/iterator/distributed_iterator.rs @@ -48,254 +48,99 @@ use crate::active_messaging::SyncSend; use enum_dispatch::enum_dispatch; use futures_util::Future; +use paste::paste; use std::marker::PhantomData; use std::pin::Pin; use std::sync::Arc; -// //#[doc(hidden)] -// pub struct DistIterForEachHandle { -// pub(crate) reqs: Vec>>, -// } - -// impl Drop for DistIterForEachHandle { -// fn drop(&mut self) { -// println!("dropping DistIterForEachHandle"); -// } -// } - -// //#[doc(hidden)] -// #[async_trait] -// impl IterRequest for DistIterForEachHandle { -// type Output = (); -// async fn into_future(mut self: Box) -> Self::Output { -// for req in self.reqs.drain(..) { -// req.await; -// } -// } -// fn wait(mut self: Box) -> Self::Output { -// for req in self.reqs.drain(..) 
{ -// req.blocking_wait(); -// } -// } -// } - -// //#[doc(hidden)] -// pub struct DistIterCollectHandle> + SyncSend> { -// pub(crate) reqs: Vec>>>, -// pub(crate) distribution: Distribution, -// pub(crate) team: Pin>, -// pub(crate) _phantom: PhantomData, -// } - -// impl> + SyncSend> DistIterCollectHandle { -// fn create_array(&self, local_vals: &Vec) -> A { -// self.team.tasking_barrier(); -// let local_sizes = -// UnsafeArray::::new(self.team.clone(), self.team.num_pes, Distribution::Block); -// unsafe { -// local_sizes.local_as_mut_slice()[0] = local_vals.len(); -// } -// local_sizes.barrier(); -// // local_sizes.print(); -// let mut size = 0; -// let mut my_start = 0; -// let my_pe = self.team.team_pe.expect("pe not part of team"); -// // local_sizes.print(); -// unsafe { -// local_sizes -// .onesided_iter() -// .into_iter() -// .enumerate() -// .for_each(|(i, local_size)| { -// size += local_size; -// if i < my_pe { -// my_start += local_size; -// } -// }); -// } -// // println!("my_start {} size {}", my_start, size); -// let array = UnsafeArray::::new(self.team.clone(), size, self.distribution); //implcit barrier - -// // safe because only a single reference to array on each PE -// // we calculate my_start so that each pes local vals are guaranteed to not overwrite another pes values. -// unsafe { array.put(my_start, local_vals) }; -// array.into() -// } -// } -// #[async_trait] -// impl> + SyncSend> IterRequest -// for DistIterCollectHandle -// { -// type Output = A; -// async fn into_future(mut self: Box) -> Self::Output { -// let mut local_vals = vec![]; -// for req in self.reqs.drain(0..) { -// let v = req.await; -// local_vals.extend(v); -// } -// self.create_array(&local_vals) -// } -// fn wait(mut self: Box) -> Self::Output { -// let mut local_vals = vec![]; -// for req in self.reqs.drain(0..) 
{ -// let v = req.blocking_wait(); -// local_vals.extend(v); -// } -// self.create_array(&local_vals) -// } -// } - -#[doc(hidden)] -#[enum_dispatch] -pub trait DistIteratorLauncher: InnerArray { - // type Inner: InnerArray; - fn for_each(&self, iter: &I, op: F) -> DistIterForEachHandle - where - I: DistributedIterator + 'static, - F: Fn(I::Item) + SyncSend + Clone + 'static, - Self: InnerArray, - { - // DistIteratorLauncher::for_each_with_schedule(self, Schedule::Static, iter, op) - self.as_inner().for_each(iter, op) - } - - fn for_each_with_schedule( - &self, - sched: Schedule, - iter: &I, - op: F, - ) -> DistIterForEachHandle - where - I: DistributedIterator + 'static, - F: Fn(I::Item) + SyncSend + Clone + 'static, - { - self.as_inner().for_each_with_schedule(sched, iter, op) - } - - fn for_each_async(&self, iter: &I, op: F) -> DistIterForEachHandle - where - I: DistributedIterator + 'static, - F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, - Fut: Future + Send + 'static, - { - self.as_inner().for_each_async(iter, op) - } - - fn for_each_async_with_schedule( - &self, - sched: Schedule, - iter: &I, - op: F, - ) -> DistIterForEachHandle - where - I: DistributedIterator + 'static, - F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, - Fut: Future + Send + 'static, - { - self.as_inner() - .for_each_async_with_schedule(sched, iter, op) - } - - fn reduce(&self, iter: &I, op: F) -> DistIterReduceHandle - where - I: DistributedIterator + 'static, - I::Item: Dist + ArrayOps, - F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, - { - self.as_inner().reduce(iter, op) - } - - fn reduce_with_schedule( - &self, - sched: Schedule, - iter: &I, - op: F, - ) -> DistIterReduceHandle - where - I: DistributedIterator + 'static, - I::Item: Dist + ArrayOps, - F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, - { - self.as_inner().reduce_with_schedule(sched, iter, op) - } - - fn collect(&self, iter: &I, d: Distribution) -> DistIterCollectHandle - where - I: DistributedIterator + 'static, - I::Item: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - self.as_inner().collect(iter, d) - } - - fn collect_with_schedule( - &self, - sched: Schedule, - iter: &I, - d: Distribution, - ) -> DistIterCollectHandle - where - I: DistributedIterator + 'static, - I::Item: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - self.as_inner().collect_with_schedule(sched, iter, d) - } - - fn collect_async(&self, iter: &I, d: Distribution) -> DistIterCollectHandle - where - I: DistributedIterator, - I::Item: Future + Send + 'static, - B: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - self.as_inner().collect_async(iter, d) - } - - fn collect_async_with_schedule( - &self, - sched: Schedule, - iter: &I, - d: Distribution, - ) -> DistIterCollectHandle - where - I: DistributedIterator, - I::Item: Future + Send + 'static, - B: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - self.as_inner().collect_async_with_schedule(sched, iter, d) - } +macro_rules! consumer_impl { + ($name:ident<$($generics:ident),*>($($arg:ident : $arg_ty:ty),*); [$($return_type: tt)*]; [$($bounds:tt)+] ; [$(-> $($blocking_ret:tt)*)? 
]) => { + fn $name<$($generics),*>(&self, $($arg : $arg_ty),*) -> $($return_type)* + where + $($bounds)+ + { + self.as_inner().$name($($arg),*) + } - fn count(&self, iter: &I) -> DistIterCountHandle - where - I: DistributedIterator + 'static, - { - self.as_inner().count(iter) - } + paste! { + fn [<$name _with_schedule >]<$($generics),*>( + &self, + sched: Schedule, + $($arg : $arg_ty),* + ) -> $($return_type)* + where + $($bounds)+ + { + self.as_inner().[<$name _with_schedule>](sched, $($arg),*) + } - fn count_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterCountHandle - where - I: DistributedIterator + 'static, - { - self.as_inner().count_with_schedule(sched, iter) - } + fn []<$($generics),*>( + &self, + $($arg : $arg_ty),* + ) $(-> $($blocking_ret)*)? + where + $($bounds)+ + { + self.as_inner().[]($($arg),*) + } - fn sum(&self, iter: &I) -> DistIterSumHandle - where - I: DistributedIterator + 'static, - I::Item: Dist + ArrayOps + std::iter::Sum, - { - self.as_inner().sum(iter) - } + fn []<$($generics),*>( + &self, + sched: Schedule, + $($arg : $arg_ty),* + ) $(-> $($blocking_ret)*)? + where + $($bounds)+ + { + self.as_inner().[](sched, $($arg),*) + } + } + }; +} - fn sum_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterSumHandle - where - I: DistributedIterator + 'static, - I::Item: Dist + ArrayOps + std::iter::Sum, - { - self.as_inner().sum_with_schedule(sched, iter) - } +#[doc(hidden)] +pub trait DistIteratorLauncher: InnerArray { + consumer_impl!( + for_each(iter: &I, op: F); + [DistIterForEachHandle]; + [I: DistributedIterator + 'static, F: Fn(I::Item) + SyncSend + Clone + 'static]; + [] + ); + consumer_impl!( + for_each_async(iter: &I, op: F); + [DistIterForEachHandle]; + [I: DistributedIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, Fut: Future + Send + 'static]; + []); + + consumer_impl!( + reduce(iter: &I, op: F); + [DistIterReduceHandle]; + [I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static]; + [-> Option]); + + consumer_impl!( + collect(iter: &I, d: Distribution); + [DistIterCollectHandle]; + [I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static]; + [-> A]); + + consumer_impl!( + collect_async(iter: &I, d: Distribution); + [DistIterCollectHandle]; + [I: DistributedIterator + 'static, I::Item: Future + Send + 'static,B: Dist + ArrayOps,A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static,]; + [-> A]); + + consumer_impl!( + count(iter: &I); + [DistIterCountHandle]; + [I: DistributedIterator + 'static ]; + [-> usize]); + + consumer_impl!( + sum(iter: &I); + [DistIterSumHandle]; + [I: DistributedIterator + 'static, I::Item: Dist + ArrayOps + std::iter::Sum, ]; + [-> I::Item]); //#[doc(hidden)] fn global_index_from_local(&self, index: usize, chunk_size: usize) -> Option { @@ -323,9 +168,6 @@ pub trait DistIteratorLauncher: InnerArray { } } - // //#[doc(hidden)] - // fn subarray_pe_and_offset_for_global_index(&self, index: usize, chunk_size: usize) -> Option<(usize,usize)>; - //#[doc(hidden)] fn team(&self) -> Pin> { self.as_inner().team() @@ -523,6 +365,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// .for_each(move |elem| println!("{:?} {elem}",std::thread::current().id())) /// ); ///``` + #[must_use] fn for_each(&self, op: F) -> DistIterForEachHandle where F: Fn(Self::Item) + SyncSend + Clone + 'static, @@ -530,6 +373,33 @@ pub trait DistributedIterator: 
SyncSend + IterClone + 'static { self.array().for_each(self, op) } + /// Calls a closure on each element of a Distributed Iterator in parallel and distributed on each PE (which owns data of the iterated array). + /// + /// Calling this function invokes an implicit barrier across all PEs in the Array + /// + /// This call utilizes the [Schedule::Static][crate::array::iterator::Schedule] policy. + /// + /// The iteration will have been completed by the time this function returns + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// + /// array + /// .dist_iter() + /// .blocking_for_each(move |elem| println!("{:?} {elem}",std::thread::current().id())) + /// ); + ///``` + fn blocking_for_each(&self, op: F) + where + F: Fn(Self::Item) + SyncSend + Clone + 'static, + { + self.array().blocking_for_each(self, op) + } + /// Calls a closure and immediately awaits the result on each element of a Distributed Iterator in parallel and distributed on each PE (which owns data of the iterated array). /// /// Calling this function invokes an implicit barrier across all PEs in the Array @@ -539,7 +409,6 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// Each thread will only drive a single future at a time. /// /// This function returns a future which can be used to poll for completion of the iteration. - /// Note calling this function launches the iteration regardless of if the returned future is used or not. /// /// # Examples ///``` @@ -560,6 +429,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// fut.await; /// } ///``` + #[must_use] fn for_each_async(&self, op: F) -> DistIterForEachHandle where F: Fn(Self::Item) -> Fut + SyncSend + Clone + 'static, @@ -568,12 +438,47 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { self.array().for_each_async(self, op) } - /// Calls a closure on each element of a Distributed Iterator in parallel and distributed on each PE (which owns data of the iterated array) using the specififed schedule policy. + /// Calls a closure and immediately awaits the result on each element of a Distributed Iterator in parallel and distributed on each PE (which owns data of the iterated array). + /// + /// Calling this function invokes an implicit barrier across all PEs in the Array + /// + /// The supplied closure must return a future. + /// + /// Each thread will only drive a single future at a time. 
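The consumer_impl! macro earlier in this hunk generates four forwarding methods per consumer on DistIteratorLauncher: the handle-returning form, a _with_schedule form, and their blocking counterparts. The paste!-built identifiers render poorly here; judging from the blocking_count/blocking_sum calls made later in this file they presumably expand to blocking_<name> and blocking_<name>_with_schedule. A rough sketch of what the count invocation would expand to (illustrative, not the literal expansion):

fn count<I>(&self, iter: &I) -> DistIterCountHandle
where
    I: DistributedIterator + 'static,
{
    self.as_inner().count(iter)
}
fn count_with_schedule<I>(&self, sched: Schedule, iter: &I) -> DistIterCountHandle
where
    I: DistributedIterator + 'static,
{
    self.as_inner().count_with_schedule(sched, iter)
}
fn blocking_count<I>(&self, iter: &I) -> usize
where
    I: DistributedIterator + 'static,
{
    self.as_inner().blocking_count(iter)
}
fn blocking_count_with_schedule<I>(&self, sched: Schedule, iter: &I) -> usize
where
    I: DistributedIterator + 'static,
{
    self.as_inner().blocking_count_with_schedule(sched, iter)
}

With these defaults in place, the per-array launcher impls earlier in the patch reduce to the empty impl blocks seen above for GenericAtomicArray and GlobalLockArray.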
+ /// + /// Iteration is completed by the time this function returns + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// + /// array.dist_iter().blocking_for_each_async(|elem| async move { + /// async_std::task::yield_now().await; + /// println!("{:?} {elem}",std::thread::current().id()) + /// }); + /// ``` + /// essentially the for_each_async call gets converted into (on each thread) + ///```ignore + /// for fut in array.iter(){ + /// fut.await; + /// } + ///``` + fn blocking_for_each_async(&self, op: F) + where + F: Fn(Self::Item) -> Fut + SyncSend + Clone + 'static, + Fut: Future + Send + 'static, + { + self.array().blocking_for_each_async(self, op) + } + + /// Calls a closure on each element of a Distributed Iterator in parallel and distributed on each PE (which owns data of the iterated array) using the specififed [Schedule][crate::array::iterator::Schedule] policy. /// /// Calling this function invokes an implicit barrier across all PEs in the Array /// /// This function returns a future which can be used to poll for completion of the iteration. - /// Note calling this function launches the iteration regardless of if the returned future is used or not. /// /// # Examples ///``` @@ -585,6 +490,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// array.dist_iter().for_each_with_schedule(Schedule::WorkStealing, |elem| println!("{:?} {elem}",std::thread::current().id())); /// array.wait_all(); ///``` + #[must_use] fn for_each_with_schedule(&self, sched: Schedule, op: F) -> DistIterForEachHandle where F: Fn(Self::Item) + SyncSend + Clone + 'static, @@ -592,7 +498,29 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { self.array().for_each_with_schedule(sched, self, op) } - /// Calls a closure and immediately awaits the result on each element of a Distributed Iterator in parallel and distributed on each PE (which owns data of the iterated array). + /// Calls a closure on each element of a Distributed Iterator in parallel and distributed on each PE (which owns data of the iterated array) using the specififed [Schedule][crate::array::iterator::Schedule] policy. + /// + /// Calling this function invokes an implicit barrier across all PEs in the Array + /// + /// Iteration is completed by the time this function returns + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// + /// array.dist_iter().blocking_for_each_with_schedule(Schedule::WorkStealing, |elem| println!("{:?} {elem}",std::thread::current().id())); + ///``` + fn blocking_for_each_with_schedule(&self, sched: Schedule, op: F) + where + F: Fn(Self::Item) + SyncSend + Clone + 'static, + { + self.array().blocking_for_each_with_schedule(sched, self, op) + } + + /// Calls a closure and immediately awaits the result on each element of a Distributed Iterator in parallel and distributed on each PE (which owns data of the iterated array) using the specififed [Schedule][crate::array::iterator::Schedule] policy. 
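A minimal usage sketch of the two calling styles this section documents, assuming the same world/array setup as the doc examples: for_each hands back a handle that must be driven to completion (hence the new #[must_use]), while blocking_for_each has finished the iteration by the time it returns.

let world = LamellarWorldBuilder::new().build();
let array: ReadOnlyArray<usize> = ReadOnlyArray::new(&world, 100, Distribution::Block);

// handle-returning variant: drive the returned handle to completion
let handle = array.dist_iter().for_each(|elem| println!("{elem}"));
world.block_on(handle);

// blocking variant: the iteration has completed when this call returns
array.dist_iter().blocking_for_each(|elem| println!("{elem}"));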
/// /// Calling this function invokes an implicit barrier across all PEs in the Array, after this barrier no further communication is performed /// as each PE will only process elements local to itself @@ -617,6 +545,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// }); /// array.wait_all(); ///``` + #[must_use] fn for_each_async_with_schedule(&self, sched: Schedule, op: F) -> DistIterForEachHandle where F: Fn(Self::Item) -> Fut + SyncSend + Clone + 'static, @@ -625,9 +554,40 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { self.array().for_each_async_with_schedule(sched, self, op) } + /// Calls a closure and immediately awaits the result on each element of a Distributed Iterator in parallel and distributed on each PE (which owns data of the iterated array) using the specififed [Schedule][crate::array::iterator::Schedule] policy. + /// + /// Calling this function invokes an implicit barrier across all PEs in the Array, after this barrier no further communication is performed + /// as each PE will only process elements local to itself + /// + /// The supplied closure must return a future. + /// + /// Each thread will only drive a single future at a time. + /// + /// Iteration is completed by the time this function returns + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// + /// array.dist_iter().blocking_for_each_async_with_schedule(Schedule::Chunk(10),|elem| async move { + /// async_std::task::yield_now().await; + /// println!("{:?} {elem}",std::thread::current().id()) + /// }); + ///``` + fn blocking_for_each_async_with_schedule(&self, sched: Schedule, op: F) + where + F: Fn(Self::Item) -> Fut + SyncSend + Clone + 'static, + Fut: Future + Send + 'static, + { + self.array().blocking_for_each_async_with_schedule(sched, self, op) + } + /// Reduces the elements of the dist iterator using the provided closure /// - /// This function returns a future which needs to be driven to completion to retrieve the new container. + /// This function returns a future which needs to be driven to completion to retrieve the reduced value. /// /// This call utilizes the [Schedule::Static][crate::array::iterator::Schedule] policy. /// @@ -641,6 +601,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// let req = array.dist_iter().reduce(|acc,elem| acc+elem); /// let sum = array.block_on(req); //wait on the collect request to get the new array ///``` + #[must_use] fn reduce(&self, op: F) -> DistIterReduceHandle where // &'static Self: LocalIterator + 'static, @@ -652,7 +613,31 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// Reduces the elements of the dist iterator using the provided closure /// - /// This function returns a future which needs to be driven to completion to retrieve the new container. + /// The function returns the reduced value + /// + /// This call utilizes the [Schedule::Static][crate::array::iterator::Schedule] policy. 
+ /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// + /// let req = array.dist_iter().blocking_reduce(|acc,elem| acc+elem); + ///``` + fn blocking_reduce(&self, op: F) -> Option + where + // &'static Self: LocalIterator + 'static, + Self::Item: Dist + ArrayOps, + F: Fn(Self::Item, Self::Item) -> Self::Item + SyncSend + Clone + 'static, + { + self.array().blocking_reduce(self, op) + } + + /// Reduces the elements of the dist iterator using the provided closure and [Schedule][crate::array::iterator::Schedule] policy + /// + /// This function returns a future which needs to be driven to completion to retrieve the reduced value. /// /// # Examples ///``` @@ -664,6 +649,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// let req = array.dist_iter().reduce_with_schedule(Schedule::Static,|acc,elem| acc+elem); /// let sum = array.block_on(req); //wait on the collect request to get the new array ///``` + #[must_use] fn reduce_with_schedule(&self, sched: Schedule, op: F) -> DistIterReduceHandle where // &'static Self: LocalIterator + 'static, @@ -673,6 +659,28 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { self.array().reduce_with_schedule(sched, self, op) } + /// Reduces the elements of the dist iterator using the provided closure and [Schedule][crate::array::iterator::Schedule] policy + /// + /// This function returns the reduced value. + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// + /// let req = array.dist_iter().blocking_reduce_with_schedule(Schedule::Static,|acc,elem| acc+elem);//wait on the collect request to get the new array + ///``` + fn blocking_reduce_with_schedule(&self, sched: Schedule, op: F) -> Option + where + // &'static Self: LocalIterator + 'static, + Self::Item: Dist + ArrayOps, + F: Fn(Self::Item, Self::Item) -> Self::Item + SyncSend + Clone + 'static, + { + self.array().blocking_reduce_with_schedule(sched, self, op) + } + /// Collects the elements of the distributed iterator into a new LamellarArray /// /// Calling this function invokes an implicit barrier across all PEs in the Array. @@ -699,6 +707,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// .collect::>(Distribution::Block); /// let new_array = array.block_on(req); //wait on the collect request to get the new array ///``` + #[must_use] fn collect(&self, d: Distribution) -> DistIterCollectHandle where // &'static Self: DistributedIterator + 'static, @@ -708,6 +717,105 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { self.array().collect(self, d) } + /// Collects the elements of the distributed iterator into a new LamellarArray + /// + /// Calling this function invokes an implicit barrier across all PEs in the Array. + /// + /// This function returns the new LamellarArray upon completion. + /// + /// Creating the new array potentially results in data transfers depending on the distribution mode and the fact there is no gaurantee + /// that each PE will contribute an equal number of elements to the new array, and currently LamellarArrays + /// distribute data across the PEs as evenly as possible. + /// + /// This call utilizes the [Schedule::Static][crate::array::iterator::Schedule] policy. 
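A short sketch of the reduce pair under the signatures shown here, assuming the ReadOnlyArray from the doc examples; the map(|elem| *elem) is included so the item type satisfies the Dist + ArrayOps bound, and the blocking form returns Option<Self::Item> (presumably None for an empty iterator).

// handle-returning reduction, driven to completion with block_on
let req = array.dist_iter().map(|elem| *elem).reduce(|acc, elem| acc + elem);
let _sum = array.block_on(req);

// blocking reduction with an explicit schedule policy
let _sum: Option<usize> = array
    .dist_iter()
    .map(|elem| *elem)
    .blocking_reduce_with_schedule(Schedule::Static, |acc, elem| acc + elem);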
+ /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// + /// let new_array = array.dist_iter() + /// .map(|elem| *elem) //because of constraints of collect we need to convert from &usize to usize + /// .filter(|elem| *elem < 10) // (if we didnt do the previous map we would have needed to do **elem) + /// .blocking_collect::>(Distribution::Block); + ///``` + fn blocking_collect(&self, d: Distribution) -> A + where + // &'static Self: DistributedIterator + 'static, + Self::Item: Dist + ArrayOps, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, + { + self.array().blocking_collect(self, d) + } + + /// Collects the elements of the distributed iterator into a new LamellarArray, using the provided [Schedule][crate::array::iterator::Schedule] policy + /// + /// Calling this function invokes an implicit barrier across all PEs in the Array. + /// + /// This function returns a future which needs to be driven to completion to retrieve the new LamellarArray. + /// Calling await on the future will invoke an implicit barrier (allocating the resources for a new array). + /// + /// Creating the new array potentially results in data transfers depending on the distribution mode and the fact there is no gaurantee + /// that each PE will contribute an equal number of elements to the new array, and currently LamellarArrays + /// distribute data across the PEs as evenly as possible. + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// + /// let req = array.dist_iter() + /// .map(|elem| *elem) //because of constraints of collect we need to convert from &usize to usize + /// .filter(|elem| *elem < 10) // (if we didnt do the previous map we would have needed to do **elem) + /// .collect::>(Distribution::Block); + /// let new_array = array.block_on(req); //wait on the collect request to get the new array + ///``` + #[must_use] + fn collect_with_schedule(&self,sched: Schedule, d: Distribution) -> DistIterCollectHandle + where + // &'static Self: DistributedIterator + 'static, + Self::Item: Dist + ArrayOps, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, + { + self.array().collect_with_schedule(sched,self, d) + } + + /// Collects the elements of the distributed iterator into a new LamellarArray, using the provided [Schedule][crate::array::iterator::Schedule] policy + /// + /// Calling this function invokes an implicit barrier across all PEs in the Array. + /// + /// This function returns the new LamellarArray upon completion. + /// + /// Creating the new array potentially results in data transfers depending on the distribution mode and the fact there is no gaurantee + /// that each PE will contribute an equal number of elements to the new array, and currently LamellarArrays + /// distribute data across the PEs as evenly as possible. 
+ /// + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// + /// let new_array = array.dist_iter() + /// .map(|elem| *elem) //because of constraints of collect we need to convert from &usize to usize + /// .filter(|elem| *elem < 10) // (if we didnt do the previous map we would have needed to do **elem) + /// .blocking_collect_with_scheduler::>(Schedule::Dynamic, Distribution::Block); + ///``` + fn blocking_collect_with_schedule(&self,sched: Schedule, d: Distribution) -> A + where + // &'static Self: DistributedIterator + 'static, + Self::Item: Dist + ArrayOps, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, + { + self.array().blocking_collect_with_schedule(sched,self, d) + } + /// Collects the awaited elements of the distributed iterator into a new LamellarArray /// /// Calling this function invokes an implicit barrier across all PEs in the Array. @@ -738,12 +846,13 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// // run collect /// let req /// = array_clone.dist_iter().map( - /// move |elem| + /// move |elem| /// array_clone /// .fetch_add(elem.load(),1000)) /// .collect_async::,_>(Distribution::Cyclic); /// let _new_array = array.block_on(req); ///``` + #[must_use] fn collect_async(&self, d: Distribution) -> DistIterCollectHandle where // &'static Self: DistributedIterator + 'static, @@ -754,9 +863,146 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { self.array().collect_async(self, d) } - /// Counts the number of the elements of the local iterator + /// Collects the awaited elements of the distributed iterator into a new LamellarArray + /// + /// Calling this function invokes an implicit barrier across all PEs in the Array. + /// + /// Each element from the iterator must return a Future + /// + /// Each thread will only drive a single future at a time. + /// + /// The function returns the new LamellarArray upon completion. /// - /// This function returns a future which needs to be driven to completion to retrieve the new container. + /// Creating the new array potentially results in data transfers depending on the distribution mode and the fact there is no gaurantee + /// that each PE will contribute an equal number of elements to the new array, and currently LamellarArrays + /// distribute data across the PEs as evenly as possible. 
+ /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// // initialize a world and an atomic array + /// let world = LamellarWorldBuilder::new().build(); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); + /// + /// // clone the array; this doesn't duplicate the underlying + /// // data but it does create a second pointer that we can + /// // discard when necessary + /// let array_clone = array.clone(); + /// + /// // run collect + /// let _new_array + /// = array_clone.dist_iter().map( + /// move |elem| + /// array_clone + /// .fetch_add(elem.load(),1000)) + /// .blocking_collect_async::,_>(Distribution::Cyclic); + ///``` + fn blocking_collect_async(&self, d: Distribution) -> A + where + // &'static Self: DistributedIterator + 'static, + T: Dist + ArrayOps, + Self::Item: Future + Send + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, + { + self.array().blocking_collect_async(self, d) + } + + /// Collects the awaited elements of the distributed iterator into a new LamellarArray, using the provided [Schedule][crate::array::iterator::Schedule] policy + /// + /// Calling this function invokes an implicit barrier across all PEs in the Array. + /// + /// Each element from the iterator must return a Future + /// + /// Each thread will only drive a single future at a time. + /// + /// This function returns a future which needs to be driven to completion to retrieve the new LamellarArray. + /// Calling await on the future will invoke an implicit barrier (allocating the resources for a new array). + /// + /// Creating the new array potentially results in data transfers depending on the distribution mode and the fact there is no gaurantee + /// that each PE will contribute an equal number of elements to the new array, and currently LamellarArrays + /// distribute data across the PEs as evenly as possible. + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// // initialize a world and an atomic array + /// let world = LamellarWorldBuilder::new().build(); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); + /// + /// // clone the array; this doesn't duplicate the underlying + /// // data but it does create a second pointer that we can + /// // discard when necessary + /// let array_clone = array.clone(); + /// + /// // run collect + /// let req + /// = array_clone.dist_iter().map( + /// move |elem| + /// array_clone + /// .fetch_add(elem.load(),1000)) + /// .collect_async_with_schedule::,_>(Scheduler::Dynamic, Distribution::Cyclic); + /// let _new_array = array.block_on(req); + ///``` + #[must_use] + fn collect_async_with_schedule(&self, sched: Schedule, d: Distribution) -> DistIterCollectHandle + where + // &'static Self: DistributedIterator + 'static, + T: Dist + ArrayOps, + Self::Item: Future + Send + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, + { + self.array().collect_async_with_schedule(sched, self, d) + } + + /// Collects the awaited elements of the distributed iterator into a new LamellarArray,using the provided [Schedule][crate::array::iterator::Schedule] policy + /// + /// Calling this function invokes an implicit barrier across all PEs in the Array. + /// + /// Each element from the iterator must return a Future + /// + /// Each thread will only drive a single future at a time. + /// + /// The function returns the new LamellarArray upon completion. 
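The doc example just below for the _with_schedule variant appears to reuse the plain blocking_collect_async call; under the signature in this hunk (schedule first, then distribution) an invocation would presumably look like the following sketch, which mirrors the AtomicArray setup of the surrounding examples and picks Schedule::Dynamic purely for illustration.

let array_clone = array.clone();
let _new_array = array_clone
    .dist_iter()
    .map(move |elem| array_clone.fetch_add(elem.load(), 1000))
    .blocking_collect_async_with_schedule::<ReadOnlyArray<usize>, _>(
        Schedule::Dynamic,
        Distribution::Cyclic,
    );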
+ /// + /// Creating the new array potentially results in data transfers depending on the distribution mode and the fact there is no gaurantee + /// that each PE will contribute an equal number of elements to the new array, and currently LamellarArrays + /// distribute data across the PEs as evenly as possible. + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// // initialize a world and an atomic array + /// let world = LamellarWorldBuilder::new().build(); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); + /// + /// // clone the array; this doesn't duplicate the underlying + /// // data but it does create a second pointer that we can + /// // discard when necessary + /// let array_clone = array.clone(); + /// + /// // run collect + /// let _new_array + /// = array_clone.dist_iter().map( + /// move |elem| + /// array_clone + /// .fetch_add(elem.load(),1000)) + /// .blocking_collect_async::,_>(Distribution::Cyclic); + ///``` + fn blocking_collect_async_with_schedule(&self, sched: Schedule, d: Distribution) -> A + where + // &'static Self: DistributedIterator + 'static, + T: Dist + ArrayOps, + Self::Item: Future + Send + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, + { + self.array().blocking_collect_async_with_schedule(sched,self, d) + } + + /// Counts the number of the elements of the distriubted iterator + /// + /// Calling this function invokes an implicit barrier and distributed reduction across all PEs in the Array. + /// + /// This function returns a future which needs to be driven to completion to retrieve count. /// /// # Examples ///``` @@ -768,13 +1014,35 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// let req = array.dist_iter().filter(|elem| elem < 10).count(); /// let cnt = array.block_on(req); //wait on the collect request to get the new array ///``` + #[must_use] fn count(&self) -> DistIterCountHandle { self.array().count(self) } - /// Counts the number of the elements of the local iterator + /// Counts the number of the elements of the distributed iterator /// - /// This function returns a future which needs to be driven to completion to retrieve the new container. + /// Calling this function invokes an implicit barrier and distributed reduction across all PEs in the Array. + /// + /// This function returns the count upon completion. + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// + /// let cnt = array.dist_iter().filter(|elem| elem < 10).blocking_count(); + ///``` + fn blocking_count(&self) -> usize { + self.array().blocking_count(self) + } + + /// Counts the number of the elements of the distriubted iterator, using the provided [Schedule][crate::array::iterator::Schedule] policy + /// + /// Calling this function invokes an implicit barrier and distributed reduction across all PEs in the Array. + /// + /// This function returns a future which needs to be driven to completion to retrieve count. /// /// # Examples ///``` @@ -790,9 +1058,31 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { self.array().count_with_schedule(sched, self) } - /// Sums the elements of the local iterator. 
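A compact sketch of the blocking count and sum consumers, assuming the ReadOnlyArray setup from the doc examples; the double dereference in the filter follows the convention noted in the collect examples above.

// global element count, complete when the call returns
let _cnt: usize = array.dist_iter().filter(|elem| **elem < 10).blocking_count();

// global sum with an explicit schedule policy
let _sum: usize = array
    .dist_iter()
    .map(|elem| *elem)
    .blocking_sum_with_schedule(Schedule::Guided);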
+ + /// Counts the number of the elements of the distributed iterator, using the provided [Schedule][crate::array::iterator::Schedule] policy + /// + /// Calling this function invokes an implicit barrier and distributed reduction across all PEs in the Array. + /// + /// This function returns the count upon completion. + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// + /// let cnt = array.dist_iter().filter(|elem| elem < 10).blocking_count_with_schedule(Schedule::Dynamic); + ///``` + fn blocking_count_with_schedule(&self, sched: Schedule) -> usize { + self.array().blocking_count_with_schedule(sched, self) + } + + /// Sums the elements of the distributed iterator. /// /// Takes each element, adds them together, and returns the result. + /// + /// Calling this function invokes an implicit barrier and distributed reduction across all PEs in the Array. /// /// An empty iterator returns the zero value of the type. /// @@ -808,6 +1098,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// let req = array.dist_iter().sum(); /// let sum = array.block_on(req); //wait on the collect request to get the new array ///``` + #[must_use] fn sum(&self) -> DistIterSumHandle where Self::Item: Dist + ArrayOps + std::iter::Sum, @@ -815,9 +1106,37 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { self.array().sum(self) } - /// Sums the elements of the local iterator, using the specified schedule + /// Sums the elements of the distributed iterator. /// /// Takes each element, adds them together, and returns the result. + /// + /// Calling this function invokes an implicit barrier and distributed reduction across all PEs in the Array. + /// + /// An empty iterator returns the zero value of the type. + /// + /// This function returns the sum upon completion. + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// + /// let sum = array.dist_iter().blocking_sum(); + ///``` + fn blocking_sum(&self) -> Self::Item + where + Self::Item: Dist + ArrayOps + std::iter::Sum, + { + self.array().blocking_sum(self) + } + + /// Sums the elements of the distributed iterator, using the specified [Schedule][crate::array::iterator::Schedule] policy + /// + /// Takes each element, adds them together, and returns the result. + /// + /// Calling this function invokes an implicit barrier and distributed reduction across all PEs in the Array. /// /// An empty iterator returns the zero value of the type. /// @@ -833,12 +1152,39 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// let req = array.dist_iter().sum_with_schedule(Schedule::Guided); /// let sum = array.block_on(req); //wait on the collect request to get the new array ///``` + #[must_use] fn sum_with_schedule(&self, sched: Schedule) -> DistIterSumHandle where Self::Item: Dist + ArrayOps + std::iter::Sum, { self.array().sum_with_schedule(sched, self) } + + /// Sums the elements of the distributed iterator, using the specified [Schedule][crate::array::iterator::Schedule] policy + /// + /// Takes each element, adds them together, and returns the result. + /// + /// Calling this function invokes an implicit barrier and distributed reduction across all PEs in the Array. 
+ /// + /// An empty iterator returns the zero value of the type. + /// + /// This function returns the sum upon completion. + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// + /// let sum = array.dist_iter().blocking_sum_with_schedule(Schedule::Guided); + ///``` + fn blocking_sum_with_schedule(&self, sched: Schedule) -> Self::Item + where + Self::Item: Dist + ArrayOps + std::iter::Sum, + { + self.array().blocking_sum_with_schedule(sched, self) + } } /// An interface for dealing with distributed iterators which are indexable, meaning it returns an iterator of known length diff --git a/src/array/iterator/distributed_iterator/consumer.rs b/src/array/iterator/distributed_iterator/consumer.rs index d1438302..406017f1 100644 --- a/src/array/iterator/distributed_iterator/consumer.rs +++ b/src/array/iterator/distributed_iterator/consumer.rs @@ -9,268 +9,3 @@ pub(crate) use count::*; pub(crate) use for_each::*; pub(crate) use reduce::*; pub(crate) use sum::*; - -// use crate::active_messaging::LamellarArcLocalAm; -// use crate::lamellar_request::LamellarRequest; -// use crate::lamellar_team::LamellarTeamRT; -// use crate::array::iterator::distributed_iterator::{DistributedIterator,IterRequest,Monotonic}; - -// use std::sync::Arc; -// use std::sync::atomic::{AtomicUsize,Ordering}; -// use std::pin::Pin; -// use parking_lot::Mutex; -// use rand::thread_rng; -// use rand::prelude::SliceRandom; - -// #[derive(Clone, Debug)] -// pub(crate) struct IterWorkStealer { -// pub(crate) range: Arc>, //start, end -// } - -// impl IterWorkStealer { -// fn set_range(&self, start: usize, end: usize) { -// let mut range = self.range.lock(); -// range.0 = start; -// range.1 = end; -// } - -// fn next(&self) -> Option { -// let mut range = self.range.lock(); -// let index = range.0; -// range.0 += 1; -// if range.0 <= range.1 { -// Some(index) -// } else { -// None -// } -// } -// fn set_done(&self) { -// let mut range = self.range.lock(); -// range.0 = range.1; -// } - -// fn steal(&self) -> Option<(usize, usize)> { -// let mut range = self.range.lock(); -// let start = range.0; -// let end = range.1; -// if end > start && end - start > 2 { -// let new_end = (start + end) / 2; -// range.1 = new_end; -// Some((new_end, end)) -// } else { -// None -// } -// } -// } - -// #[derive(Clone, Debug)] -// pub(crate) enum IterSchedule{ -// Static(usize,usize), -// Dynamic(Arc,usize), -// Chunk(Vec<(usize, usize)>, Arc,), -// WorkStealing(IterWorkStealer, Vec) -// } - -// impl IterSchedule { -// fn init_iter(&self, iter: I) -> IterScheduleIter { -// match self { -// IterSchedule::Static( start, end) => { -// IterScheduleIter::Static(iter.init(*start,end-start)) -// } -// IterSchedule::Dynamic(cur_i, max_i) => { -// IterScheduleIter::Dynamic(iter, cur_i.clone(), *max_i) -// } -// IterSchedule::Chunk(ranges, range_i) => { -// IterScheduleIter::Chunk(iter.init(0,0), ranges.clone(),range_i.clone()) -// } -// IterSchedule::WorkStealing( range, siblings) => { -// let (start, end) = *range.range.lock(); -// IterScheduleIter::WorkStealing(iter.init(start, end-start), range.clone(), siblings.clone()) -// } -// } -// } -// fn monotonic_iter(&self, iter: I) -> IterScheduleIter> { -// match self { -// IterSchedule::Static(start, end) => { -// IterScheduleIter::Static(iter.monotonic().init(*start,end-start)) -// } -// IterSchedule::Dynamic(cur_i, max_i) => { -// 
IterScheduleIter::Dynamic(iter.monotonic(), cur_i.clone(), *max_i) -// } -// IterSchedule::Chunk(ranges, range_i) => { -// IterScheduleIter::Chunk(iter.monotonic().init(0,0), ranges.clone(),range_i.clone()) -// } -// IterSchedule::WorkStealing(range, siblings) => { -// let (start, end) = *range.range.lock(); -// IterScheduleIter::WorkStealing(iter.monotonic().init(start, end-start), range.clone(), siblings.clone()) } -// } -// } -// } - -// pub(crate) enum IterScheduleIter{ -// Static(I), -// Dynamic(I,Arc,usize), -// Chunk(I,Vec<(usize, usize)>, Arc), -// WorkStealing(I,IterWorkStealer, Vec) -// } - -// impl Iterator for IterScheduleIter { -// type Item = I::Item; -// fn next(&mut self) -> Option { -// match self { -// IterScheduleIter::Static(iter) => { -// iter.next() -// } -// IterScheduleIter::Dynamic(iter, cur_i, max_i) => { -// let mut ci = cur_i.fetch_add(1, Ordering::Relaxed); -// while ci < *max_i { -// // println!("ci {:?} maxi {:?} {:?}", ci, *max_i, std::thread::current().id()); -// *iter = iter.init(ci,1); -// if let Some(elem) = iter.next() { -// return Some(elem); -// } -// ci = cur_i.fetch_add(1, Ordering::Relaxed); -// } -// None -// } -// IterScheduleIter::Chunk(iter, ranges, range_i) => { -// let mut next = iter.next(); -// // println!("next {:?} {:?}", next.is_none(), std::thread::current().id()); -// if next.is_none(){ -// let ri = range_i.fetch_add(1, Ordering::Relaxed); -// // println!("range {:?} {:?}", ri, std::thread::current().id()); -// if ri < ranges.len() { -// *iter = iter.init(ranges[ri].0, ranges[ri].1-ranges[ri].0); -// next = iter.next(); -// } -// } -// next -// } -// IterScheduleIter::WorkStealing(iter, range, siblings) => { -// let mut inner_next = |iter: &mut I| { -// while let Some(ri) = range.next(){ -// *iter = iter.init(ri,1); -// if let Some(elem) = iter.next() { -// return Some(elem); -// } -// // else{ -// // range.set_done(); -// // } -// } -// None -// }; -// let mut next = inner_next(iter); -// if next.is_none() { -// let mut rng = thread_rng(); -// let mut workers = (0..siblings.len()).collect::>(); -// workers.shuffle(&mut rng); -// if let Some(worker) = workers.pop() { -// if let Some((start, end)) = siblings[worker].steal() { -// *iter = iter.init(start, end - start); -// range.set_range(start, end); -// next = inner_next(iter); -// } -// } -// } -// next -// } -// } -// } -// } - -// pub(crate) trait IterConsumer{ -// type AmOutput; -// type Output; -// fn into_am(&self, schedule: IterSchedule) -> LamellarArcLocalAm; -// fn create_handle(self, team: Pin>, reqs: Vec>>) -> Box>; -// fn max_elems(&self, in_elems: usize) -> usize; -// } - -// // #[derive(Clone, Debug)] -// // pub(crate) enum IterConsumer{ -// // Collect(Distribution,PhantomData), -// // Count, -// // ForEach(F), -// // Reduce(R), -// // } - -// // impl IterConsumer where -// // I: LocalIterator + 'static, -// // I::Item: SyncSend, -// // A: From> + SyncSend, -// // T: Dist + ArrayOps -// // F: Fn(I::Item) + SyncSend + Clone + 'static, -// // R: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static,{ - -// // fn into_am(self, schedule: IterSchedule) -> Am -// // where -// // A: LamellarActiveMessage + LocalAM + 'static,{ -// // match self { -// // IterConsumer::Collect(_) => { -// // CollectAm{ -// // schedule -// // } -// // } -// // IterConsumer::Count => { -// // CountAm{ -// // schedule -// // } -// // } -// // IterConsumer::ForEach(op) => { -// // ForEachAm{ -// // op, -// // schedule, -// // } -// // } -// // IterConsumer::Reduce(op) => { -// // ReduceAm{ -// 
// op, -// // schedule, -// // } -// // } -// // } -// // } - -// // fn create_handle(self, team: Pin>, reqs: Vec>) -> IterConsumerHandle{ -// // match self { -// // IterConsumer::Collect(dist,phantom) => { -// // IterConsumerHandle::Collect(LocalIterCollectHandle{ -// // reqs: reqs, -// // distribution: dist, -// // team: team, -// // _phantom: phantom, -// // }) -// // } -// // IterConsumer::Count => { -// // IterConsumerHandle::Count(LocalIterCountHandle{ -// // reqs: reqs, -// // }) -// // } -// // IterConsumer::ForEach(_) => { -// // IterConsumerHandle::ForEach(LocalIterForEachHandle{ -// // reqs: reqs, -// // }) -// // } -// // IterConsumer::Reduce(op) => { -// // IterConsumerHandle::Reduce(LocalIterReduceHandle::{ -// // reqs:reqs, -// // op: op -// // }) -// // } -// // } -// // } -// // } - -// // pub(crate) enum IterConsumerHandle{ -// // Collect(LocalIterCollectHandle), -// // Count(LocalIterCountHandle), -// // ForEach(LocalIterForEachHandle), -// // Reduce(LocalIterReduceHandle) -// // } - -// // #[async_trait] -// // impl IterConsumerHandle where -// // A: From> + SyncSend, -// // T: Dist + ArrayOps -// // F: Fn(I::Item) + SyncSend + Clone + 'static, -// // R: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static,{ diff --git a/src/array/iterator/distributed_iterator/consumer/collect.rs b/src/array/iterator/distributed_iterator/consumer/collect.rs index ece9e23e..203a93fd 100644 --- a/src/array/iterator/distributed_iterator/consumer/collect.rs +++ b/src/array/iterator/distributed_iterator/consumer/collect.rs @@ -3,7 +3,9 @@ use crate::array::iterator::consumer::*; use crate::array::iterator::distributed_iterator::{DistributedIterator, Monotonic}; use crate::array::iterator::private::*; use crate::array::operations::ArrayOps; +use crate::array::r#unsafe::private::UnsafeArrayInner; use crate::array::{AsyncTeamFrom, AsyncTeamInto, Distribution, TeamInto}; +use crate::barrier::BarrierHandle; use crate::lamellar_request::LamellarRequest; use crate::lamellar_task_group::TaskGroupLocalAmHandle; use crate::lamellar_team::LamellarTeamRT; @@ -43,7 +45,7 @@ where type AmOutput = Vec<(usize, I::Item)>; type Output = A; type Item = (usize, I::Item); - type Handle = DistIterCollectHandle; + type Handle = InnerDistIterCollectHandle; fn init(&self, start: usize, cnt: usize) -> Self { Collect { iter: self.iter.init(start, cnt), @@ -65,11 +67,11 @@ where team: Pin>, reqs: VecDeque>, ) -> Self::Handle { - DistIterCollectHandle { + InnerDistIterCollectHandle { reqs, distribution: self.distribution, team, - state: State::ReqsPending(Vec::new()), + state: InnerState::ReqsPending(Vec::new()), } } fn max_elems(&self, in_elems: usize) -> usize { @@ -104,7 +106,7 @@ where type AmOutput = Vec<(usize, B)>; type Output = A; type Item = (usize, I::Item); - type Handle = DistIterCollectHandle; + type Handle = InnerDistIterCollectHandle; fn init(&self, start: usize, cnt: usize) -> Self { CollectAsync { iter: self.iter.init(start, cnt), @@ -126,11 +128,11 @@ where team: Pin>, reqs: VecDeque>, ) -> Self::Handle { - DistIterCollectHandle { + InnerDistIterCollectHandle { reqs, distribution: self.distribution, team, - state: State::ReqsPending(Vec::new()), + state: InnerState::ReqsPending(Vec::new()), } } fn max_elems(&self, in_elems: usize) -> usize { @@ -155,24 +157,36 @@ where } //#[doc(hidden)] +// #[pin_project] +// pub struct InnerDistIterCollectHandle< +// T: Dist + ArrayOps, +// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend, +// > { +// pub(crate) reqs: VecDeque>>, +// pub(crate) distribution: 
Distribution, +// pub(crate) team: Pin>, +// state: InnerState, +// } +// enum InnerState, Distribution)> + SyncSend> { +// ReqsPending(Vec<(usize, T)>), +// Collecting(Pin>>), +// } + #[pin_project] -pub struct DistIterCollectHandle< - T: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend, -> { +pub(crate) struct InnerDistIterCollectHandle { pub(crate) reqs: VecDeque>>, pub(crate) distribution: Distribution, pub(crate) team: Pin>, - state: State, + state: InnerState, } -enum State, Distribution)> + SyncSend> { +enum InnerState { ReqsPending(Vec<(usize, T)>), - Collecting(Pin>>), + Collecting(Pin + Send>>), } impl, Distribution)> + SyncSend + 'static> - DistIterCollectHandle + InnerDistIterCollectHandle { async fn async_create_array( local_vals: Vec, @@ -192,13 +206,13 @@ impl, Distribution)> + SyncSend + ' } impl, Distribution)> + SyncSend + 'static> Future - for DistIterCollectHandle + for InnerDistIterCollectHandle { type Output = A; fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { let mut this = self.project(); match &mut this.state { - State::ReqsPending(ref mut vals) => { + InnerState::ReqsPending(ref mut vals) => { while let Some(mut req) = this.reqs.pop_front() { if req.ready_or_set_waker(cx.waker()) { vals.extend(req.val()); @@ -221,12 +235,12 @@ impl, Distribution)> + SyncSend + ' return Poll::Ready(a); } Poll::Pending => { - *this.state = State::Collecting(collect); + *this.state = InnerState::Collecting(collect); return Poll::Pending; } } } - State::Collecting(collect) => { + InnerState::Collecting(collect) => { let a = ready!(Future::poll(collect.as_mut(), cx)); Poll::Ready(a) } @@ -235,7 +249,7 @@ impl, Distribution)> + SyncSend + ' } impl, Distribution)> + SyncSend + 'static> - LamellarRequest for DistIterCollectHandle + LamellarRequest for InnerDistIterCollectHandle { fn blocking_wait(mut self) -> Self::Output { // let mut num_local_vals = 0; @@ -270,6 +284,103 @@ impl, Distribution)> + SyncSend + ' } } +#[pin_project] +pub struct DistIterCollectHandle { + team: Pin>, + #[pin] + state: State, +} + +impl DistIterCollectHandle +where + T: Dist + ArrayOps, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + 'static, +{ + pub(crate) fn new( + barrier_handle: BarrierHandle, + inner: Pin> + Send>>, + array: &UnsafeArrayInner, + ) -> Self { + Self { + team: array.data.team.clone(), + state: State::Barrier(barrier_handle, inner), + } + } +} + +#[pin_project(project = StateProj)] +enum State { + Barrier( + #[pin] BarrierHandle, + Pin> + Send>>, + ), + Reqs(#[pin] InnerDistIterCollectHandle), +} +impl Future for DistIterCollectHandle +where + T: Dist + ArrayOps, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + 'static, +{ + type Output = A; + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let mut this = self.project(); + match this.state.as_mut().project() { + StateProj::Barrier(barrier, inner) => { + ready!(barrier.poll(cx)); + let mut inner = ready!(Future::poll(inner.as_mut(), cx)); + match Pin::new(&mut inner).poll(cx) { + Poll::Ready(val) => Poll::Ready(val), + Poll::Pending => { + *this.state = State::Reqs(inner); + Poll::Pending + } + } + } + StateProj::Reqs(inner) => { + let val = ready!(inner.poll(cx)); + Poll::Ready(val) + } + } + } +} + +//#[doc(hidden)] +impl LamellarRequest for DistIterCollectHandle +where + T: Dist + ArrayOps, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + 'static, +{ + fn blocking_wait(self) -> Self::Output { + match self.state { + State::Barrier(barrier, reqs) => { + barrier.blocking_wait(); + 
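                // Completion happens in two phases here: first the team barrier is waited on,
                // then the stored boxed future is driven with block_on to obtain the
                // InnerDistIterCollectHandle, whose own blocking_wait gathers the per-task
                // results and assembles the new array. Future::poll above mirrors this flow,
                // parking a still-pending inner handle in State::Reqs. The DistIterCountHandle
                // in count.rs below follows the same barrier-then-inner wrapper pattern.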
self.team.block_on(reqs).blocking_wait() + } + State::Reqs(inner) => inner.blocking_wait(), + } + } + fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { + match &mut self.state { + State::Barrier(barrier, _) => { + if !barrier.ready_or_set_waker(waker) { + return false; + } + waker.wake_by_ref(); + false + } + State::Reqs(inner) => inner.ready_or_set_waker(waker), + } + } + fn val(&self) -> Self::Output { + match &self.state { + State::Barrier(_barrier, _reqs) => { + unreachable!("should never be in barrier state when val is called"); + } + State::Reqs(inner) => inner.val(), + } + } +} + #[lamellar_impl::AmLocalDataRT(Clone)] pub(crate) struct CollectAm { pub(crate) iter: Collect, diff --git a/src/array/iterator/distributed_iterator/consumer/count.rs b/src/array/iterator/distributed_iterator/consumer/count.rs index 021df215..7d0a3a78 100644 --- a/src/array/iterator/distributed_iterator/consumer/count.rs +++ b/src/array/iterator/distributed_iterator/consumer/count.rs @@ -3,6 +3,8 @@ use crate::array::iterator::consumer::*; use crate::array::iterator::distributed_iterator::DistributedIterator; use crate::array::iterator::private::*; +use crate::array::r#unsafe::private::UnsafeArrayInner; +use crate::barrier::BarrierHandle; use crate::darc::DarcMode; use crate::lamellar_request::LamellarRequest; use crate::lamellar_task_group::TaskGroupLocalAmHandle; @@ -40,7 +42,7 @@ where type AmOutput = usize; type Output = usize; type Item = I::Item; - type Handle = DistIterCountHandle; + type Handle = InnerDistIterCountHandle; fn init(&self, start: usize, cnt: usize) -> Self { Count { iter: self.iter.init(start, cnt), @@ -60,10 +62,10 @@ where team: Pin>, reqs: VecDeque>, ) -> Self::Handle { - DistIterCountHandle { + InnerDistIterCountHandle { reqs, team, - state: State::ReqsPending(0), + state: InnerState::ReqsPending(0), } } fn max_elems(&self, in_elems: usize) -> usize { @@ -73,15 +75,15 @@ where //#[doc(hidden)] #[pin_project] -pub struct DistIterCountHandle { +pub(crate) struct InnerDistIterCountHandle { pub(crate) reqs: VecDeque>, team: Pin>, - state: State, + state: InnerState, } -enum State { +enum InnerState { ReqsPending(usize), - Counting(Pin>>), + Counting(Pin + Send>>), } #[lamellar_impl::AmDataRT] @@ -97,7 +99,7 @@ impl LamellarAm for UpdateCntAm { } } -impl DistIterCountHandle { +impl InnerDistIterCountHandle { async fn async_reduce_remote_counts(local_cnt: usize, team: Pin>) -> usize { let cnt = Darc::async_try_new(&team, AtomicUsize::new(0), DarcMode::Darc) .await @@ -122,12 +124,12 @@ impl DistIterCountHandle { } } -impl Future for DistIterCountHandle { +impl Future for InnerDistIterCountHandle { type Output = usize; fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { let mut this = self.project(); match &mut this.state { - State::ReqsPending(cnt) => { + InnerState::ReqsPending(cnt) => { while let Some(mut req) = this.reqs.pop_front() { if !req.ready_or_set_waker(cx.waker()) { this.reqs.push_front(req); @@ -142,12 +144,12 @@ impl Future for DistIterCountHandle { return Poll::Ready(count); } Poll::Pending => { - *this.state = State::Counting(global_cnt); + *this.state = InnerState::Counting(global_cnt); Poll::Pending } } } - State::Counting(global_cnt) => { + InnerState::Counting(global_cnt) => { let count = ready!(Future::poll(global_cnt.as_mut(), cx)); Poll::Ready(count) } @@ -157,7 +159,7 @@ impl Future for DistIterCountHandle { //#[doc(hidden)] #[async_trait] -impl LamellarRequest for DistIterCountHandle { +impl LamellarRequest for InnerDistIterCountHandle { fn 
blocking_wait(mut self) -> Self::Output { self.team.tasking_barrier(); let cnt = Darc::new(&self.team, AtomicUsize::new(0)).unwrap(); @@ -191,6 +193,91 @@ impl LamellarRequest for DistIterCountHandle { } } +#[pin_project] +pub struct DistIterCountHandle { + team: Pin>, + #[pin] + state: State, +} + +impl DistIterCountHandle { + pub(crate) fn new( + barrier_handle: BarrierHandle, + inner: Pin + Send>>, + array: &UnsafeArrayInner, + ) -> Self { + Self { + team: array.data.team.clone(), + state: State::Barrier(barrier_handle, inner), + } + } +} + +#[pin_project(project = StateProj)] +enum State { + Barrier( + #[pin] BarrierHandle, + Pin + Send>>, + ), + Reqs(#[pin] InnerDistIterCountHandle), +} +impl Future for DistIterCountHandle { + type Output = usize; + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let mut this = self.project(); + match this.state.as_mut().project() { + StateProj::Barrier(barrier, inner) => { + ready!(barrier.poll(cx)); + let mut inner = ready!(Future::poll(inner.as_mut(), cx)); + match Pin::new(&mut inner).poll(cx) { + Poll::Ready(val) => Poll::Ready(val), + Poll::Pending => { + *this.state = State::Reqs(inner); + Poll::Pending + } + } + } + StateProj::Reqs(inner) => { + let val = ready!(inner.poll(cx)); + Poll::Ready(val) + } + } + } +} + +//#[doc(hidden)] +impl LamellarRequest for DistIterCountHandle { + fn blocking_wait(self) -> Self::Output { + match self.state { + State::Barrier(barrier, reqs) => { + barrier.blocking_wait(); + self.team.block_on(reqs).blocking_wait() + } + State::Reqs(inner) => inner.blocking_wait(), + } + } + fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { + match &mut self.state { + State::Barrier(barrier, _) => { + if !barrier.ready_or_set_waker(waker) { + return false; + } + waker.wake_by_ref(); + false + } + State::Reqs(inner) => inner.ready_or_set_waker(waker), + } + } + fn val(&self) -> Self::Output { + match &self.state { + State::Barrier(_barrier, _reqs) => { + unreachable!("should never be in barrier state when val is called"); + } + State::Reqs(inner) => inner.val(), + } + } +} + #[lamellar_impl::AmLocalDataRT(Clone)] pub(crate) struct CountAm { pub(crate) iter: Count, diff --git a/src/array/iterator/distributed_iterator/consumer/for_each.rs b/src/array/iterator/distributed_iterator/consumer/for_each.rs index dd202970..955c88fc 100644 --- a/src/array/iterator/distributed_iterator/consumer/for_each.rs +++ b/src/array/iterator/distributed_iterator/consumer/for_each.rs @@ -2,11 +2,14 @@ use crate::active_messaging::{LamellarArcLocalAm, SyncSend}; use crate::array::iterator::consumer::*; use crate::array::iterator::distributed_iterator::DistributedIterator; use crate::array::iterator::private::*; +use crate::array::r#unsafe::private::UnsafeArrayInner; +use crate::barrier::BarrierHandle; use crate::lamellar_request::LamellarRequest; use crate::lamellar_task_group::TaskGroupLocalAmHandle; use crate::lamellar_team::LamellarTeamRT; -use futures_util::Future; +use futures_util::{ready, Future}; +use pin_project::pin_project; use std::collections::VecDeque; use std::pin::Pin; use std::sync::Arc; @@ -43,7 +46,7 @@ where type AmOutput = (); type Output = (); type Item = I::Item; - type Handle = DistIterForEachHandle; + type Handle = InnerDistIterForEachHandle; fn init(&self, start: usize, cnt: usize) -> Self { ForEach { iter: self.iter.init(start, cnt), @@ -65,7 +68,7 @@ where _team: Pin>, reqs: VecDeque>, ) -> Self::Handle { - DistIterForEachHandle { reqs } + InnerDistIterForEachHandle { reqs } } fn max_elems(&self, 
in_elems: usize) -> usize { self.iter.elems(in_elems) @@ -107,7 +110,7 @@ where type AmOutput = (); type Output = (); type Item = I::Item; - type Handle = DistIterForEachHandle; + type Handle = InnerDistIterForEachHandle; fn init(&self, start: usize, cnt: usize) -> Self { ForEachAsync { iter: self.iter.init(start, cnt), @@ -130,14 +133,11 @@ where _team: Pin>, reqs: VecDeque>, ) -> Self::Handle { - DistIterForEachHandle { reqs } + InnerDistIterForEachHandle { reqs } } fn max_elems(&self, in_elems: usize) -> usize { self.iter.elems(in_elems) } - // fn clone(&self) -> Self{ - - // } } impl Clone for ForEachAsync @@ -154,12 +154,11 @@ where } } -//#[doc(hidden)] -pub struct DistIterForEachHandle { +pub(crate) struct InnerDistIterForEachHandle { pub(crate) reqs: VecDeque>, } -impl Future for DistIterForEachHandle { +impl Future for InnerDistIterForEachHandle { type Output = (); fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { while let Some(mut req) = self.reqs.pop_front() { @@ -173,7 +172,7 @@ impl Future for DistIterForEachHandle { } //#[doc(hidden)] -impl LamellarRequest for DistIterForEachHandle { +impl LamellarRequest for InnerDistIterForEachHandle { fn blocking_wait(mut self) -> Self::Output { for req in self.reqs.drain(..) { req.blocking_wait(); @@ -195,6 +194,95 @@ impl LamellarRequest for DistIterForEachHandle { } } +//#[doc(hidden)] +#[pin_project] +pub struct DistIterForEachHandle { + // pub(crate) reqs: VecDeque>, + team: Pin>, + #[pin] + state: State, +} + +impl DistIterForEachHandle { + pub(crate) fn new( + barrier: BarrierHandle, + reqs: Pin + Send>>, + array: &UnsafeArrayInner, + ) -> Self { + DistIterForEachHandle { + team: array.data.team.clone(), + state: State::Barrier(barrier, reqs), + } + } +} + +#[pin_project(project = StateProj)] +enum State { + Barrier( + #[pin] BarrierHandle, + Pin + Send>>, + ), + Reqs(#[pin] InnerDistIterForEachHandle), +} +impl Future for DistIterForEachHandle { + type Output = (); + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let mut this = self.project(); + match this.state.as_mut().project() { + StateProj::Barrier(barrier, inner) => { + ready!(barrier.poll(cx)); + let mut inner = ready!(Future::poll(inner.as_mut(), cx)); + match Pin::new(&mut inner).poll(cx) { + Poll::Ready(()) => Poll::Ready(()), + Poll::Pending => { + *this.state = State::Reqs(inner); + Poll::Pending + } + } + } + StateProj::Reqs(inner) => { + ready!(inner.poll(cx)); + Poll::Ready(()) + } + } + } +} + +//#[doc(hidden)] +impl LamellarRequest for DistIterForEachHandle { + fn blocking_wait(self) -> Self::Output { + match self.state { + State::Barrier(barrier, reqs) => { + barrier.blocking_wait(); + self.team.block_on(reqs).blocking_wait(); + } + State::Reqs(inner) => { + inner.blocking_wait(); + } + } + } + fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { + match &mut self.state { + State::Barrier(barrier, _) => { + if !barrier.ready_or_set_waker(waker) { + return false; + } + waker.wake_by_ref(); + false + } + State::Reqs(inner) => inner.ready_or_set_waker(waker), + } + } + fn val(&self) -> Self::Output { + match &self.state { + State::Barrier(_barrier, _reqs) => { + unreachable!("should never be in barrier state when val is called"); + } + State::Reqs(inner) => inner.val(), + } + } +} + #[lamellar_impl::AmLocalDataRT(Clone)] pub(crate) struct ForEachAm where @@ -277,320 +365,3 @@ where } } } - -// #[lamellar_impl::AmLocalDataRT(Clone)] -// pub(crate) struct ForEachStatic -// where -// I: DistributedIterator, -// F: Fn(I::Item), -// 
{ -// pub(crate) op: F, -// pub(crate) data: I, -// pub(crate) start_i: usize, -// pub(crate) end_i: usize, -// } - -// #[lamellar_impl::rt_am_local] -// impl LamellarAm for ForEachStatic -// where -// I: DistributedIterator + 'static, -// F: Fn(I::Item) + SyncSend + 'static, -// { -// async fn exec(&self) { -// let mut iter = self.data.init(self.start_i, self.end_i - self.start_i); -// // println!("for each static thread {:?} {} {} {}",std::thread::current().id(),self.start_i, self.end_i, self.end_i - self.start_i); -// // let mut cnt = 0; -// while let Some(elem) = iter.next() { -// (&self.op)(elem); -// // cnt += 1; -// } -// // println!("thread {:?} elems processed {:?}",std::thread::current().id(), cnt); -// } -// } - -// #[lamellar_impl::AmLocalDataRT(Clone, Debug)] -// pub(crate) struct ForEachDynamic -// where -// I: DistributedIterator, -// F: Fn(I::Item), -// { -// pub(crate) op: F, -// pub(crate) data: I, -// pub(crate) cur_i: Arc, -// pub(crate) max_i: usize, -// } - -// #[lamellar_impl::rt_am_local] -// impl LamellarAm for ForEachDynamic -// where -// I: DistributedIterator + 'static, -// F: Fn(I::Item) + SyncSend + 'static, -// { -// async fn exec(&self) { -// // println!("in for each {:?} {:?}", self.start_i, self.end_i); -// let mut cur_i = self.cur_i.fetch_add(1, Ordering::Relaxed); - -// while cur_i < self.max_i { -// // println!("in for each {:?} {:?} {:?}", range_i, start_i, end_i); -// let mut iter = self.data.init(cur_i, 1); -// while let Some(item) = iter.next() { -// (self.op)(item); -// } -// cur_i = self.cur_i.fetch_add(1, Ordering::Relaxed); -// } -// // println!("done in for each"); -// } -// } - -// #[lamellar_impl::AmLocalDataRT(Clone, Debug)] -// pub(crate) struct ForEachChunk -// where -// I: DistributedIterator, -// F: Fn(I::Item), -// { -// pub(crate) op: F, -// pub(crate) data: I, -// pub(crate) ranges: Vec<(usize, usize)>, -// pub(crate) range_i: Arc, -// } - -// #[lamellar_impl::rt_am_local] -// impl LamellarAm for ForEachChunk -// where -// I: DistributedIterator + 'static, -// F: Fn(I::Item) + SyncSend + 'static, -// { -// async fn exec(&self) { -// // println!("in for each {:?} {:?}", self.start_i, self.end_i); -// let mut range_i = self.range_i.fetch_add(1, Ordering::Relaxed); -// while range_i < self.ranges.len() { -// let (start_i, end_i) = self.ranges[range_i]; -// // println!("in for each {:?} {:?} {:?}", range_i, start_i, end_i); -// let mut iter = self.data.init(start_i, end_i - start_i); -// while let Some(item) = iter.next() { -// (self.op)(item); -// } -// range_i = self.range_i.fetch_add(1, Ordering::Relaxed); -// } -// // println!("done in for each"); -// } -// } - -// #[lamellar_impl::AmLocalDataRT(Clone, Debug)] -// pub(crate) struct ForEachWorkStealing -// where -// I: DistributedIterator, -// F: Fn(I::Item), -// { -// pub(crate) op: F, -// pub(crate) data: I, -// pub(crate) range: IterWorkStealer, -// // pub(crate) ranges: Vec<(usize, usize)>, -// // pub(crate) range_i: Arc, -// pub(crate) siblings: Vec, -// } -// #[lamellar_impl::rt_am_local] -// impl LamellarAm for ForEachWorkStealing -// where -// I: DistributedIterator + 'static, -// F: Fn(I::Item) + SyncSend + 'static, -// { -// async fn exec(&self) { -// let (start, end) = *self.range.range.lock(); -// // println!("{:?} ForEachWorkStealing {:?} {:?}",std::thread::current().id(), start, end); -// let mut iter = self.data.init(start, end - start); -// while self.range.next().is_some() { -// if let Some(elem) = iter.next() { -// (&self.op)(elem); -// } else { -// 
self.range.set_done(); -// } -// } -// // println!("{:?} ForEachWorkStealing done with my range",std::thread::current().id()); -// let mut rng = thread_rng(); -// let mut workers = (0..self.siblings.len()).collect::>(); -// workers.shuffle(&mut rng); -// while let Some(worker) = workers.pop() { -// // println!("{:?} ForEachWorkStealing stealing from sibling",std::thread::current().id()); -// if let Some((start, end)) = self.siblings[worker].steal() { -// let mut iter = self.data.init(start, end - start); -// self.range.set_range(start, end); -// while self.range.next().is_some() { -// if let Some(elem) = iter.next() { -// (&self.op)(elem); -// } else { -// self.range.set_done(); -// } -// } -// workers = (0..self.siblings.len()).collect::>(); -// workers.shuffle(&mut rng); -// } -// } -// // println!("{:?} ForEachWorkStealing done",std::thread::current().id()); -// } -// } - -//-------------------------async for each------------------------------- - -// #[lamellar_impl::AmLocalDataRT(Clone)] -// pub(crate) struct ForEachAsyncStatic -// where -// I: DistributedIterator, -// F: Fn(I::Item) -> Fut + SyncSend + Clone, -// Fut: Future + Send, -// { -// pub(crate) op: F, -// pub(crate) data: I, -// pub(crate) start_i: usize, -// pub(crate) end_i: usize, -// } - -// impl std::fmt::Debug for ForEachAsyncStatic -// where -// I: DistributedIterator, -// F: Fn(I::Item) -> Fut + SyncSend + Clone, -// Fut: Future + Send, -// { -// fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { -// write!( -// f, -// "ForEachAsync {{ start_i: {:?}, end_i: {:?} }}", -// self.start_i, self.end_i -// ) -// } -// } - -// #[lamellar_impl::rt_am_local] -// impl LamellarAm for ForEachAsyncStatic -// where -// I: DistributedIterator + 'static, -// F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, -// Fut: Future + Send + 'static, -// { -// async fn exec(&self) { -// let mut iter = self.data.init(self.start_i, self.end_i - self.start_i); -// while let Some(elem) = iter.next() { -// (&self.op)(elem).await; -// } -// } -// } - -// #[lamellar_impl::AmLocalDataRT(Clone, Debug)] -// pub(crate) struct ForEachAsyncDynamic -// where -// I: DistributedIterator, -// F: Fn(I::Item) -> Fut + SyncSend + Clone, -// Fut: Future + Send, -// { -// pub(crate) op: F, -// pub(crate) data: I, -// pub(crate) cur_i: Arc, -// pub(crate) max_i: usize, -// } - -// #[lamellar_impl::rt_am_local] -// impl LamellarAm for ForEachAsyncDynamic -// where -// I: DistributedIterator + 'static, -// F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, -// Fut: Future + Send + 'static, -// { -// async fn exec(&self) { -// // println!("in for each {:?} {:?}", self.start_i, self.end_i); -// let mut cur_i = self.cur_i.fetch_add(1, Ordering::Relaxed); - -// while cur_i < self.max_i { -// // println!("in for each {:?} {:?} {:?}", range_i, start_i, end_i); -// let mut iter = self.data.init(cur_i, 1); -// while let Some(item) = iter.next() { -// (self.op)(item).await; -// } -// cur_i = self.cur_i.fetch_add(1, Ordering::Relaxed); -// } -// // println!("done in for each"); -// } -// } - -// #[lamellar_impl::AmLocalDataRT(Clone, Debug)] -// pub(crate) struct ForEachAsyncChunk -// where -// I: DistributedIterator, -// F: Fn(I::Item) -> Fut + SyncSend + Clone, -// Fut: Future + Send, -// { -// pub(crate) op: F, -// pub(crate) data: I, -// pub(crate) ranges: Vec<(usize, usize)>, -// pub(crate) range_i: Arc, -// } - -// #[lamellar_impl::rt_am_local] -// impl LamellarAm for ForEachAsyncChunk -// where -// I: DistributedIterator + 'static, -// F: Fn(I::Item) 
-> Fut + SyncSend + Clone + 'static, -// Fut: Future + Send + 'static, -// { -// async fn exec(&self) { -// // println!("in for each {:?} {:?}", self.start_i, self.end_i); -// let mut range_i = self.range_i.fetch_add(1, Ordering::Relaxed); -// while range_i < self.ranges.len() { -// let (start_i, end_i) = self.ranges[range_i]; -// // println!("in for each {:?} {:?} {:?}", range_i, start_i, end_i); -// let mut iter = self.data.init(start_i, end_i - start_i); -// while let Some(item) = iter.next() { -// (self.op)(item).await; -// } -// range_i = self.range_i.fetch_add(1, Ordering::Relaxed); -// } -// // println!("done in for each"); -// } -// } - -// #[lamellar_impl::AmLocalDataRT(Clone, Debug)] -// pub(crate) struct ForEachAsyncWorkStealing -// where -// I: DistributedIterator, -// F: Fn(I::Item) -> Fut + SyncSend + Clone, -// Fut: Future + Send, -// { -// pub(crate) op: F, -// pub(crate) data: I, -// pub(crate) range: IterWorkStealer, -// pub(crate) siblings: Vec, -// } -// #[lamellar_impl::rt_am_local] -// impl LamellarAm for ForEachAsyncWorkStealing -// where -// I: DistributedIterator + 'static, -// F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, -// Fut: Future + Send + 'static, -// { -// async fn exec(&self) { -// // println!("in for each {:?} {:?}", self.start_i, self.end_i); -// let (start, end) = *self.range.range.lock(); -// let mut iter = self.data.init(start, end - start); -// while self.range.next().is_some() { -// if let Some(elem) = iter.next() { -// (&self.op)(elem); -// } -// } -// // let mut rng = thread_rng().gen(); -// let mut workers = (0..self.siblings.len()).collect::>(); -// workers.shuffle(&mut thread_rng()); -// while let Some(worker) = workers.pop() { -// if let Some((start, end)) = self.siblings[worker].steal() { -// let mut iter = self.data.init(start, end - start); -// self.range.set_range(start, end); -// while self.range.next().is_some() { -// if let Some(elem) = iter.next() { -// (&self.op)(elem).await; -// } -// } -// workers = (0..self.siblings.len()).collect::>(); -// workers.shuffle(&mut thread_rng()); -// } -// } -// // println!("done in for each"); -// } -// } diff --git a/src/array/iterator/distributed_iterator/consumer/reduce.rs b/src/array/iterator/distributed_iterator/consumer/reduce.rs index 7715c79f..81650db0 100644 --- a/src/array/iterator/distributed_iterator/consumer/reduce.rs +++ b/src/array/iterator/distributed_iterator/consumer/reduce.rs @@ -3,7 +3,9 @@ use crate::array::iterator::consumer::*; use crate::array::iterator::distributed_iterator::DistributedIterator; use crate::array::iterator::one_sided_iterator::OneSidedIterator; use crate::array::iterator::private::*; +use crate::array::r#unsafe::private::UnsafeArrayInner; use crate::array::{ArrayOps, Distribution, UnsafeArray}; +use crate::barrier::BarrierHandle; use crate::lamellar_request::LamellarRequest; use crate::lamellar_task_group::TaskGroupLocalAmHandle; use crate::lamellar_team::LamellarTeamRT; @@ -12,6 +14,7 @@ use crate::Dist; use futures_util::{ready, Future, StreamExt}; use pin_project::pin_project; use std::collections::VecDeque; +use std::marker::PhantomData; use std::pin::Pin; use std::sync::Arc; use std::task::{Context, Poll, Waker}; @@ -40,7 +43,7 @@ where type AmOutput = Option; type Output = Option; type Item = I::Item; - type Handle = DistIterReduceHandle; + type Handle = InnerDistIterReduceHandle; fn init(&self, start: usize, cnt: usize) -> Self { Reduce { iter: self.iter.init(start, cnt), @@ -62,11 +65,11 @@ where team: Pin>, reqs: VecDeque>, ) -> Self::Handle { 
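+        // package the per-thread request handles, the reduce op, and the team into the
+        // inner handle; the public DistIterReduceHandle below polls it only after the
+        // team-wide barrier has completed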
- DistIterReduceHandle { + InnerDistIterReduceHandle { op: self.op, reqs, team, - state: State::ReqsPending(None), + state: InnerState::ReqsPending(None), } } fn max_elems(&self, in_elems: usize) -> usize { @@ -74,21 +77,108 @@ where } } +// #[derive(Debug)] +// pub(crate) struct ReduceAsync +// where +// I: DistributedIterator + 'static, +// I::Item: Future + Send + 'static, +// T: Dist + ArrayOps, +// F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, +// { +// pub(crate) iter: I, +// pub(crate) op: F, +// // pub(crate) _phantom: PhantomData, +// } + +// impl IterClone for ReduceAsync +// where +// I: DistributedIterator + 'static, +// I::Item: Future + SyncSend + 'static, +// T: Dist + Send + ArrayOps, +// F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, +// { +// fn iter_clone(&self, _: Sealed) -> Self { +// ReduceAsync { +// iter: self.iter.iter_clone(Sealed), +// op: self.op.clone(), +// } +// } +// } + +// impl IterConsumer for ReduceAsync +// where +// I: DistributedIterator + 'static, +// I::Item: Future + Send + 'static, +// T: Dist + Send + ArrayOps, +// F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, +// { +// type AmOutput = Option; +// type Output = Option; +// type Item = I::Item; +// type Handle = InnerDistIterReduceHandle; +// fn init(&self, start: usize, cnt: usize) -> Self { +// ReduceAsync { +// iter: self.iter.init(start, cnt), +// op: self.op.clone(), +// } +// } +// fn next(&mut self) -> Option { +// self.iter.next() +// } +// fn into_am(&self, schedule: IterSchedule) -> LamellarArcLocalAm { +// Arc::new(ReduceAsyncAm { +// iter: self.iter_clone(Sealed), +// op: self.op.clone(), +// schedule, +// }) +// } +// fn create_handle( +// self, +// team: Pin>, +// reqs: VecDeque>, +// ) -> Self::Handle { +// InnerDistIterReduceHandle { +// op: self.op, +// reqs, +// team, +// state: InnerState::ReqsPending(None), +// } +// } +// fn max_elems(&self, in_elems: usize) -> usize { +// self.iter.elems(in_elems) +// } +// } + +// impl Clone for ReduceAsync +// where +// I: DistributedIterator + Clone, +// I::Item: Future + Send + 'static, +// T: Dist + Send + ArrayOps, +// F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, +// { +// fn clone(&self) -> Self { +// ReduceAsync { +// iter: self.iter.clone(), +// op: self.op.clone(), +// } +// } +// } + //#[doc(hidden)] #[pin_project] -pub struct DistIterReduceHandle { +pub(crate) struct InnerDistIterReduceHandle { pub(crate) reqs: VecDeque>>, pub(crate) op: F, pub(crate) team: Pin>, - state: State, + state: InnerState, } -enum State { +enum InnerState { ReqsPending(Option), - Reducing(Pin>>>), + Reducing(Pin> + Send + 'static>>), } -impl DistIterReduceHandle +impl InnerDistIterReduceHandle where T: Dist + ArrayOps, F: Fn(T, T) -> T + SyncSend + Clone + 'static, @@ -133,7 +223,7 @@ where } } -impl Future for DistIterReduceHandle +impl Future for InnerDistIterReduceHandle where T: Dist + ArrayOps, F: Fn(T, T) -> T + SyncSend + Clone + 'static, @@ -142,7 +232,7 @@ where fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { let mut this = self.project(); match &mut this.state { - State::ReqsPending(mut val) => { + InnerState::ReqsPending(mut val) => { while let Some(mut req) = this.reqs.pop_front() { if !req.ready_or_set_waker(cx.waker()) { this.reqs.push_front(req); @@ -166,7 +256,7 @@ where match Future::poll(reducing.as_mut(), cx) { Poll::Ready(val) => Poll::Ready(val), Poll::Pending => { - *this.state = State::Reducing(reducing); + *this.state = 
InnerState::Reducing(reducing); Poll::Pending } } @@ -174,7 +264,7 @@ where Poll::Ready(None) } } - State::Reducing(reducing) => { + InnerState::Reducing(reducing) => { let val = ready!(Future::poll(reducing.as_mut(), cx)); Poll::Ready(val) } @@ -183,7 +273,7 @@ where } //#[doc(hidden)] -impl LamellarRequest for DistIterReduceHandle +impl LamellarRequest for InnerDistIterReduceHandle where T: Dist + ArrayOps, F: Fn(T, T) -> T + SyncSend + Clone + 'static, @@ -224,6 +314,104 @@ where } } +#[pin_project] +pub struct DistIterReduceHandle { + // pub(crate) reqs: VecDeque>, + team: Pin>, + #[pin] + state: State, +} + +impl DistIterReduceHandle +// where +// T: Dist + Send + ArrayOps, +// F: Fn(T, T) -> T + SyncSend + Clone + 'static, +{ + pub(crate) fn new( + barrier: BarrierHandle, + reqs: Pin> + Send>>, + array: &UnsafeArrayInner, + ) -> Self { + Self { + team: array.data.team.clone(), + state: State::Barrier(barrier, reqs), + } + } +} + +#[pin_project(project = StateProj)] +enum State { + Barrier( + #[pin] BarrierHandle, + Pin> + Send>>, + ), + Reqs(#[pin] InnerDistIterReduceHandle), +} +impl Future for DistIterReduceHandle +where + T: Dist + ArrayOps, + F: Fn(T, T) -> T + SyncSend + Clone + 'static, +{ + type Output = Option; + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let mut this = self.project(); + match this.state.as_mut().project() { + StateProj::Barrier(barrier, inner) => { + ready!(barrier.poll(cx)); + let mut inner = ready!(Future::poll(inner.as_mut(), cx)); + match Pin::new(&mut inner).poll(cx) { + Poll::Ready(val) => Poll::Ready(val), + Poll::Pending => { + *this.state = State::Reqs(inner); + Poll::Pending + } + } + } + StateProj::Reqs(inner) => { + let val = ready!(inner.poll(cx)); + Poll::Ready(val) + } + } + } +} + +//#[doc(hidden)] +impl LamellarRequest for DistIterReduceHandle +where + T: Dist + ArrayOps, + F: Fn(T, T) -> T + SyncSend + Clone + 'static, +{ + fn blocking_wait(self) -> Self::Output { + match self.state { + State::Barrier(barrier, reqs) => { + barrier.blocking_wait(); + self.team.block_on(reqs).blocking_wait() + } + State::Reqs(inner) => inner.blocking_wait(), + } + } + fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { + match &mut self.state { + State::Barrier(barrier, _) => { + if !barrier.ready_or_set_waker(waker) { + return false; + } + waker.wake_by_ref(); + false + } + State::Reqs(inner) => inner.ready_or_set_waker(waker), + } + } + fn val(&self) -> Self::Output { + match &self.state { + State::Barrier(_barrier, _reqs) => { + unreachable!("should never be in barrier state when val is called"); + } + State::Reqs(inner) => inner.val(), + } + } +} + #[lamellar_impl::AmLocalDataRT(Clone)] pub(crate) struct ReduceAm { pub(crate) op: F, @@ -261,3 +449,54 @@ where } } } + +// #[lamellar_impl::AmLocalDataRT(Clone)] +// pub(crate) struct ReduceAsyncAm +// where +// I: DistributedIterator + 'static, +// I::Item: Future + Send + 'static, +// T: Dist + ArrayOps, +// F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, +// { +// pub(crate) op: F, +// pub(crate) iter: ReduceAsync, +// pub(crate) schedule: IterSchedule, +// } + +// impl IterClone for ReduceAsyncAm +// where +// I: DistributedIterator + 'static, +// I::Item: Future + Send + 'static, +// T: Dist + ArrayOps, +// F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, +// { +// fn iter_clone(&self, _: Sealed) -> Self { +// ReduceAsyncAm { +// op: self.op.clone(), +// iter: self.iter.iter_clone(Sealed), +// schedule: self.schedule.clone(), +// } +// } +// } + 
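+// A minimal consumption sketch for the barrier-then-requests handles in this module
+// (illustrative only; assumes a `world` handle and an `array` whose distributed
+// iterator yields plain `usize` values):
+//
+//     let handle = array.dist_iter().reduce(|a, b| a + b);
+//     // awaiting the handle completes the team barrier first, then the requests
+//     let total = world.block_on(handle); // Option<usize>: None if nothing was reduced
+//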
+// #[lamellar_impl::rt_am_local] +// impl LamellarAm for ReduceAsyncAm +// where +// I: DistributedIterator + 'static, +// I::Item: Future + Send + 'static, +// T: Dist + Send + ArrayOps, +// F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, +// { +// async fn exec(&self) -> Option { +// let mut iter = self.schedule.init_iter(self.iter.iter_clone(Sealed)); +// match iter.next() { +// Some(mut accum) => { +// while let Some(elem) = iter.next() { +// accum = (self.op)(accum, elem); +// } +// Some(accum) +// } +// None => None, +// } +// } +// } diff --git a/src/array/iterator/distributed_iterator/consumer/sum.rs b/src/array/iterator/distributed_iterator/consumer/sum.rs index affe38e2..0c39ca4b 100644 --- a/src/array/iterator/distributed_iterator/consumer/sum.rs +++ b/src/array/iterator/distributed_iterator/consumer/sum.rs @@ -2,7 +2,9 @@ use crate::active_messaging::LamellarArcLocalAm; use crate::array::iterator::consumer::*; use crate::array::iterator::distributed_iterator::DistributedIterator; use crate::array::iterator::private::*; +use crate::array::r#unsafe::private::UnsafeArrayInner; use crate::array::{ArrayOps, Distribution, UnsafeArray}; +use crate::barrier::BarrierHandle; use crate::lamellar_request::LamellarRequest; use crate::lamellar_task_group::TaskGroupLocalAmHandle; use crate::lamellar_team::LamellarTeamRT; @@ -35,7 +37,7 @@ where type AmOutput = I::Item; type Output = I::Item; type Item = I::Item; - type Handle = DistIterSumHandle; + type Handle = InnerDistIterSumHandle; fn init(&self, start: usize, cnt: usize) -> Self { Sum { iter: self.iter.init(start, cnt), @@ -55,10 +57,10 @@ where team: Pin>, reqs: VecDeque>, ) -> Self::Handle { - DistIterSumHandle { + InnerDistIterSumHandle { reqs, team, - state: State::ReqsPending(None), + state: InnerState::ReqsPending(None), } } fn max_elems(&self, in_elems: usize) -> usize { @@ -68,18 +70,18 @@ where //#[doc(hidden)] #[pin_project] -pub struct DistIterSumHandle { +pub(crate) struct InnerDistIterSumHandle { pub(crate) reqs: VecDeque>, pub(crate) team: Pin>, - state: State, + state: InnerState, } -enum State { +enum InnerState { ReqsPending(Option), - Summing(Pin>>), + Summing(Pin + Send>>), } -impl DistIterSumHandle +impl InnerDistIterSumHandle where T: Dist + ArrayOps + std::iter::Sum, { @@ -116,7 +118,7 @@ where } } -impl Future for DistIterSumHandle +impl Future for InnerDistIterSumHandle where T: Dist + ArrayOps + std::iter::Sum, { @@ -124,7 +126,7 @@ where fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { let mut this = self.project(); match &mut this.state { - State::ReqsPending(local_sum) => { + InnerState::ReqsPending(local_sum) => { while let Some(mut req) = this.reqs.pop_front() { if !req.ready_or_set_waker(cx.waker()) { this.reqs.push_front(req); @@ -146,12 +148,12 @@ where match Future::poll(sum.as_mut(), cx) { Poll::Ready(local_sum) => Poll::Ready(local_sum), Poll::Pending => { - *this.state = State::Summing(sum); + *this.state = InnerState::Summing(sum); Poll::Pending } } } - State::Summing(sum) => { + InnerState::Summing(sum) => { let local_sum = ready!(Future::poll(sum.as_mut(), cx)); Poll::Ready(local_sum) } @@ -159,7 +161,7 @@ where } } //#[doc(hidden)] -impl LamellarRequest for DistIterSumHandle +impl LamellarRequest for InnerDistIterSumHandle where T: Dist + ArrayOps + std::iter::Sum, { @@ -191,6 +193,100 @@ where } } +#[pin_project] +pub struct DistIterSumHandle { + team: Pin>, + #[pin] + state: State, +} + +impl DistIterSumHandle +where + T: Dist + ArrayOps + std::iter::Sum, +{ + 
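// the handle starts in the Barrier state, holding the team-wide barrier and the
+    // not-yet-run launch future; polling or blocking on it completes the barrier first
+    // and only then drives the per-thread summation requests
+    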
pub(crate) fn new( + barrier_handle: BarrierHandle, + inner: Pin> + Send>>, + array: &UnsafeArrayInner, + ) -> Self { + Self { + team: array.data.team.clone(), + state: State::Barrier(barrier_handle, inner), + } + } +} + +#[pin_project(project = StateProj)] +enum State { + Barrier( + #[pin] BarrierHandle, + Pin> + Send>>, + ), + Reqs(#[pin] InnerDistIterSumHandle), +} +impl Future for DistIterSumHandle +where + T: Dist + ArrayOps + std::iter::Sum, +{ + type Output = T; + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let mut this = self.project(); + match this.state.as_mut().project() { + StateProj::Barrier(barrier, inner) => { + ready!(barrier.poll(cx)); + let mut inner = ready!(Future::poll(inner.as_mut(), cx)); + match Pin::new(&mut inner).poll(cx) { + Poll::Ready(val) => Poll::Ready(val), + Poll::Pending => { + *this.state = State::Reqs(inner); + Poll::Pending + } + } + } + StateProj::Reqs(inner) => { + let val = ready!(inner.poll(cx)); + Poll::Ready(val) + } + } + } +} + +//#[doc(hidden)] +impl LamellarRequest for DistIterSumHandle +where + T: Dist + ArrayOps + std::iter::Sum, +{ + fn blocking_wait(self) -> Self::Output { + match self.state { + State::Barrier(barrier, reqs) => { + barrier.blocking_wait(); + self.team.block_on(reqs).blocking_wait() + } + State::Reqs(inner) => inner.blocking_wait(), + } + } + fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { + match &mut self.state { + State::Barrier(barrier, _) => { + if !barrier.ready_or_set_waker(waker) { + return false; + } + waker.wake_by_ref(); + false + } + State::Reqs(inner) => inner.ready_or_set_waker(waker), + } + } + fn val(&self) -> Self::Output { + match &self.state { + State::Barrier(_barrier, _reqs) => { + unreachable!("should never be in barrier state when val is called"); + } + State::Reqs(inner) => inner.val(), + } + } +} + #[lamellar_impl::AmLocalDataRT(Clone)] pub(crate) struct SumAm { pub(crate) iter: Sum, diff --git a/src/array/iterator/local_iterator.rs b/src/array/iterator/local_iterator.rs index cf389e92..d8dfabe8 100644 --- a/src/array/iterator/local_iterator.rs +++ b/src/array/iterator/local_iterator.rs @@ -35,7 +35,9 @@ use zip::*; pub(crate) use consumer::*; use crate::array::iterator::{private::*, Schedule}; -use crate::array::{operations::ArrayOps, AsyncTeamFrom, AtomicArray, Distribution, LamellarArray}; +use crate::array::{ + operations::ArrayOps, AsyncTeamFrom, AtomicArray, Distribution, InnerArray, LamellarArray, +}; use crate::memregion::Dist; use crate::LamellarTeamRT; @@ -49,11 +51,14 @@ use std::sync::Arc; #[doc(hidden)] #[enum_dispatch] -pub trait LocalIteratorLauncher { +pub trait LocalIteratorLauncher: InnerArray { fn for_each(&self, iter: &I, op: F) -> LocalIterForEachHandle where I: LocalIterator + 'static, - F: Fn(I::Item) + SyncSend + Clone + 'static; + F: Fn(I::Item) + SyncSend + Clone + 'static, + { + self.as_inner().for_each(iter, op) + } fn for_each_with_schedule( &self, @@ -63,13 +68,19 @@ pub trait LocalIteratorLauncher { ) -> LocalIterForEachHandle where I: LocalIterator + 'static, - F: Fn(I::Item) + SyncSend + Clone + 'static; + F: Fn(I::Item) + SyncSend + Clone + 'static, + { + self.as_inner().for_each_with_schedule(sched, iter, op) + } fn for_each_async(&self, iter: &I, op: F) -> LocalIterForEachHandle where I: LocalIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, - Fut: Future + Send + 'static; + Fut: Future + Send + 'static, + { + self.as_inner().for_each_async(iter, op) + } fn for_each_async_with_schedule( &self, @@ -80,13 +91,20 
@@ pub trait LocalIteratorLauncher { where I: LocalIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, - Fut: Future + Send + 'static; + Fut: Future + Send + 'static, + { + self.as_inner() + .for_each_async_with_schedule(sched, iter, op) + } fn reduce(&self, iter: &I, op: F) -> LocalIterReduceHandle where I: LocalIterator + 'static, I::Item: SyncSend, - F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static; + F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, + { + self.as_inner().reduce(iter, op) + } fn reduce_with_schedule( &self, @@ -97,13 +115,19 @@ pub trait LocalIteratorLauncher { where I: LocalIterator + 'static, I::Item: SyncSend, - F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static; + F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, + { + self.as_inner().reduce_with_schedule(sched, iter, op) + } fn collect(&self, iter: &I, d: Distribution) -> LocalIterCollectHandle where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static; + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, + { + self.as_inner().collect(iter, d) + } fn collect_with_schedule( &self, @@ -114,34 +138,71 @@ pub trait LocalIteratorLauncher { where I: LocalIterator + 'static, I::Item: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static; + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, + { + self.as_inner().collect_with_schedule(sched, iter, d) + } fn count(&self, iter: &I) -> LocalIterCountHandle where - I: LocalIterator + 'static; + I: LocalIterator + 'static, + { + self.as_inner().count(iter) + } fn count_with_schedule(&self, sched: Schedule, iter: &I) -> LocalIterCountHandle where - I: LocalIterator + 'static; + I: LocalIterator + 'static, + { + self.as_inner().count_with_schedule(sched, iter) + } fn sum(&self, iter: &I) -> LocalIterSumHandle where I: LocalIterator + 'static, - I::Item: SyncSend + std::iter::Sum; + I::Item: SyncSend + std::iter::Sum, + { + self.as_inner().sum(iter) + } fn sum_with_schedule(&self, sched: Schedule, iter: &I) -> LocalIterSumHandle where I: LocalIterator + 'static, - I::Item: SyncSend + std::iter::Sum; + I::Item: SyncSend + std::iter::Sum, + { + self.as_inner().sum_with_schedule(sched, iter) + } //#[doc(hidden)] - fn local_global_index_from_local(&self, index: usize, chunk_size: usize) -> Option; + fn local_global_index_from_local(&self, index: usize, chunk_size: usize) -> Option { + if chunk_size == 1 { + self.as_inner().global_index_from_local(index) + } else { + Some( + self.as_inner() + .global_index_from_local(index * chunk_size)? + / chunk_size, + ) + } + } //#[doc(hidden)] - fn local_subarray_index_from_local(&self, index: usize, chunk_size: usize) -> Option; + fn local_subarray_index_from_local(&self, index: usize, chunk_size: usize) -> Option { + if chunk_size == 1 { + self.as_inner().subarray_index_from_local(index) + } else { + Some( + self.as_inner() + .subarray_index_from_local(index * chunk_size)? 
+ / chunk_size, + ) + } + } //#[doc(hidden)] - fn team(&self) -> Pin>; + fn team(&self) -> Pin> { + self.as_inner().team() + } } /// An interface for dealing with parallel local iterators (intended as the Lamellar version of the Rayon ParellelIterator trait) diff --git a/src/array/local_lock_atomic/iteration.rs b/src/array/local_lock_atomic/iteration.rs index 9f41303d..1ad958f5 100644 --- a/src/array/local_lock_atomic/iteration.rs +++ b/src/array/local_lock_atomic/iteration.rs @@ -1,9 +1,7 @@ use crate::array::iterator::distributed_iterator::*; use crate::array::iterator::local_iterator::*; use crate::array::iterator::one_sided_iterator::OneSidedIter; -use crate::array::iterator::{ - private::*, LamellarArrayIterators, LamellarArrayMutIterators, Schedule, -}; +use crate::array::iterator::{private::*, LamellarArrayIterators, LamellarArrayMutIterators}; use crate::array::local_lock_atomic::*; use crate::array::private::LamellarArrayPrivate; use crate::array::r#unsafe::private::UnsafeArrayInner; @@ -428,342 +426,5 @@ impl LamellarArrayMutIterators for LocalLockArray { } impl DistIteratorLauncher for LocalLockArray {} -// // type Inner = Self; -// fn global_index_from_local(&self, index: usize, chunk_size: usize) -> Option { -// self.array.global_index_from_local(index, chunk_size) -// } - -// fn subarray_index_from_local(&self, index: usize, chunk_size: usize) -> Option { -// self.array.subarray_index_from_local(index, chunk_size) -// } - -// // fn subarray_pe_and_offset_for_global_index(&self, index: usize, chunk_size: usize) -> Option<(usize,usize)> { -// // self.array.subarray_pe_and_offset_for_global_index(index, chunk_size) -// // } - -// fn for_each(&self, iter: &I, op: F) -> DistIterForEachHandle -// where -// I: DistributedIterator + 'static, -// F: Fn(I::Item) + SyncSend + Clone + 'static, -// { -// DistIteratorLauncher::for_each(&self.array, iter, op) -// } -// fn for_each_with_schedule( -// &self, -// sched: Schedule, -// iter: &I, -// op: F, -// ) -> DistIterForEachHandle -// where -// I: DistributedIterator + 'static, -// F: Fn(I::Item) + SyncSend + Clone + 'static, -// { -// DistIteratorLauncher::for_each_with_schedule(&self.array, sched, iter, op) -// } -// fn for_each_async(&self, iter: &I, op: F) -> DistIterForEachHandle -// where -// I: DistributedIterator + 'static, -// F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, -// Fut: Future + Send + 'static, -// { -// DistIteratorLauncher::for_each_async(&self.array, iter, op) -// } -// fn for_each_async_with_schedule( -// &self, -// sched: Schedule, -// iter: &I, -// op: F, -// ) -> DistIterForEachHandle -// where -// I: DistributedIterator + 'static, -// F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, -// Fut: Future + Send + 'static, -// { -// DistIteratorLauncher::for_each_async_with_schedule(&self.array, sched, iter, op) -// } - -// fn reduce(&self, iter: &I, op: F) -> DistIterReduceHandle -// where -// I: DistributedIterator + 'static, -// I::Item: Dist + ArrayOps, -// F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, -// { -// DistIteratorLauncher::reduce(&self.array, iter, op) -// } - -// fn reduce_with_schedule( -// &self, -// sched: Schedule, -// iter: &I, -// op: F, -// ) -> DistIterReduceHandle -// where -// I: DistributedIterator + 'static, -// I::Item: Dist + ArrayOps, -// F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, -// { -// DistIteratorLauncher::reduce_with_schedule(&self.array, sched, iter, op) -// } - -// fn collect(&self, iter: &I, d: Distribution) -> 
DistIterCollectHandle -// where -// I: DistributedIterator + 'static, -// I::Item: Dist + ArrayOps, -// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, -// { -// DistIteratorLauncher::collect(&self.array, iter, d) -// } - -// fn collect_with_schedule( -// &self, -// sched: Schedule, -// iter: &I, -// d: Distribution, -// ) -> DistIterCollectHandle -// where -// I: DistributedIterator + 'static, -// I::Item: Dist + ArrayOps, -// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, -// { -// DistIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) -// } -// fn collect_async(&self, iter: &I, d: Distribution) -> DistIterCollectHandle -// where -// I: DistributedIterator, -// I::Item: Future + Send + 'static, -// B: Dist + ArrayOps, -// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, -// { -// DistIteratorLauncher::collect_async(&self.array, iter, d) -// } - -// fn collect_async_with_schedule( -// &self, -// sched: Schedule, -// iter: &I, -// d: Distribution, -// ) -> DistIterCollectHandle -// where -// I: DistributedIterator, -// I::Item: Future + Send + 'static, -// B: Dist + ArrayOps, -// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, -// { -// DistIteratorLauncher::collect_async_with_schedule(&self.array, sched, iter, d) -// } - -// fn count(&self, iter: &I) -> DistIterCountHandle -// where -// I: DistributedIterator + 'static, -// { -// DistIteratorLauncher::count(&self.array, iter) -// } - -// fn count_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterCountHandle -// where -// I: DistributedIterator + 'static, -// { -// DistIteratorLauncher::count_with_schedule(&self.array, sched, iter) -// } - -// fn sum(&self, iter: &I) -> DistIterSumHandle -// where -// I: DistributedIterator + 'static, -// I::Item: Dist + ArrayOps + std::iter::Sum, -// { -// DistIteratorLauncher::sum(&self.array, iter) -// } - -// fn sum_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterSumHandle -// where -// I: DistributedIterator + 'static, -// I::Item: Dist + ArrayOps + std::iter::Sum, -// { -// DistIteratorLauncher::sum_with_schedule(&self.array, sched, iter) -// } - -// fn team(&self) -> Pin> { -// self.array.team_rt().clone() -// } -// } - -impl LocalIteratorLauncher for LocalLockArray { - fn local_global_index_from_local(&self, index: usize, chunk_size: usize) -> Option { - self.array.local_global_index_from_local(index, chunk_size) - } - - fn local_subarray_index_from_local(&self, index: usize, chunk_size: usize) -> Option { - self.array - .local_subarray_index_from_local(index, chunk_size) - } - - fn for_each(&self, iter: &I, op: F) -> LocalIterForEachHandle - where - I: LocalIterator + 'static, - F: Fn(I::Item) + SyncSend + Clone + 'static, - { - LocalIteratorLauncher::for_each(&self.array, iter, op) - } - fn for_each_with_schedule( - &self, - sched: Schedule, - iter: &I, - op: F, - ) -> LocalIterForEachHandle - where - I: LocalIterator + 'static, - F: Fn(I::Item) + SyncSend + Clone + 'static, - { - LocalIteratorLauncher::for_each_with_schedule(&self.array, sched, iter, op) - } - fn for_each_async(&self, iter: &I, op: F) -> LocalIterForEachHandle - where - I: LocalIterator + 'static, - F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, - Fut: Future + Send + 'static, - { - LocalIteratorLauncher::for_each_async(&self.array, iter, op) - } - fn for_each_async_with_schedule( - &self, - sched: Schedule, - iter: &I, - op: F, - ) -> LocalIterForEachHandle - where - I: LocalIterator + 'static, - F: 
Fn(I::Item) -> Fut + SyncSend + Clone + 'static, - Fut: Future + Send + 'static, - { - LocalIteratorLauncher::for_each_async_with_schedule(&self.array, sched, iter, op) - } - - fn reduce(&self, iter: &I, op: F) -> LocalIterReduceHandle - where - I: LocalIterator + 'static, - I::Item: SyncSend, - F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, - { - LocalIteratorLauncher::reduce(&self.array, iter, op) - } - - fn reduce_with_schedule( - &self, - sched: Schedule, - iter: &I, - op: F, - ) -> LocalIterReduceHandle - where - I: LocalIterator + 'static, - I::Item: SyncSend, - F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, - { - LocalIteratorLauncher::reduce_with_schedule(&self.array, sched, iter, op) - } - - // fn reduce_async(&self, iter: &I, op: F) -> Pin> + Send>> - // where - // I: LocalIterator + 'static, - // I::Item: SyncSend, - // F: Fn(I::Item, I::Item) -> Fut + SyncSend + Clone + 'static, - // Fut: Future + SyncSend + Clone + 'static - // { - // self.array.reduce_async(iter, op) - // } - - // fn reduce_async_with_schedule(&self, sched: Schedule, iter: &I, op: F) -> Pin> + Send>> - // where - // I: LocalIterator + 'static, - // I::Item: SyncSend, - // F: Fn(I::Item, I::Item) -> Fut + SyncSend + Clone + 'static, - // Fut: Future + SyncSend + Clone + 'static - // { - // self.array.reduce_async_with_schedule(sched, iter, op) - // } - - fn collect(&self, iter: &I, d: Distribution) -> LocalIterCollectHandle - where - I: LocalIterator + 'static, - I::Item: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - LocalIteratorLauncher::collect(&self.array, iter, d) - } - - fn collect_with_schedule( - &self, - sched: Schedule, - iter: &I, - d: Distribution, - ) -> LocalIterCollectHandle - where - I: LocalIterator + 'static, - I::Item: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - LocalIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) - } - - // fn collect_async( - // &self, - // iter: &I, - // d: Distribution, - // ) -> Pin + Send>> - // where - // I: LocalIterator + 'static, - // I::Item: Future + Send + 'static, - // B: Dist + ArrayOps, - // A: From> + SyncSend + Clone + 'static, - // { - // self.array.collect_async(iter, d) - // } - - // fn collect_async_with_schedule( - // &self, - // sched: Schedule, - // iter: &I, - // d: Distribution, - // ) -> Pin + Send>> - // where - // I: LocalIterator + 'static, - // I::Item: Future + Send + 'static, - // B: Dist + ArrayOps, - // A: From> + SyncSend + Clone + 'static, - // { - // self.array.collect_async_with_schedule(sched, iter, d) - // } - - fn count(&self, iter: &I) -> LocalIterCountHandle - where - I: LocalIterator + 'static, - { - LocalIteratorLauncher::count(&self.array, iter) - } - - fn count_with_schedule(&self, sched: Schedule, iter: &I) -> LocalIterCountHandle - where - I: LocalIterator + 'static, - { - LocalIteratorLauncher::count_with_schedule(&self.array, sched, iter) - } - - fn sum(&self, iter: &I) -> LocalIterSumHandle - where - I: LocalIterator + 'static, - I::Item: SyncSend + std::iter::Sum, - { - LocalIteratorLauncher::sum(&self.array, iter) - } - - fn sum_with_schedule(&self, sched: Schedule, iter: &I) -> LocalIterSumHandle - where - I: LocalIterator + 'static, - I::Item: SyncSend + std::iter::Sum, - { - LocalIteratorLauncher::sum_with_schedule(&self.array, sched, iter) - } - - fn team(&self) -> Pin> { - self.array.team_rt().clone() - } -} + +impl LocalIteratorLauncher for LocalLockArray {} diff 
--git a/src/array/native_atomic/iteration.rs b/src/array/native_atomic/iteration.rs index 9411b110..26306db8 100644 --- a/src/array/native_atomic/iteration.rs +++ b/src/array/native_atomic/iteration.rs @@ -3,7 +3,7 @@ use crate::array::iterator::local_iterator::*; use crate::array::iterator::one_sided_iterator::OneSidedIter; use crate::array::iterator::{ private::{IterClone, Sealed}, - LamellarArrayIterators, LamellarArrayMutIterators, Schedule, + LamellarArrayIterators, LamellarArrayMutIterators, }; use crate::array::native_atomic::*; use crate::array::r#unsafe::private::UnsafeArrayInner; @@ -220,342 +220,5 @@ impl LamellarArrayMutIterators for NativeAtomicArray { } impl DistIteratorLauncher for NativeAtomicArray {} -// // type Inner = Self; -// fn global_index_from_local(&self, index: usize, chunk_size: usize) -> Option { -// self.array.global_index_from_local(index, chunk_size) -// } -// fn subarray_index_from_local(&self, index: usize, chunk_size: usize) -> Option { -// self.array.subarray_index_from_local(index, chunk_size) -// } - -// // fn subarray_pe_and_offset_for_global_index(&self, index: usize, chunk_size: usize) -> Option<(usize,usize)> { -// // self.array.subarray_pe_and_offset_for_global_index(index, chunk_size) -// // } - -// fn for_each(&self, iter: &I, op: F) -> DistIterForEachHandle -// where -// I: DistributedIterator + 'static, -// F: Fn(I::Item) + SyncSend + Clone + 'static, -// { -// DistIteratorLauncher::for_each(&self.array, iter, op) -// } -// fn for_each_with_schedule( -// &self, -// sched: Schedule, -// iter: &I, -// op: F, -// ) -> DistIterForEachHandle -// where -// I: DistributedIterator + 'static, -// F: Fn(I::Item) + SyncSend + Clone + 'static, -// { -// DistIteratorLauncher::for_each_with_schedule(&self.array, sched, iter, op) -// } -// fn for_each_async(&self, iter: &I, op: F) -> DistIterForEachHandle -// where -// I: DistributedIterator + 'static, -// F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, -// Fut: Future + Send + 'static, -// { -// DistIteratorLauncher::for_each_async(&self.array, iter, op) -// } -// fn for_each_async_with_schedule( -// &self, -// sched: Schedule, -// iter: &I, -// op: F, -// ) -> DistIterForEachHandle -// where -// I: DistributedIterator + 'static, -// F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, -// Fut: Future + Send + 'static, -// { -// DistIteratorLauncher::for_each_async_with_schedule(&self.array, sched, iter, op) -// } - -// fn reduce(&self, iter: &I, op: F) -> DistIterReduceHandle -// where -// I: DistributedIterator + 'static, -// I::Item: Dist + ArrayOps, -// F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, -// { -// DistIteratorLauncher::reduce(&self.array, iter, op) -// } - -// fn reduce_with_schedule( -// &self, -// sched: Schedule, -// iter: &I, -// op: F, -// ) -> DistIterReduceHandle -// where -// I: DistributedIterator + 'static, -// I::Item: Dist + ArrayOps, -// F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, -// { -// DistIteratorLauncher::reduce_with_schedule(&self.array, sched, iter, op) -// } - -// fn collect(&self, iter: &I, d: Distribution) -> DistIterCollectHandle -// where -// I: DistributedIterator + 'static, -// I::Item: Dist + ArrayOps, -// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, -// { -// DistIteratorLauncher::collect(&self.array, iter, d) -// } - -// fn collect_with_schedule( -// &self, -// sched: Schedule, -// iter: &I, -// d: Distribution, -// ) -> DistIterCollectHandle -// where -// I: DistributedIterator + 'static, -// 
I::Item: Dist + ArrayOps, -// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, -// { -// DistIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) -// } -// fn collect_async(&self, iter: &I, d: Distribution) -> DistIterCollectHandle -// where -// I: DistributedIterator, -// I::Item: Future + Send + 'static, -// B: Dist + ArrayOps, -// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, -// { -// DistIteratorLauncher::collect_async(&self.array, iter, d) -// } - -// fn collect_async_with_schedule( -// &self, -// sched: Schedule, -// iter: &I, -// d: Distribution, -// ) -> DistIterCollectHandle -// where -// I: DistributedIterator, -// I::Item: Future + Send + 'static, -// B: Dist + ArrayOps, -// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, -// { -// DistIteratorLauncher::collect_async_with_schedule(&self.array, sched, iter, d) -// } - -// fn count(&self, iter: &I) -> DistIterCountHandle -// where -// I: DistributedIterator + 'static, -// { -// DistIteratorLauncher::count(&self.array, iter) -// } - -// fn count_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterCountHandle -// where -// I: DistributedIterator + 'static, -// { -// DistIteratorLauncher::count_with_schedule(&self.array, sched, iter) -// } - -// fn sum(&self, iter: &I) -> DistIterSumHandle -// where -// I: DistributedIterator + 'static, -// I::Item: Dist + ArrayOps + std::iter::Sum, -// { -// DistIteratorLauncher::sum(&self.array, iter) -// } - -// fn sum_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterSumHandle -// where -// I: DistributedIterator + 'static, -// I::Item: Dist + ArrayOps + std::iter::Sum, -// { -// DistIteratorLauncher::sum_with_schedule(&self.array, sched, iter) -// } - -// fn team(&self) -> Pin> { -// self.array.team_rt().clone() -// } -// } - -impl LocalIteratorLauncher for NativeAtomicArray { - fn local_global_index_from_local(&self, index: usize, chunk_size: usize) -> Option { - self.array.local_global_index_from_local(index, chunk_size) - } - - fn local_subarray_index_from_local(&self, index: usize, chunk_size: usize) -> Option { - self.array - .local_subarray_index_from_local(index, chunk_size) - } - - fn for_each(&self, iter: &I, op: F) -> LocalIterForEachHandle - where - I: LocalIterator + 'static, - F: Fn(I::Item) + SyncSend + Clone + 'static, - { - LocalIteratorLauncher::for_each(&self.array, iter, op) - } - fn for_each_with_schedule( - &self, - sched: Schedule, - iter: &I, - op: F, - ) -> LocalIterForEachHandle - where - I: LocalIterator + 'static, - F: Fn(I::Item) + SyncSend + Clone + 'static, - { - LocalIteratorLauncher::for_each_with_schedule(&self.array, sched, iter, op) - } - fn for_each_async(&self, iter: &I, op: F) -> LocalIterForEachHandle - where - I: LocalIterator + 'static, - F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, - Fut: Future + Send + 'static, - { - LocalIteratorLauncher::for_each_async(&self.array, iter, op) - } - fn for_each_async_with_schedule( - &self, - sched: Schedule, - iter: &I, - op: F, - ) -> LocalIterForEachHandle - where - I: LocalIterator + 'static, - F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, - Fut: Future + Send + 'static, - { - LocalIteratorLauncher::for_each_async_with_schedule(&self.array, sched, iter, op) - } - - fn reduce(&self, iter: &I, op: F) -> LocalIterReduceHandle - where - I: LocalIterator + 'static, - I::Item: SyncSend, - F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, - { - LocalIteratorLauncher::reduce(&self.array, iter, op) - } - 
- fn reduce_with_schedule( - &self, - sched: Schedule, - iter: &I, - op: F, - ) -> LocalIterReduceHandle - where - I: LocalIterator + 'static, - I::Item: SyncSend, - F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, - { - LocalIteratorLauncher::reduce_with_schedule(&self.array, sched, iter, op) - } - - // fn reduce_async(&self, iter: &I, op: F) -> Pin> + Send>> - // where - // I: LocalIterator + 'static, - // I::Item: SyncSend, - // F: Fn(I::Item, I::Item) -> Fut + SyncSend + Clone + 'static, - // Fut: Future + SyncSend + Clone + 'static - // { - // self.array.reduce_async(iter, op) - // } - - // fn reduce_async_with_schedule(&self, sched: Schedule, iter: &I, op: F) -> Pin> + Send>> - // where - // I: LocalIterator + 'static, - // I::Item: SyncSend, - // F: Fn(I::Item, I::Item) -> Fut + SyncSend + Clone + 'static, - // Fut: Future + SyncSend + Clone + 'static - // { - // self.array.reduce_async_with_schedule(sched, iter, op) - // } - - fn collect(&self, iter: &I, d: Distribution) -> LocalIterCollectHandle - where - I: LocalIterator + 'static, - I::Item: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - LocalIteratorLauncher::collect(&self.array, iter, d) - } - - fn collect_with_schedule( - &self, - sched: Schedule, - iter: &I, - d: Distribution, - ) -> LocalIterCollectHandle - where - I: LocalIterator + 'static, - I::Item: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - LocalIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) - } - - // fn collect_async( - // &self, - // iter: &I, - // d: Distribution, - // ) -> Pin + Send>> - // where - // I: LocalIterator + 'static, - // I::Item: Future + Send + 'static, - // B: Dist + ArrayOps, - // A: From> + SyncSend + Clone + 'static, - // { - // self.array.collect_async(iter, d) - // } - - // fn collect_async_with_schedule( - // &self, - // sched: Schedule, - // iter: &I, - // d: Distribution, - // ) -> Pin + Send>> - // where - // I: LocalIterator + 'static, - // I::Item: Future + Send + 'static, - // B: Dist + ArrayOps, - // A: From> + SyncSend + Clone + 'static, - // { - // self.array.collect_async_with_schedule(sched, iter, d) - // } - - fn count(&self, iter: &I) -> LocalIterCountHandle - where - I: LocalIterator + 'static, - { - LocalIteratorLauncher::count(&self.array, iter) - } - - fn count_with_schedule(&self, sched: Schedule, iter: &I) -> LocalIterCountHandle - where - I: LocalIterator + 'static, - { - LocalIteratorLauncher::count_with_schedule(&self.array, sched, iter) - } - - fn sum(&self, iter: &I) -> LocalIterSumHandle - where - I: LocalIterator + 'static, - I::Item: SyncSend + std::iter::Sum, - { - LocalIteratorLauncher::sum(&self.array, iter) - } - - fn sum_with_schedule(&self, sched: Schedule, iter: &I) -> LocalIterSumHandle - where - I: LocalIterator + 'static, - I::Item: SyncSend + std::iter::Sum, - { - LocalIteratorLauncher::sum_with_schedule(&self.array, sched, iter) - } - - fn team(&self) -> Pin> { - self.array.team_rt().clone() - } -} +impl LocalIteratorLauncher for NativeAtomicArray {} diff --git a/src/array/read_only/iteration.rs b/src/array/read_only/iteration.rs index cbcd3615..5c83f673 100644 --- a/src/array/read_only/iteration.rs +++ b/src/array/read_only/iteration.rs @@ -1,7 +1,7 @@ use crate::array::iterator::distributed_iterator::*; use crate::array::iterator::local_iterator::*; use crate::array::iterator::one_sided_iterator::OneSidedIter; -use crate::array::iterator::{LamellarArrayIterators, 
Schedule}; +use crate::array::iterator::LamellarArrayIterators; use crate::array::r#unsafe::private::UnsafeArrayInner; use crate::array::read_only::*; use crate::array::*; @@ -40,342 +40,4 @@ impl LamellarArrayIterators for ReadOnlyArray { } impl DistIteratorLauncher for ReadOnlyArray {} -// // type Inner = Self; -// fn global_index_from_local(&self, index: usize, chunk_size: usize) -> Option { -// self.array.global_index_from_local(index, chunk_size) -// } - -// fn subarray_index_from_local(&self, index: usize, chunk_size: usize) -> Option { -// self.array.subarray_index_from_local(index, chunk_size) -// } - -// // fn subarray_pe_and_offset_for_global_index(&self, index: usize, chunk_size: usize) -> Option<(usize,usize)> { -// // self.array.subarray_pe_and_offset_for_global_index(index, chunk_size) -// // } - -// fn for_each(&self, iter: &I, op: F) -> DistIterForEachHandle -// where -// I: DistributedIterator + 'static, -// F: Fn(I::Item) + SyncSend + Clone + 'static, -// { -// DistIteratorLauncher::for_each(&self.array, iter, op) -// } -// fn for_each_with_schedule( -// &self, -// sched: Schedule, -// iter: &I, -// op: F, -// ) -> DistIterForEachHandle -// where -// I: DistributedIterator + 'static, -// F: Fn(I::Item) + SyncSend + Clone + 'static, -// { -// DistIteratorLauncher::for_each_with_schedule(&self.array, sched, iter, op) -// } -// fn for_each_async(&self, iter: &I, op: F) -> DistIterForEachHandle -// where -// I: DistributedIterator + 'static, -// F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, -// Fut: Future + Send + 'static, -// { -// DistIteratorLauncher::for_each_async(&self.array, iter, op) -// } -// fn for_each_async_with_schedule( -// &self, -// sched: Schedule, -// iter: &I, -// op: F, -// ) -> DistIterForEachHandle -// where -// I: DistributedIterator + 'static, -// F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, -// Fut: Future + Send + 'static, -// { -// DistIteratorLauncher::for_each_async_with_schedule(&self.array, sched, iter, op) -// } - -// fn reduce(&self, iter: &I, op: F) -> DistIterReduceHandle -// where -// I: DistributedIterator + 'static, -// I::Item: Dist + ArrayOps, -// F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, -// { -// DistIteratorLauncher::reduce(&self.array, iter, op) -// } - -// fn reduce_with_schedule( -// &self, -// sched: Schedule, -// iter: &I, -// op: F, -// ) -> DistIterReduceHandle -// where -// I: DistributedIterator + 'static, -// I::Item: Dist + ArrayOps, -// F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, -// { -// DistIteratorLauncher::reduce_with_schedule(&self.array, sched, iter, op) -// } - -// fn collect(&self, iter: &I, d: Distribution) -> DistIterCollectHandle -// where -// I: DistributedIterator + 'static, -// I::Item: Dist + ArrayOps, -// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, -// { -// DistIteratorLauncher::collect(&self.array, iter, d) -// } - -// fn collect_with_schedule( -// &self, -// sched: Schedule, -// iter: &I, -// d: Distribution, -// ) -> DistIterCollectHandle -// where -// I: DistributedIterator + 'static, -// I::Item: Dist + ArrayOps, -// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, -// { -// DistIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) -// } -// fn collect_async(&self, iter: &I, d: Distribution) -> DistIterCollectHandle -// where -// I: DistributedIterator, -// I::Item: Future + Send + 'static, -// B: Dist + ArrayOps, -// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, 
-// { -// DistIteratorLauncher::collect_async(&self.array, iter, d) -// } - -// fn collect_async_with_schedule( -// &self, -// sched: Schedule, -// iter: &I, -// d: Distribution, -// ) -> DistIterCollectHandle -// where -// I: DistributedIterator, -// I::Item: Future + Send + 'static, -// B: Dist + ArrayOps, -// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, -// { -// DistIteratorLauncher::collect_async_with_schedule(&self.array, sched, iter, d) -// } - -// fn count(&self, iter: &I) -> DistIterCountHandle -// where -// I: DistributedIterator + 'static, -// { -// DistIteratorLauncher::count(&self.array, iter) -// } - -// fn count_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterCountHandle -// where -// I: DistributedIterator + 'static, -// { -// DistIteratorLauncher::count_with_schedule(&self.array, sched, iter) -// } - -// fn sum(&self, iter: &I) -> DistIterSumHandle -// where -// I: DistributedIterator + 'static, -// I::Item: Dist + ArrayOps + std::iter::Sum, -// { -// DistIteratorLauncher::sum(&self.array, iter) -// } - -// fn sum_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterSumHandle -// where -// I: DistributedIterator + 'static, -// I::Item: Dist + ArrayOps + std::iter::Sum, -// { -// DistIteratorLauncher::sum_with_schedule(&self.array, sched, iter) -// } - -// fn team(&self) -> Pin> { -// self.array.team_rt().clone() -// } -// } - -impl LocalIteratorLauncher for ReadOnlyArray { - fn local_global_index_from_local(&self, index: usize, chunk_size: usize) -> Option { - self.array.local_global_index_from_local(index, chunk_size) - } - - fn local_subarray_index_from_local(&self, index: usize, chunk_size: usize) -> Option { - self.array - .local_subarray_index_from_local(index, chunk_size) - } - - fn for_each(&self, iter: &I, op: F) -> LocalIterForEachHandle - where - I: LocalIterator + 'static, - F: Fn(I::Item) + SyncSend + Clone + 'static, - { - LocalIteratorLauncher::for_each(&self.array, iter, op) - } - fn for_each_with_schedule( - &self, - sched: Schedule, - iter: &I, - op: F, - ) -> LocalIterForEachHandle - where - I: LocalIterator + 'static, - F: Fn(I::Item) + SyncSend + Clone + 'static, - { - LocalIteratorLauncher::for_each_with_schedule(&self.array, sched, iter, op) - } - fn for_each_async(&self, iter: &I, op: F) -> LocalIterForEachHandle - where - I: LocalIterator + 'static, - F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, - Fut: Future + Send + 'static, - { - LocalIteratorLauncher::for_each_async(&self.array, iter, op) - } - fn for_each_async_with_schedule( - &self, - sched: Schedule, - iter: &I, - op: F, - ) -> LocalIterForEachHandle - where - I: LocalIterator + 'static, - F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, - Fut: Future + Send + 'static, - { - LocalIteratorLauncher::for_each_async_with_schedule(&self.array, sched, iter, op) - } - - fn reduce(&self, iter: &I, op: F) -> LocalIterReduceHandle - where - I: LocalIterator + 'static, - I::Item: SyncSend, - F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, - { - LocalIteratorLauncher::reduce(&self.array, iter, op) - } - - fn reduce_with_schedule( - &self, - sched: Schedule, - iter: &I, - op: F, - ) -> LocalIterReduceHandle - where - I: LocalIterator + 'static, - I::Item: SyncSend, - F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, - { - LocalIteratorLauncher::reduce_with_schedule(&self.array, sched, iter, op) - } - - // fn reduce_async(&self, iter: &I, op: F) -> Pin> + Send>> - // where - // I: LocalIterator + 'static, - // I::Item: 
SyncSend, - // F: Fn(I::Item, I::Item) -> Fut + SyncSend + Clone + 'static, - // Fut: Future + SyncSend + Clone + 'static - // { - // self.array.reduce_async(iter, op) - // } - - // fn reduce_async_with_schedule(&self, sched: Schedule, iter: &I, op: F) -> Pin> + Send>> - // where - // I: LocalIterator + 'static, - // I::Item: SyncSend, - // F: Fn(I::Item, I::Item) -> Fut + SyncSend + Clone + 'static, - // Fut: Future + SyncSend + Clone + 'static - // { - // self.array.reduce_async_with_schedule(sched, iter, op) - // } - - fn collect(&self, iter: &I, d: Distribution) -> LocalIterCollectHandle - where - I: LocalIterator + 'static, - I::Item: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - LocalIteratorLauncher::collect(&self.array, iter, d) - } - - fn collect_with_schedule( - &self, - sched: Schedule, - iter: &I, - d: Distribution, - ) -> LocalIterCollectHandle - where - I: LocalIterator + 'static, - I::Item: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - LocalIteratorLauncher::collect_with_schedule(&self.array, sched, iter, d) - } - - // fn collect_async( - // &self, - // iter: &I, - // d: Distribution, - // ) -> Pin + Send>> - // where - // I: LocalIterator + 'static, - // I::Item: Future + Send + 'static, - // B: Dist + ArrayOps, - // A: From> + SyncSend + Clone + 'static, - // { - // self.array.collect_async(iter, d) - // } - - // fn collect_async_with_schedule( - // &self, - // sched: Schedule, - // iter: &I, - // d: Distribution, - // ) -> Pin + Send>> - // where - // I: LocalIterator + 'static, - // I::Item: Future + Send + 'static, - // B: Dist + ArrayOps, - // A: From> + SyncSend + Clone + 'static, - // { - // self.array.collect_async_with_schedule(sched, iter, d) - // } - - fn count(&self, iter: &I) -> LocalIterCountHandle - where - I: LocalIterator + 'static, - { - LocalIteratorLauncher::count(&self.array, iter) - } - - fn count_with_schedule(&self, sched: Schedule, iter: &I) -> LocalIterCountHandle - where - I: LocalIterator + 'static, - { - LocalIteratorLauncher::count_with_schedule(&self.array, sched, iter) - } - - fn sum(&self, iter: &I) -> LocalIterSumHandle - where - I: LocalIterator + 'static, - I::Item: SyncSend + std::iter::Sum, - { - LocalIteratorLauncher::sum(&self.array, iter) - } - - fn sum_with_schedule(&self, sched: Schedule, iter: &I) -> LocalIterSumHandle - where - I: LocalIterator + 'static, - I::Item: SyncSend + std::iter::Sum, - { - LocalIteratorLauncher::sum_with_schedule(&self.array, sched, iter) - } - - fn team(&self) -> Pin> { - self.array.team_rt().clone() - } -} +impl LocalIteratorLauncher for ReadOnlyArray {} diff --git a/src/array/unsafe.rs b/src/array/unsafe.rs index f50b12a4..17711f2c 100644 --- a/src/array/unsafe.rs +++ b/src/array/unsafe.rs @@ -10,6 +10,7 @@ use crate::active_messaging::*; use crate::array::private::{ArrayExecAm, LamellarArrayPrivate}; use crate::array::*; use crate::array::{LamellarRead, LamellarWrite}; +use crate::barrier::BarrierHandle; use crate::darc::{Darc, DarcMode, WeakDarc}; use crate::env_var::config; use crate::lamellae::AllocationType; @@ -2087,6 +2088,10 @@ impl UnsafeArrayInner { } } } + + fn barrier_handle(&self) -> BarrierHandle { + self.data.team.barrier.barrier_handle() + } } #[cfg(test)] diff --git a/src/array/unsafe/iteration/distributed.rs b/src/array/unsafe/iteration/distributed.rs index 2bb48c05..3cae74c5 100644 --- a/src/array/unsafe/iteration/distributed.rs +++ b/src/array/unsafe/iteration/distributed.rs @@ -3,6 +3,7 
@@ use crate::array::iterator::distributed_iterator::*; use crate::array::iterator::private::Sealed; use crate::array::r#unsafe::{UnsafeArray, UnsafeArrayInner}; use crate::array::{ArrayOps, AsyncTeamFrom, Distribution, InnerArray}; +use crate::lamellar_request::LamellarRequest; use crate::array::iterator::Schedule; use crate::lamellar_team::LamellarTeamRT; @@ -10,6 +11,7 @@ use crate::memregion::Dist; use core::marker::PhantomData; use futures_util::Future; +use paste::paste; use std::pin::Pin; use std::sync::Arc; @@ -26,253 +28,72 @@ impl InnerArray for UnsafeArrayInner { } impl DistIteratorLauncher for UnsafeArray {} -// // type Inner = Self; -// fn global_index_from_local(&self, index: usize, chunk_size: usize) -> Option { -// // println!("global index cs:{:?}",chunk_size); -// if chunk_size == 1 { -// self.inner.global_index_from_local(index) -// } else { -// Some(self.inner.global_index_from_local(index * chunk_size)? / chunk_size) -// } -// } - -// fn subarray_index_from_local(&self, index: usize, chunk_size: usize) -> Option { -// if chunk_size == 1 { -// self.inner.subarray_index_from_local(index) -// } else { -// Some(self.inner.subarray_index_from_local(index * chunk_size)? / chunk_size) -// } -// } - -// // fn subarray_pe_and_offset_for_global_index(&self, index: usize, chunk_size: usize) -> Option<(usize,usize)> { -// // if chunk_size == 1 { -// // Some(self.calc_pe_and_offset(index)) -// // } else { -// // Some(self.calc_pe_and_offset(index * chunk_size)? / chunk_size) -// // } -// // } - -// fn for_each(&self, iter: &I, op: F) -> DistIterForEachHandle -// where -// I: DistributedIterator + 'static, -// F: Fn(I::Item) + SyncSend + Clone + 'static, -// { -// self.for_each_with_schedule(Schedule::Static, iter, op) -// } - -// fn for_each_with_schedule( -// &self, -// sched: Schedule, -// iter: &I, -// op: F, -// ) -> DistIterForEachHandle -// where -// I: DistributedIterator + 'static, -// F: Fn(I::Item) + SyncSend + Clone + 'static, -// { -// let for_each = ForEach { -// iter: iter.iter_clone(Sealed), -// op, -// }; -// self.barrier(); -// match sched { -// Schedule::Static => self.inner.sched_static(for_each), -// Schedule::Dynamic => self.inner.sched_dynamic(for_each), -// Schedule::Chunk(size) => self.inner.sched_chunk(for_each, size), -// Schedule::Guided => self.inner.sched_guided(for_each), -// Schedule::WorkStealing => self.inner.sched_work_stealing(for_each), -// } -// } - -// fn for_each_async(&self, iter: &I, op: F) -> DistIterForEachHandle -// where -// I: DistributedIterator + 'static, -// F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, -// Fut: Future + Send + 'static, -// { -// self.for_each_async_with_schedule(Schedule::Static, iter, op) -// } - -// fn for_each_async_with_schedule( -// &self, -// sched: Schedule, -// iter: &I, -// op: F, -// ) -> DistIterForEachHandle -// where -// I: DistributedIterator + 'static, -// F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, -// Fut: Future + Send + 'static, -// { -// let for_each = ForEachAsync { -// iter: iter.iter_clone(Sealed), -// op, -// }; -// self.barrier(); -// match sched { -// Schedule::Static => self.inner.sched_static(for_each), -// Schedule::Dynamic => self.inner.sched_dynamic(for_each), -// Schedule::Chunk(size) => self.inner.sched_chunk(for_each, size), -// Schedule::Guided => self.inner.sched_guided(for_each), -// Schedule::WorkStealing => self.inner.sched_work_stealing(for_each), -// } -// } - -// fn reduce(&self, iter: &I, op: F) -> DistIterReduceHandle -// where -// I: DistributedIterator + 
'static, -// I::Item: Dist + ArrayOps, -// F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, -// { -// self.reduce_with_schedule(Schedule::Static, iter, op) -// } - -// fn reduce_with_schedule( -// &self, -// sched: Schedule, -// iter: &I, -// op: F, -// ) -> DistIterReduceHandle -// where -// I: DistributedIterator + 'static, -// I::Item: Dist + ArrayOps, -// F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, -// { -// let reduce = Reduce { -// iter: iter.iter_clone(Sealed), -// op, -// }; -// match sched { -// Schedule::Static => self.inner.sched_static(reduce), -// Schedule::Dynamic => self.inner.sched_dynamic(reduce), -// Schedule::Chunk(size) => self.inner.sched_chunk(reduce, size), -// Schedule::Guided => self.inner.sched_guided(reduce), -// Schedule::WorkStealing => self.inner.sched_work_stealing(reduce), -// } -// } - -// fn collect(&self, iter: &I, d: Distribution) -> DistIterCollectHandle -// where -// I: DistributedIterator + 'static, -// I::Item: Dist + ArrayOps, -// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, -// { -// self.collect_with_schedule(Schedule::Static, iter, d) -// } - -// fn collect_with_schedule( -// &self, -// sched: Schedule, -// iter: &I, -// d: Distribution, -// ) -> DistIterCollectHandle -// where -// I: DistributedIterator + 'static, -// I::Item: Dist + ArrayOps, -// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, -// { -// let collect = Collect { -// iter: iter.iter_clone(Sealed).monotonic(), -// distribution: d, -// _phantom: PhantomData, -// }; -// match sched { -// Schedule::Static => self.inner.sched_static(collect), -// Schedule::Dynamic => self.inner.sched_dynamic(collect), -// Schedule::Chunk(size) => self.inner.sched_chunk(collect, size), -// Schedule::Guided => self.inner.sched_guided(collect), -// Schedule::WorkStealing => self.inner.sched_work_stealing(collect), -// } -// } - -// fn collect_async(&self, iter: &I, d: Distribution) -> DistIterCollectHandle -// where -// I: DistributedIterator, -// I::Item: Future + Send + 'static, -// B: Dist + ArrayOps, -// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, -// { -// self.collect_async_with_schedule(Schedule::Static, iter, d) -// } - -// fn collect_async_with_schedule( -// &self, -// sched: Schedule, -// iter: &I, -// d: Distribution, -// ) -> DistIterCollectHandle -// where -// I: DistributedIterator, -// I::Item: Future + Send + 'static, -// B: Dist + ArrayOps, -// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, -// { -// let collect = CollectAsync { -// iter: iter.iter_clone(Sealed).monotonic(), -// distribution: d, -// _phantom: PhantomData, -// }; -// match sched { -// Schedule::Static => self.inner.sched_static(collect), -// Schedule::Dynamic => self.inner.sched_dynamic(collect), -// Schedule::Chunk(size) => self.inner.sched_chunk(collect, size), -// Schedule::Guided => self.inner.sched_guided(collect), -// Schedule::WorkStealing => self.inner.sched_work_stealing(collect), -// } -// } - -// fn count(&self, iter: &I) -> DistIterCountHandle -// where -// I: DistributedIterator + 'static, -// { -// self.count_with_schedule(Schedule::Static, iter) -// } - -// fn count_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterCountHandle -// where -// I: DistributedIterator + 'static, -// { -// let count = Count { -// iter: iter.iter_clone(Sealed), -// }; -// match sched { -// Schedule::Static => self.inner.sched_static(count), -// Schedule::Dynamic => self.inner.sched_dynamic(count), 
-// Schedule::Chunk(size) => self.inner.sched_chunk(count, size), -// Schedule::Guided => self.inner.sched_guided(count), -// Schedule::WorkStealing => self.inner.sched_work_stealing(count), -// } -// } - -// fn sum(&self, iter: &I) -> DistIterSumHandle -// where -// I: DistributedIterator + 'static, -// I::Item: Dist + ArrayOps + std::iter::Sum, -// { -// self.sum_with_schedule(Schedule::Static, iter) -// } - -// fn sum_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterSumHandle -// where -// I: DistributedIterator + 'static, -// I::Item: Dist + ArrayOps + std::iter::Sum, -// { -// let sum = Sum { -// iter: iter.iter_clone(Sealed), -// }; -// match sched { -// Schedule::Static => self.inner.sched_static(sum), -// Schedule::Dynamic => self.inner.sched_dynamic(sum), -// Schedule::Chunk(size) => self.inner.sched_chunk(sum, size), -// Schedule::Guided => self.inner.sched_guided(sum), -// Schedule::WorkStealing => self.inner.sched_work_stealing(sum), -// } -// } - -// fn team(&self) -> Pin> { -// self.inner.data.team.clone() -// } -// } + +macro_rules! consumer_impl { + ($name:ident<$($generics:ident),*>($($arg:ident : $arg_ty:ty),*); [$return_type:ident$(<$($ret_gen:ty),*>)?]; [$($bounds:tt)+]; [$($am:tt)*]; [$(-> $($blocking_ret:tt)*)?] ) => { + paste! { + fn $name<$($generics),*>(&self, $($arg : $arg_ty),*) -> $return_type$(<$($ret_gen),*>)? + where + $($bounds)+ + { + + self.[<$name _with_schedule>](Schedule::Static, $($arg),*) + } + + + fn [<$name _with_schedule >]<$($generics),*>( + &self, + sched: Schedule, + $($arg : $arg_ty),* + ) -> $return_type$(<$($ret_gen),*>)? + where + $($bounds)+ + { + let am = $($am)*; + let barrier = self.barrier_handle(); + let inner = self.clone(); + let reqs_future = Box::pin(async move{match sched { + Schedule::Static => inner.sched_static(am), + Schedule::Dynamic => inner.sched_dynamic(am), + Schedule::Chunk(size) => inner.sched_chunk(am,size), + Schedule::Guided => inner.sched_guided(am), + Schedule::WorkStealing => inner.sched_work_stealing(am), + }}); + $return_type::new(barrier,reqs_future,self) + } + + fn []<$($generics),*>(&self, $($arg : $arg_ty),*) $(-> $($blocking_ret)*)? + where + $($bounds)+ + { + + self.[](Schedule::Static, $($arg),*) + } + + + fn []<$($generics),*>( + &self, + sched: Schedule, + $($arg : $arg_ty),* + ) $(-> $($blocking_ret)*)? + where + $($bounds)+ + { + let am = $($am)*; + self.data.team.barrier.tasking_barrier(); + let inner = self.clone(); + let reqs = match sched { + Schedule::Static => inner.sched_static(am), + Schedule::Dynamic => inner.sched_dynamic(am), + Schedule::Chunk(size) => inner.sched_chunk(am,size), + Schedule::Guided => inner.sched_guided(am), + Schedule::WorkStealing => inner.sched_work_stealing(am), + }; + reqs.blocking_wait() + } + } + }; +} impl DistIteratorLauncher for UnsafeArrayInner { // type Inner = Self; @@ -293,230 +114,317 @@ impl DistIteratorLauncher for UnsafeArrayInner { } } - // fn subarray_pe_and_offset_for_global_index(&self, index: usize, chunk_size: usize) -> Option<(usize,usize)> { - // if chunk_size == 1 { - // Some(self.calc_pe_and_offset(index)) - // } else { - // Some(self.calc_pe_and_offset(index * chunk_size)? 
/ chunk_size) - // } - // } - - fn for_each(&self, iter: &I, op: F) -> DistIterForEachHandle - where - I: DistributedIterator + 'static, - F: Fn(I::Item) + SyncSend + Clone + 'static, - { - self.for_each_with_schedule(Schedule::Static, iter, op) - } - - fn for_each_with_schedule( - &self, - sched: Schedule, - iter: &I, - op: F, - ) -> DistIterForEachHandle - where - I: DistributedIterator + 'static, - F: Fn(I::Item) + SyncSend + Clone + 'static, - { - let for_each = ForEach { + consumer_impl!( + for_each(iter: &I, op: F); + [DistIterForEachHandle]; + [I: DistributedIterator + 'static, F: Fn(I::Item) + SyncSend + Clone + 'static]; + [ + ForEach { iter: iter.iter_clone(Sealed), op, - }; - self.team().barrier(); - match sched { - Schedule::Static => self.sched_static(for_each), - Schedule::Dynamic => self.sched_dynamic(for_each), - Schedule::Chunk(size) => self.sched_chunk(for_each, size), - Schedule::Guided => self.sched_guided(for_each), - Schedule::WorkStealing => self.sched_work_stealing(for_each), } - } + ]; + []); + + consumer_impl!( + for_each_async(iter: &I, op: F); + [DistIterForEachHandle]; + [I: DistributedIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, Fut: Future + Send + 'static]; + [ + ForEachAsync { + iter: iter.iter_clone(Sealed), + op, + } + ]; + [] + ); + + // fn for_each(&self, iter: &I, op: F) -> DistIterForEachHandle + // where + // I: DistributedIterator + 'static, + // F: Fn(I::Item) + SyncSend + Clone + 'static, + // { + // self.for_each_with_schedule(Schedule::Static, iter, op) + // } - fn for_each_async(&self, iter: &I, op: F) -> DistIterForEachHandle - where - I: DistributedIterator + 'static, - F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, - Fut: Future + Send + 'static, - { - self.for_each_async_with_schedule(Schedule::Static, iter, op) - } + // fn for_each_with_schedule( + // &self, + // sched: Schedule, + // iter: &I, + // op: F, + // ) -> DistIterForEachHandle + // where + // I: DistributedIterator + 'static, + // F: Fn(I::Item) + SyncSend + Clone + 'static, + // { + // let for_each = ForEach { + // iter: iter.iter_clone(Sealed), + // op, + // }; + // self.team().barrier(); + // match sched { + // Schedule::Static => self.sched_static(for_each), + // Schedule::Dynamic => self.sched_dynamic(for_each), + // Schedule::Chunk(size) => self.sched_chunk(for_each, size), + // Schedule::Guided => self.sched_guided(for_each), + // Schedule::WorkStealing => self.sched_work_stealing(for_each), + // } + // } - fn for_each_async_with_schedule( - &self, - sched: Schedule, - iter: &I, - op: F, - ) -> DistIterForEachHandle - where - I: DistributedIterator + 'static, - F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, - Fut: Future + Send + 'static, - { - let for_each = ForEachAsync { - iter: iter.iter_clone(Sealed), - op, - }; - self.team().barrier(); - match sched { - Schedule::Static => self.sched_static(for_each), - Schedule::Dynamic => self.sched_dynamic(for_each), - Schedule::Chunk(size) => self.sched_chunk(for_each, size), - Schedule::Guided => self.sched_guided(for_each), - Schedule::WorkStealing => self.sched_work_stealing(for_each), - } - } + // fn for_each_async(&self, iter: &I, op: F) -> DistIterForEachHandle + // where + // I: DistributedIterator + 'static, + // F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, + // Fut: Future + Send + 'static, + // { + // self.for_each_async_with_schedule(Schedule::Static, iter, op) + // } - fn reduce(&self, iter: &I, op: F) -> DistIterReduceHandle - where - I: DistributedIterator + 'static, - 
I::Item: Dist + ArrayOps, - F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, - { - self.reduce_with_schedule(Schedule::Static, iter, op) - } + // fn for_each_async_with_schedule( + // &self, + // sched: Schedule, + // iter: &I, + // op: F, + // ) -> DistIterForEachHandle + // where + // I: DistributedIterator + 'static, + // F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, + // Fut: Future + Send + 'static, + // { + // let for_each = ForEachAsync { + // iter: iter.iter_clone(Sealed), + // op, + // }; + // self.team().barrier(); + // match sched { + // Schedule::Static => self.sched_static(for_each), + // Schedule::Dynamic => self.sched_dynamic(for_each), + // Schedule::Chunk(size) => self.sched_chunk(for_each, size), + // Schedule::Guided => self.sched_guided(for_each), + // Schedule::WorkStealing => self.sched_work_stealing(for_each), + // } + // } - fn reduce_with_schedule( - &self, - sched: Schedule, - iter: &I, - op: F, - ) -> DistIterReduceHandle - where - I: DistributedIterator + 'static, - I::Item: Dist + ArrayOps, - F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, - { - let reduce = Reduce { - iter: iter.iter_clone(Sealed), - op, - }; - match sched { - Schedule::Static => self.sched_static(reduce), - Schedule::Dynamic => self.sched_dynamic(reduce), - Schedule::Chunk(size) => self.sched_chunk(reduce, size), - Schedule::Guided => self.sched_guided(reduce), - Schedule::WorkStealing => self.sched_work_stealing(reduce), - } - } + consumer_impl!( + reduce( iter: &I, op: F); + [DistIterReduceHandle]; + [I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static]; + [ + Reduce { + iter: iter.iter_clone(Sealed), + op, + } + ]; + [-> Option]); + + // consumer_impl!( + // reduce_async( iter: &I, op: F); + // [DistIterReduceHandle]; + // [I: DistributedIterator + 'static, I::Item: Future + Send + 'static, T: Dist + Send + ArrayOps, F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static,]; + // [ + // ReduceAsync { + // iter: iter.iter_clone(Sealed), + // op, + // // _phantom: PhantomData, + // } + // ]; + // [-> Option]); + + // fn reduce(&self, iter: &I, op: F) -> DistIterReduceHandle + // where + // I: DistributedIterator + 'static, + // I::Item: Dist + ArrayOps, + // F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, + // { + // self.reduce_with_schedule(Schedule::Static, iter, op) + // } - fn collect(&self, iter: &I, d: Distribution) -> DistIterCollectHandle - where - I: DistributedIterator + 'static, - I::Item: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - self.collect_with_schedule(Schedule::Static, iter, d) - } + // fn reduce_with_schedule( + // &self, + // sched: Schedule, + // iter: &I, + // op: F, + // ) -> DistIterReduceHandle + // where + // I: DistributedIterator + 'static, + // I::Item: Dist + ArrayOps, + // F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, + // { + // let reduce = Reduce { + // iter: iter.iter_clone(Sealed), + // op, + // }; + // match sched { + // Schedule::Static => self.sched_static(reduce), + // Schedule::Dynamic => self.sched_dynamic(reduce), + // Schedule::Chunk(size) => self.sched_chunk(reduce, size), + // Schedule::Guided => self.sched_guided(reduce), + // Schedule::WorkStealing => self.sched_work_stealing(reduce), + // } + // } - fn collect_with_schedule( - &self, - sched: Schedule, - iter: &I, - d: Distribution, - ) -> DistIterCollectHandle - where - I: 
DistributedIterator + 'static, - I::Item: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - let collect = Collect { - iter: iter.iter_clone(Sealed).monotonic(), - distribution: d, - _phantom: PhantomData, - }; - match sched { - Schedule::Static => self.sched_static(collect), - Schedule::Dynamic => self.sched_dynamic(collect), - Schedule::Chunk(size) => self.sched_chunk(collect, size), - Schedule::Guided => self.sched_guided(collect), - Schedule::WorkStealing => self.sched_work_stealing(collect), - } - } + consumer_impl!( + collect( iter: &I, d: Distribution); + [DistIterCollectHandle]; + [I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static,]; + [ + Collect { + iter: iter.iter_clone(Sealed).monotonic(), + distribution: d, + _phantom: PhantomData, + } + ]; + [-> A]); + consumer_impl!( + collect_async( iter: &I, d: Distribution); + [DistIterCollectHandle]; + [I: DistributedIterator + 'static, I::Item: Future + Send + 'static,B: Dist + ArrayOps,A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static,]; + [ + CollectAsync { + iter: iter.iter_clone(Sealed).monotonic(), + distribution: d, + _phantom: PhantomData, + } + ]; + [-> A]); + + // fn collect(&self, iter: &I, d: Distribution) -> DistIterCollectHandle + // where + // I: DistributedIterator + 'static, + // I::Item: Dist + ArrayOps, + // A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, + // { + // self.collect_with_schedule(Schedule::Static, iter, d) + // } - fn collect_async(&self, iter: &I, d: Distribution) -> DistIterCollectHandle - where - I: DistributedIterator, - I::Item: Future + Send + 'static, - B: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - self.collect_async_with_schedule(Schedule::Static, iter, d) - } + // fn collect_with_schedule( + // &self, + // sched: Schedule, + // iter: &I, + // d: Distribution, + // ) -> DistIterCollectHandle + // where + // I: DistributedIterator + 'static, + // I::Item: Dist + ArrayOps, + // A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, + // { + // let collect = Collect { + // iter: iter.iter_clone(Sealed).monotonic(), + // distribution: d, + // _phantom: PhantomData, + // }; + // match sched { + // Schedule::Static => self.sched_static(collect), + // Schedule::Dynamic => self.sched_dynamic(collect), + // Schedule::Chunk(size) => self.sched_chunk(collect, size), + // Schedule::Guided => self.sched_guided(collect), + // Schedule::WorkStealing => self.sched_work_stealing(collect), + // } + // } - fn collect_async_with_schedule( - &self, - sched: Schedule, - iter: &I, - d: Distribution, - ) -> DistIterCollectHandle - where - I: DistributedIterator, - I::Item: Future + Send + 'static, - B: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - let collect = CollectAsync { - iter: iter.iter_clone(Sealed).monotonic(), - distribution: d, - _phantom: PhantomData, - }; - match sched { - Schedule::Static => self.sched_static(collect), - Schedule::Dynamic => self.sched_dynamic(collect), - Schedule::Chunk(size) => self.sched_chunk(collect, size), - Schedule::Guided => self.sched_guided(collect), - Schedule::WorkStealing => self.sched_work_stealing(collect), - } - } + // fn collect_async(&self, iter: &I, d: Distribution) -> DistIterCollectHandle + // where + // I: DistributedIterator, + // I::Item: Future + Send + 'static, + // B: Dist + ArrayOps, + // A: 
AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, + // { + // self.collect_async_with_schedule(Schedule::Static, iter, d) + // } - fn count(&self, iter: &I) -> DistIterCountHandle - where - I: DistributedIterator + 'static, - { - self.count_with_schedule(Schedule::Static, iter) - } + // fn collect_async_with_schedule( + // &self, + // sched: Schedule, + // iter: &I, + // d: Distribution, + // ) -> DistIterCollectHandle + // where + // I: DistributedIterator, + // I::Item: Future + Send + 'static, + // B: Dist + ArrayOps, + // A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, + // { + // let collect = CollectAsync { + // iter: iter.iter_clone(Sealed).monotonic(), + // distribution: d, + // _phantom: PhantomData, + // }; + // match sched { + // Schedule::Static => self.sched_static(collect), + // Schedule::Dynamic => self.sched_dynamic(collect), + // Schedule::Chunk(size) => self.sched_chunk(collect, size), + // Schedule::Guided => self.sched_guided(collect), + // Schedule::WorkStealing => self.sched_work_stealing(collect), + // } + // } + consumer_impl!( + count( iter: &I); + [DistIterCountHandle]; + [I: DistributedIterator + 'static ]; + [ + Count { + iter: iter.iter_clone(Sealed), + } + ]; + [-> usize]); + // fn count(&self, iter: &I) -> DistIterCountHandle + // where + // I: DistributedIterator + 'static, + // { + // self.count_with_schedule(Schedule::Static, iter) + // } - fn count_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterCountHandle - where - I: DistributedIterator + 'static, - { - let count = Count { - iter: iter.iter_clone(Sealed), - }; - match sched { - Schedule::Static => self.sched_static(count), - Schedule::Dynamic => self.sched_dynamic(count), - Schedule::Chunk(size) => self.sched_chunk(count, size), - Schedule::Guided => self.sched_guided(count), - Schedule::WorkStealing => self.sched_work_stealing(count), - } - } + // fn count_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterCountHandle + // where + // I: DistributedIterator + 'static, + // { + // let count = Count { + // iter: iter.iter_clone(Sealed), + // }; + // match sched { + // Schedule::Static => self.sched_static(count), + // Schedule::Dynamic => self.sched_dynamic(count), + // Schedule::Chunk(size) => self.sched_chunk(count, size), + // Schedule::Guided => self.sched_guided(count), + // Schedule::WorkStealing => self.sched_work_stealing(count), + // } + // } - fn sum(&self, iter: &I) -> DistIterSumHandle - where - I: DistributedIterator + 'static, - I::Item: Dist + ArrayOps + std::iter::Sum, - { - self.sum_with_schedule(Schedule::Static, iter) - } + consumer_impl!( + sum(iter: &I); + [DistIterSumHandle]; + [I: DistributedIterator + 'static, I::Item: Dist + ArrayOps + std::iter::Sum, ]; + [ + Sum { + iter: iter.iter_clone(Sealed), + } + ]; + [-> I::Item]); + + // fn sum(&self, iter: &I) -> DistIterSumHandle + // where + // I: DistributedIterator + 'static, + // I::Item: Dist + ArrayOps + std::iter::Sum, + // { + // self.sum_with_schedule(Schedule::Static, iter) + // } - fn sum_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterSumHandle - where - I: DistributedIterator + 'static, - I::Item: Dist + ArrayOps + std::iter::Sum, - { - let sum = Sum { - iter: iter.iter_clone(Sealed), - }; - match sched { - Schedule::Static => self.sched_static(sum), - Schedule::Dynamic => self.sched_dynamic(sum), - Schedule::Chunk(size) => self.sched_chunk(sum, size), - Schedule::Guided => self.sched_guided(sum), - Schedule::WorkStealing => self.sched_work_stealing(sum), - 
} - } + // fn sum_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterSumHandle + // where + // I: DistributedIterator + 'static, + // I::Item: Dist + ArrayOps + std::iter::Sum, + // { + // let sum = Sum { + // iter: iter.iter_clone(Sealed), + // }; + // match sched { + // Schedule::Static => self.sched_static(sum), + // Schedule::Dynamic => self.sched_dynamic(sum), + // Schedule::Chunk(size) => self.sched_chunk(sum, size), + // Schedule::Guided => self.sched_guided(sum), + // Schedule::WorkStealing => self.sched_work_stealing(sum), + // } + // } fn team(&self) -> Pin> { self.data.team.clone() diff --git a/src/array/unsafe/iteration/local.rs b/src/array/unsafe/iteration/local.rs index 1b433afc..15b88162 100644 --- a/src/array/unsafe/iteration/local.rs +++ b/src/array/unsafe/iteration/local.rs @@ -1,7 +1,7 @@ use crate::active_messaging::SyncSend; use crate::array::iterator::local_iterator::*; use crate::array::iterator::private::*; -use crate::array::r#unsafe::UnsafeArray; +use crate::array::r#unsafe::{UnsafeArray, UnsafeArrayInner}; use crate::array::{ArrayOps, AsyncTeamFrom, Distribution}; use crate::array::iterator::Schedule; @@ -13,21 +13,23 @@ use futures_util::Future; use std::pin::Pin; use std::sync::Arc; -impl LocalIteratorLauncher for UnsafeArray { +impl LocalIteratorLauncher for UnsafeArray {} + +impl LocalIteratorLauncher for UnsafeArrayInner { fn local_global_index_from_local(&self, index: usize, chunk_size: usize) -> Option { // println!("global index cs:{:?}",chunk_size); if chunk_size == 1 { - self.inner.global_index_from_local(index) + self.global_index_from_local(index) } else { - Some(self.inner.global_index_from_local(index * chunk_size)? / chunk_size) + Some(self.global_index_from_local(index * chunk_size)? / chunk_size) } } fn local_subarray_index_from_local(&self, index: usize, chunk_size: usize) -> Option { if chunk_size == 1 { - self.inner.subarray_index_from_local(index) + self.subarray_index_from_local(index) } else { - Some(self.inner.subarray_index_from_local(index * chunk_size)? / chunk_size) + Some(self.subarray_index_from_local(index * chunk_size)? 
/ chunk_size) } } @@ -54,11 +56,11 @@ impl LocalIteratorLauncher for UnsafeArray { op, }; match sched { - Schedule::Static => self.inner.sched_static(for_each), - Schedule::Dynamic => self.inner.sched_dynamic(for_each), - Schedule::Chunk(size) => self.inner.sched_chunk(for_each, size), - Schedule::Guided => self.inner.sched_guided(for_each), - Schedule::WorkStealing => self.inner.sched_work_stealing(for_each), + Schedule::Static => self.sched_static(for_each), + Schedule::Dynamic => self.sched_dynamic(for_each), + Schedule::Chunk(size) => self.sched_chunk(for_each, size), + Schedule::Guided => self.sched_guided(for_each), + Schedule::WorkStealing => self.sched_work_stealing(for_each), } } @@ -87,11 +89,11 @@ impl LocalIteratorLauncher for UnsafeArray { op: op.clone(), }; match sched { - Schedule::Static => self.inner.sched_static(for_each), - Schedule::Dynamic => self.inner.sched_dynamic(for_each), - Schedule::Chunk(size) => self.inner.sched_chunk(for_each, size), - Schedule::Guided => self.inner.sched_guided(for_each), - Schedule::WorkStealing => self.inner.sched_work_stealing(for_each), + Schedule::Static => self.sched_static(for_each), + Schedule::Dynamic => self.sched_dynamic(for_each), + Schedule::Chunk(size) => self.sched_chunk(for_each, size), + Schedule::Guided => self.sched_guided(for_each), + Schedule::WorkStealing => self.sched_work_stealing(for_each), } } @@ -120,11 +122,11 @@ impl LocalIteratorLauncher for UnsafeArray { op, }; match sched { - Schedule::Static => self.inner.sched_static(reduce), - Schedule::Dynamic => self.inner.sched_dynamic(reduce), - Schedule::Chunk(size) => self.inner.sched_chunk(reduce, size), - Schedule::Guided => self.inner.sched_guided(reduce), - Schedule::WorkStealing => self.inner.sched_work_stealing(reduce), + Schedule::Static => self.sched_static(reduce), + Schedule::Dynamic => self.sched_dynamic(reduce), + Schedule::Chunk(size) => self.sched_chunk(reduce, size), + Schedule::Guided => self.sched_guided(reduce), + Schedule::WorkStealing => self.sched_work_stealing(reduce), } } @@ -154,11 +156,11 @@ impl LocalIteratorLauncher for UnsafeArray { _phantom: PhantomData, }; match sched { - Schedule::Static => self.inner.sched_static(collect), - Schedule::Dynamic => self.inner.sched_dynamic(collect), - Schedule::Chunk(size) => self.inner.sched_chunk(collect, size), - Schedule::Guided => self.inner.sched_guided(collect), - Schedule::WorkStealing => self.inner.sched_work_stealing(collect), + Schedule::Static => self.sched_static(collect), + Schedule::Dynamic => self.sched_dynamic(collect), + Schedule::Chunk(size) => self.sched_chunk(collect, size), + Schedule::Guided => self.sched_guided(collect), + Schedule::WorkStealing => self.sched_work_stealing(collect), } } @@ -177,11 +179,11 @@ impl LocalIteratorLauncher for UnsafeArray { iter: iter.iter_clone(Sealed), }; match sched { - Schedule::Static => self.inner.sched_static(count), - Schedule::Dynamic => self.inner.sched_dynamic(count), - Schedule::Chunk(size) => self.inner.sched_chunk(count, size), - Schedule::Guided => self.inner.sched_guided(count), - Schedule::WorkStealing => self.inner.sched_work_stealing(count), + Schedule::Static => self.sched_static(count), + Schedule::Dynamic => self.sched_dynamic(count), + Schedule::Chunk(size) => self.sched_chunk(count, size), + Schedule::Guided => self.sched_guided(count), + Schedule::WorkStealing => self.sched_work_stealing(count), } } @@ -202,15 +204,15 @@ impl LocalIteratorLauncher for UnsafeArray { iter: iter.iter_clone(Sealed), }; match sched { - 
Schedule::Static => self.inner.sched_static(sum), - Schedule::Dynamic => self.inner.sched_dynamic(sum), - Schedule::Chunk(size) => self.inner.sched_chunk(sum, size), - Schedule::Guided => self.inner.sched_guided(sum), - Schedule::WorkStealing => self.inner.sched_work_stealing(sum), + Schedule::Static => self.sched_static(sum), + Schedule::Dynamic => self.sched_dynamic(sum), + Schedule::Chunk(size) => self.sched_chunk(sum, size), + Schedule::Guided => self.sched_guided(sum), + Schedule::WorkStealing => self.sched_work_stealing(sum), } } fn team(&self) -> Pin> { - self.inner.data.team.clone() + self.data.team.clone() } } diff --git a/src/barrier.rs b/src/barrier.rs index 9ab3ca7f..6b771901 100644 --- a/src/barrier.rs +++ b/src/barrier.rs @@ -1,11 +1,16 @@ use crate::env_var::config; use crate::lamellae::{AllocationType, Lamellae, LamellaeRDMA}; use crate::lamellar_arch::LamellarArchRT; +use crate::lamellar_request::LamellarRequest; use crate::memregion::MemoryRegion; use crate::scheduler::Scheduler; +use futures_util::Future; +use pin_project::pin_project; +use std::pin::Pin; use std::sync::atomic::{AtomicU8, AtomicUsize, Ordering}; use std::sync::Arc; +use std::task::{Context, Poll, Waker}; use std::time::Instant; pub(crate) struct Barrier { @@ -17,7 +22,7 @@ pub(crate) struct Barrier { pub(crate) scheduler: Arc, lamellae: Arc, barrier_cnt: AtomicUsize, - barrier_buf: Vec>, + barrier_buf: Arc>>, send_buf: Option>, panic: Arc, } @@ -88,7 +93,7 @@ impl Barrier { scheduler, lamellae, barrier_cnt: AtomicUsize::new(1), - barrier_buf: buffs, + barrier_buf: Arc::new(buffs), send_buf, panic, }; @@ -297,107 +302,146 @@ impl Barrier { }); } - pub(crate) async fn async_barrier(&self) { - let mut s = Instant::now(); + pub(crate) fn barrier_handle(&self) -> BarrierHandle { + let mut handle = BarrierHandle { + barrier_buf: self.barrier_buf.clone(), + arch: self.arch.clone(), + lamellae: self.lamellae.clone(), + my_index: 0, + num_pes: self.num_pes, + barrier_id: 0, + num_rounds: self.num_rounds, + n: self.n, + state: State::RoundInit(self.num_rounds), + }; if self.panic.load(Ordering::SeqCst) == 0 { - if let Some(send_buf) = &self.send_buf { + if let Some(_) = &self.send_buf { if let Ok(my_index) = self.arch.team_pe(self.my_pe) { - let send_buf_slice = unsafe { - // im the only thread (remote or local) that can write to this buff - send_buf.as_mut_slice().expect("Data should exist on PE") - }; - let barrier_id = self.barrier_cnt.fetch_add(1, Ordering::SeqCst); - send_buf_slice[0] = barrier_id; - let barrier_slice = &[barrier_id]; - // println!( - // "[{:?}] barrier_id = {:?}", - // std::thread::current().id(), - // barrier_id - // ); - - for round in 0..self.num_rounds { - for i in 1..=self.n { - let team_send_pe = - (my_index + i * (self.n + 1).pow(round as u32)) % self.num_pes; - if team_send_pe != my_index { - let send_pe = self.arch.single_iter(team_send_pe).next().unwrap(); - // println!( - // "[{:?}][ {:?} {:?}] round: {:?} i: {:?} sending to [{:?} ({:?}) ] id: {:?} buf {:?}", - // std::thread::current().id(), - // self.my_pe, - // my_index, - // round, - // i, - // send_pe, - // team_send_pe, - // send_buf_slice, - // unsafe { - // self.barrier_buf[i - 1] - // .as_mut_slice() - // .expect("Data should exist on PE") - // } - // ); - // println!("barrier put_slice 2"); - unsafe { - self.barrier_buf[i - 1].put_slice( - send_pe, - round, - barrier_slice, - ); - //safe as we are the only ones writing to our index - } - } - } - for i in 1..=self.n { - let team_recv_pe = ((my_index as isize - - (i as 
isize * (self.n as isize + 1).pow(round as u32) as isize)) - as isize) - .rem_euclid(self.num_pes as isize) - as isize; - let recv_pe = - self.arch.single_iter(team_recv_pe as usize).next().unwrap(); - if team_recv_pe as usize != my_index { - // println!( - // "[{:?}][{:?} ] recv from [{:?} ({:?}) ] id: {:?} buf {:?}", - // std::thread::current().id(), - // self.my_pe, - // recv_pe, - // team_recv_pe, - // send_buf_slice, - // unsafe { - // self.barrier_buf[i - 1] - // .as_mut_slice() - // .expect("Data should exist on PE") - // } - // ); - unsafe { - //safe as each pe is only capable of writing to its own index - while self.barrier_buf[i - 1] - .as_mut_slice() - .expect("Data should exist on PE")[round] - < barrier_id - { - self.barrier_timeout( - &mut s, - my_index, - round, - i, - team_recv_pe, - recv_pe, - send_buf_slice, - ); - self.lamellae.flush(); - async_std::task::yield_now().await; - } - } - } + handle.barrier_id = barrier_id; + handle.my_index = my_index; + handle.state = State::RoundInit(0); + let mut round = 0; + while round < self.num_rounds { + handle.do_send_round(round); + if let Some(recv_pe) = handle.do_recv_round(round, 1) { + handle.state = State::RoundInProgress(round, recv_pe); + return handle; } + round += 1; } + handle.state = State::RoundInit(self.num_rounds); } } } + handle } + + pub(crate) async fn async_barrier(&self) { + self.barrier_handle().await; + } + + // pub(crate) async fn async_barrier(&self) { + // let mut s = Instant::now(); + // if self.panic.load(Ordering::SeqCst) == 0 { + // if let Some(send_buf) = &self.send_buf { + // if let Ok(my_index) = self.arch.team_pe(self.my_pe) { + // let send_buf_slice = unsafe { + // // im the only thread (remote or local) that can write to this buff + // send_buf.as_mut_slice().expect("Data should exist on PE") + // }; + + // let barrier_id = self.barrier_cnt.fetch_add(1, Ordering::SeqCst); + // send_buf_slice[0] = barrier_id; + // let barrier_slice = &[barrier_id]; + // // println!( + // // "[{:?}] barrier_id = {:?}", + // // std::thread::current().id(), + // // barrier_id + // // ); + + // for round in 0..self.num_rounds { + // for i in 1..=self.n { + // let team_send_pe = + // (my_index + i * (self.n + 1).pow(round as u32)) % self.num_pes; + // if team_send_pe != my_index { + // let send_pe = self.arch.single_iter(team_send_pe).next().unwrap(); + // // println!( + // // "[{:?}][ {:?} {:?}] round: {:?} i: {:?} sending to [{:?} ({:?}) ] id: {:?} buf {:?}", + // // std::thread::current().id(), + // // self.my_pe, + // // my_index, + // // round, + // // i, + // // send_pe, + // // team_send_pe, + // // send_buf_slice, + // // unsafe { + // // self.barrier_buf[i - 1] + // // .as_mut_slice() + // // .expect("Data should exist on PE") + // // } + // // ); + // // println!("barrier put_slice 2"); + // unsafe { + // self.barrier_buf[i - 1].put_slice( + // send_pe, + // round, + // barrier_slice, + // ); + // //safe as we are the only ones writing to our index + // } + // } + // } + // for i in 1..=self.n { + // let team_recv_pe = ((my_index as isize + // - (i as isize * (self.n as isize + 1).pow(round as u32) as isize)) + // as isize) + // .rem_euclid(self.num_pes as isize) + // as isize; + // let recv_pe = + // self.arch.single_iter(team_recv_pe as usize).next().unwrap(); + // if team_recv_pe as usize != my_index { + // // println!( + // // "[{:?}][{:?} ] recv from [{:?} ({:?}) ] id: {:?} buf {:?}", + // // std::thread::current().id(), + // // self.my_pe, + // // recv_pe, + // // team_recv_pe, + // // send_buf_slice, + 
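The commented-out loop above is superseded by the BarrierHandle future defined below, which turns the same send/recv rounds into a poll-driven state machine: each poll advances as many rounds as it can, and when a peer has not yet written the expected barrier id the handle wakes itself and returns Pending. A stripped-down, self-contained model of that polling pattern is shown here; `round_complete` is a placeholder for the remote-completion check and `async_std::task::block_on` is just one way to drive the future, neither is part of the real barrier code.

// Minimal model of a self-waking, multi-round future (the shape used by BarrierHandle).
use std::future::Future;
use std::pin::Pin;
use std::task::{Context, Poll};

struct RoundFuture {
    round: usize,
    num_rounds: usize,
}

impl RoundFuture {
    // placeholder for "has every peer written its barrier id for this round?"
    fn round_complete(&self, _round: usize) -> bool {
        true
    }
}

impl Future for RoundFuture {
    type Output = ();
    fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<()> {
        while self.round < self.num_rounds {
            if !self.round_complete(self.round) {
                // a peer is not done yet: ask to be polled again and yield
                cx.waker().wake_by_ref();
                return Poll::Pending;
            }
            self.round += 1;
        }
        Poll::Ready(())
    }
}

fn main() {
    // with the placeholder check this completes on the first poll
    async_std::task::block_on(RoundFuture { round: 0, num_rounds: 3 });
}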
// // unsafe { + // // self.barrier_buf[i - 1] + // // .as_mut_slice() + // // .expect("Data should exist on PE") + // // } + // // ); + // unsafe { + // //safe as each pe is only capable of writing to its own index + // while self.barrier_buf[i - 1] + // .as_mut_slice() + // .expect("Data should exist on PE")[round] + // < barrier_id + // { + // self.barrier_timeout( + // &mut s, + // my_index, + // round, + // i, + // team_recv_pe, + // recv_pe, + // send_buf_slice, + // ); + // self.lamellae.flush(); + // async_std::task::yield_now().await; + // } + // } + // } + // } + // } + // } + // } + // } + // } } // impl Drop for Barrier { @@ -407,3 +451,192 @@ impl Barrier { // //println!("dropped barrier"); // } // } + +#[pin_project] +pub(crate) struct BarrierHandle { + barrier_buf: Arc>>, + arch: Arc, + lamellae: Arc, + my_index: usize, + num_pes: usize, + barrier_id: usize, + num_rounds: usize, + n: usize, + state: State, +} + +enum State { + RoundInit(usize), //the round we are in + RoundInProgress(usize, usize), //the round we are in, pe we are waiting to hear from +} + +impl BarrierHandle { + fn do_send_round(&self, round: usize) { + let barrier_slice = &[self.barrier_id]; + for i in 1..=self.n { + let team_send_pe = (self.my_index + i * (self.n + 1).pow(round as u32)) % self.num_pes; + if team_send_pe != self.my_index { + let send_pe = self.arch.single_iter(team_send_pe).next().unwrap(); + unsafe { + self.barrier_buf[i - 1].put_slice(send_pe, round, barrier_slice); + //safe as we are the only ones writing to our index + } + } + } + } + + fn do_recv_round(&self, round: usize, recv_pe_index: usize) -> Option { + for i in recv_pe_index..=self.n { + let team_recv_pe = ((self.my_index as isize + - (i as isize * (self.n as isize + 1).pow(round as u32) as isize)) + as isize) + .rem_euclid(self.num_pes as isize) as isize; + let recv_pe = self.arch.single_iter(team_recv_pe as usize).next().unwrap(); + if team_recv_pe as usize != self.my_index { + unsafe { + //safe as each pe is only capable of writing to its own index + if self.barrier_buf[i - 1] + .as_mut_slice() + .expect("Data should exist on PE")[round] + < self.barrier_id + { + self.lamellae.flush(); + return Some(recv_pe); + } + } + } + } + None + } +} + +impl Future for BarrierHandle { + type Output = (); + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + // let mut this = self.project(); + match self.state { + State::RoundInit(round) => { + let mut round = round; + while round < self.num_rounds { + self.do_send_round(round); + if let Some(recv_pe) = self.do_recv_round(round, 1) { + *self.project().state = State::RoundInProgress(round, recv_pe); + cx.waker().wake_by_ref(); + return Poll::Pending; + } + round += 1; + } + *self.project().state = State::RoundInit(round); + Poll::Ready(()) + } + State::RoundInProgress(round, recv_pe) => { + let mut round = round; + if let Some(recv_pe) = self.do_recv_round(round, recv_pe) { + *self.project().state = State::RoundInProgress(round, recv_pe); + cx.waker().wake_by_ref(); + return Poll::Pending; + } + round += 1; + while round < self.num_rounds { + if let Some(recv_pe) = self.do_recv_round(round, 1) { + *self.project().state = State::RoundInProgress(round, recv_pe); + cx.waker().wake_by_ref(); + return Poll::Pending; + } + round += 1; + } + *self.project().state = State::RoundInit(round); + Poll::Ready(()) + } + } + } +} + +impl LamellarRequest for BarrierHandle { + fn blocking_wait(self) -> Self::Output { + match self.state { + State::RoundInit(round) => { + let mut round = round; + 
while round < self.num_rounds { + self.do_send_round(round); + let mut recv_pe_index = 1; + while let Some(recv_pe) = self.do_recv_round(round, recv_pe_index) { + recv_pe_index = recv_pe; + std::thread::yield_now(); + } + round += 1; + } + } + State::RoundInProgress(round, recv_pe) => { + let mut round = round; + let mut recv_pe_index = recv_pe; + while let Some(_recv_pe) = self.do_recv_round(round, recv_pe_index) { + recv_pe_index = recv_pe; + std::thread::yield_now(); + } + round += 1; + while round < self.num_rounds { + recv_pe_index = 1; + while let Some(recv_pe) = self.do_recv_round(round, recv_pe_index) { + recv_pe_index = recv_pe; + std::thread::yield_now(); + } + round += 1; + } + } + } + } + + fn ready_or_set_waker(&mut self, _waker: &Waker) -> bool { + match self.state { + State::RoundInit(round) => { + if round < self.num_rounds { + false + } else { + true + } + } + State::RoundInProgress(round, _) => { + if round < self.num_rounds { + false + } else { + true + } + } + } + } + + fn val(&self) -> Self::Output { + match self.state { + State::RoundInit(round) => { + let mut round = round; + while round < self.num_rounds { + self.do_send_round(round); + let mut recv_pe_index = 1; + while let Some(recv_pe) = self.do_recv_round(round, recv_pe_index) { + recv_pe_index = recv_pe; + std::thread::yield_now(); + } + round += 1; + } + } + State::RoundInProgress(round, recv_pe) => { + let mut round = round; + let mut recv_pe_index = recv_pe; + while let Some(recv_pe) = self.do_recv_round(round, recv_pe_index) { + recv_pe_index = recv_pe; + std::thread::yield_now(); + } + round += 1; + while round < self.num_rounds { + recv_pe_index = 1; + while let Some(recv_pe) = self.do_recv_round(round, recv_pe_index) { + recv_pe_index = recv_pe; + std::thread::yield_now(); + } + round += 1; + } + } + } + } +} diff --git a/src/lamellar_team.rs b/src/lamellar_team.rs index baa5b98e..2a315ea8 100644 --- a/src/lamellar_team.rs +++ b/src/lamellar_team.rs @@ -720,7 +720,7 @@ pub struct LamellarTeamRT { pub(crate) world_counters: Arc, // can probably remove this? pub(crate) id: usize, sub_team_id_cnt: AtomicUsize, - barrier: Barrier, + pub(crate) barrier: Barrier, dropped: MemoryRegion, pub(crate) remote_ptr_addr: usize, pub(crate) team_hash: u64, diff --git a/tests/array/arithmetic_ops/add_test.rs b/tests/array/arithmetic_ops/add_test.rs index cef0d7f1..96f7bfe7 100644 --- a/tests/array/arithmetic_ops/add_test.rs +++ b/tests/array/arithmetic_ops/add_test.rs @@ -7,25 +7,27 @@ use rand::seq::SliceRandom; macro_rules! 
initialize_array { (UnsafeArray,$array:ident,$init_val:ident) => { #[allow(unused_unsafe)] - let _ = unsafe { $array.dist_iter_mut().for_each(move |x| *x = $init_val) }; - $array.wait_all(); - $array.barrier(); + unsafe { + $array + .dist_iter_mut() + .blocking_for_each(move |x| *x = $init_val) + }; }; (AtomicArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter().for_each(move |x| x.store($init_val)); - $array.wait_all(); - $array.barrier(); + $array + .dist_iter() + .blocking_for_each(move |x| x.store($init_val)); // println!("----------------------------------------------"); }; (LocalLockArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); - $array.wait_all(); - $array.barrier(); + $array + .dist_iter_mut() + .blocking_for_each(move |x| *x = $init_val); }; (GlobalLockArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); - $array.wait_all(); - $array.barrier(); + $array + .dist_iter_mut() + .blocking_for_each(move |x| *x = $init_val); }; } @@ -56,12 +58,12 @@ macro_rules! check_val { }; } -macro_rules! onesided_iter{ +macro_rules! onesided_iter { (GlobalLockArray,$array:ident) => { $array.blocking_read_lock().onesided_iter() }; ($arraytype:ident,$array:ident) => { - $array.onesided_iter() + $array.onesided_iter() }; } @@ -292,7 +294,7 @@ macro_rules! check_results { $array.wait_all(); $array.barrier(); #[allow(unused_unsafe)] - for (i, elem) in unsafe {onesided_iter!($array_ty,$array).into_iter().enumerate() } { + for (i, elem) in unsafe { onesided_iter!($array_ty, $array).into_iter().enumerate() } { let val = *elem; check_val!($array_ty, val, $num_pes, success); if !success { From e4f700c88a96f20a57b3e98d3adc739ba82d9abf Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Mon, 22 Jul 2024 17:02:39 -0700 Subject: [PATCH 052/116] apply same distiter refactoring to localiter --- src/array/iterator/distributed_iterator.rs | 5 +- .../distributed_iterator/consumer/collect.rs | 16 - .../distributed_iterator/consumer/reduce.rs | 1 - src/array/iterator/local_iterator.rs | 778 +++++++++++++----- .../local_iterator/consumer/collect.rs | 245 +++++- .../iterator/local_iterator/consumer/count.rs | 96 ++- .../local_iterator/consumer/for_each.rs | 415 ++-------- .../local_iterator/consumer/reduce.rs | 135 ++- .../iterator/local_iterator/consumer/sum.rs | 126 ++- src/array/unsafe/iteration/distributed.rs | 243 +----- src/array/unsafe/iteration/local.rs | 349 ++++---- 11 files changed, 1345 insertions(+), 1064 deletions(-) diff --git a/src/array/iterator/distributed_iterator.rs b/src/array/iterator/distributed_iterator.rs index 01caf5cd..8544d140 100644 --- a/src/array/iterator/distributed_iterator.rs +++ b/src/array/iterator/distributed_iterator.rs @@ -38,15 +38,14 @@ pub(crate) use consumer::*; use crate::array::iterator::{private::*, Schedule}; use crate::array::{ - operations::ArrayOps, AsyncTeamFrom, AtomicArray, Distribution, GenericAtomicArray, InnerArray, - LamellarArray, NativeAtomicArray, + operations::ArrayOps, AsyncTeamFrom, Distribution, InnerArray, + LamellarArray, }; use crate::memregion::Dist; use crate::LamellarTeamRT; use crate::active_messaging::SyncSend; -use enum_dispatch::enum_dispatch; use futures_util::Future; use paste::paste; use std::marker::PhantomData; diff --git a/src/array/iterator/distributed_iterator/consumer/collect.rs b/src/array/iterator/distributed_iterator/consumer/collect.rs index 203a93fd..4eecdc56 100644 --- a/src/array/iterator/distributed_iterator/consumer/collect.rs +++ b/src/array/iterator/distributed_iterator/consumer/collect.rs @@ -156,22 +156,6 @@ where } } -//#[doc(hidden)] -// #[pin_project] -// pub struct InnerDistIterCollectHandle< -// T: Dist + ArrayOps, -// A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend, -// > { -// pub(crate) reqs: VecDeque>>, -// pub(crate) distribution: Distribution, -// pub(crate) team: Pin>, -// state: InnerState, -// } -// enum InnerState, Distribution)> + SyncSend> { -// ReqsPending(Vec<(usize, T)>), -// Collecting(Pin>>), -// } - #[pin_project] pub(crate) struct InnerDistIterCollectHandle { pub(crate) reqs: VecDeque>>, diff --git a/src/array/iterator/distributed_iterator/consumer/reduce.rs b/src/array/iterator/distributed_iterator/consumer/reduce.rs index 81650db0..77dbd99c 100644 --- a/src/array/iterator/distributed_iterator/consumer/reduce.rs +++ b/src/array/iterator/distributed_iterator/consumer/reduce.rs @@ -14,7 +14,6 @@ use crate::Dist; use futures_util::{ready, Future, StreamExt}; use pin_project::pin_project; use std::collections::VecDeque; -use std::marker::PhantomData; use std::pin::Pin; use std::sync::Arc; use std::task::{Context, Poll, Waker}; diff --git a/src/array/iterator/local_iterator.rs b/src/array/iterator/local_iterator.rs index d8dfabe8..e09ba9c7 100644 --- a/src/array/iterator/local_iterator.rs +++ b/src/array/iterator/local_iterator.rs @@ -36,7 +36,7 @@ pub(crate) use consumer::*; use crate::array::iterator::{private::*, Schedule}; use crate::array::{ - operations::ArrayOps, AsyncTeamFrom, AtomicArray, Distribution, InnerArray, LamellarArray, + operations::ArrayOps, AsyncTeamFrom, Distribution, InnerArray, LamellarArray, }; use crate::memregion::Dist; use crate::LamellarTeamRT; @@ -48,130 +48,98 @@ use futures_util::Future; use std::marker::PhantomData; use 
std::pin::Pin; use std::sync::Arc; +use paste::paste; + +macro_rules! consumer_impl { + ($name:ident<$($generics:ident),*>($($arg:ident : $arg_ty:ty),*); [$($return_type: tt)*]; [$($bounds:tt)+] ; [$(-> $($blocking_ret:tt)*)? ]) => { + fn $name<$($generics),*>(&self, $($arg : $arg_ty),*) -> $($return_type)* + where + $($bounds)+ + { + self.as_inner().$name($($arg),*) + } -#[doc(hidden)] -#[enum_dispatch] -pub trait LocalIteratorLauncher: InnerArray { - fn for_each(&self, iter: &I, op: F) -> LocalIterForEachHandle - where - I: LocalIterator + 'static, - F: Fn(I::Item) + SyncSend + Clone + 'static, - { - self.as_inner().for_each(iter, op) - } - - fn for_each_with_schedule( - &self, - sched: Schedule, - iter: &I, - op: F, - ) -> LocalIterForEachHandle - where - I: LocalIterator + 'static, - F: Fn(I::Item) + SyncSend + Clone + 'static, - { - self.as_inner().for_each_with_schedule(sched, iter, op) - } - - fn for_each_async(&self, iter: &I, op: F) -> LocalIterForEachHandle - where - I: LocalIterator + 'static, - F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, - Fut: Future + Send + 'static, - { - self.as_inner().for_each_async(iter, op) - } - - fn for_each_async_with_schedule( - &self, - sched: Schedule, - iter: &I, - op: F, - ) -> LocalIterForEachHandle - where - I: LocalIterator + 'static, - F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, - Fut: Future + Send + 'static, - { - self.as_inner() - .for_each_async_with_schedule(sched, iter, op) - } - - fn reduce(&self, iter: &I, op: F) -> LocalIterReduceHandle - where - I: LocalIterator + 'static, - I::Item: SyncSend, - F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, - { - self.as_inner().reduce(iter, op) - } - - fn reduce_with_schedule( - &self, - sched: Schedule, - iter: &I, - op: F, - ) -> LocalIterReduceHandle - where - I: LocalIterator + 'static, - I::Item: SyncSend, - F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, - { - self.as_inner().reduce_with_schedule(sched, iter, op) - } - - fn collect(&self, iter: &I, d: Distribution) -> LocalIterCollectHandle - where - I: LocalIterator + 'static, - I::Item: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - self.as_inner().collect(iter, d) - } - - fn collect_with_schedule( - &self, - sched: Schedule, - iter: &I, - d: Distribution, - ) -> LocalIterCollectHandle - where - I: LocalIterator + 'static, - I::Item: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - self.as_inner().collect_with_schedule(sched, iter, d) - } + paste! { + fn [<$name _with_schedule >]<$($generics),*>( + &self, + sched: Schedule, + $($arg : $arg_ty),* + ) -> $($return_type)* + where + $($bounds)+ + { + self.as_inner().[<$name _with_schedule>](sched, $($arg),*) + } - fn count(&self, iter: &I) -> LocalIterCountHandle - where - I: LocalIterator + 'static, - { - self.as_inner().count(iter) - } + fn []<$($generics),*>( + &self, + $($arg : $arg_ty),* + ) $(-> $($blocking_ret)*)? + where + $($bounds)+ + { + self.as_inner().[]($($arg),*) + } - fn count_with_schedule(&self, sched: Schedule, iter: &I) -> LocalIterCountHandle - where - I: LocalIterator + 'static, - { - self.as_inner().count_with_schedule(sched, iter) - } + fn []<$($generics),*>( + &self, + sched: Schedule, + $($arg : $arg_ty),* + ) $(-> $($blocking_ret)*)? 
+ where + $($bounds)+ + { + self.as_inner().[](sched, $($arg),*) + } + } + }; +} - fn sum(&self, iter: &I) -> LocalIterSumHandle - where - I: LocalIterator + 'static, - I::Item: SyncSend + std::iter::Sum, - { - self.as_inner().sum(iter) - } +#[doc(hidden)] +#[enum_dispatch] +pub trait LocalIteratorLauncher: InnerArray { - fn sum_with_schedule(&self, sched: Schedule, iter: &I) -> LocalIterSumHandle - where - I: LocalIterator + 'static, - I::Item: SyncSend + std::iter::Sum, - { - self.as_inner().sum_with_schedule(sched, iter) - } + consumer_impl!( + for_each(iter: &I, op: F); + [LocalIterForEachHandle]; + [I: LocalIterator + 'static, F: Fn(I::Item) + SyncSend + Clone + 'static]; + [] + ); + consumer_impl!( + for_each_async(iter: &I, op: F); + [LocalIterForEachHandle]; + [I: LocalIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, Fut: Future + Send + 'static]; + []); + + consumer_impl!( + reduce(iter: &I, op: F); + [LocalIterReduceHandle]; + [I: LocalIterator + 'static, I::Item: SyncSend + Copy, F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static]; + [-> Option]); + + consumer_impl!( + collect(iter: &I, d: Distribution); + [LocalIterCollectHandle]; + [I: LocalIterator + 'static, I::Item: Dist + ArrayOps, A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static]; + [-> A]); + + consumer_impl!( + collect_async(iter: &I, d: Distribution); + [LocalIterCollectHandle]; + [I: LocalIterator + 'static, I::Item: Future + Send + 'static,B: Dist + ArrayOps,A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static,]; + [-> A]); + + consumer_impl!( + count(iter: &I); + [LocalIterCountHandle]; + [I: LocalIterator + 'static ]; + [-> usize]); + + consumer_impl!( + sum(iter: &I); + [LocalIterSumHandle]; + [I: LocalIterator + 'static, I::Item: SyncSend + std::iter::Sum + for<'a> std::iter::Sum<&'a I::Item> , ]; + [-> I::Item]); //#[doc(hidden)] fn local_global_index_from_local(&self, index: usize, chunk_size: usize) -> Option { @@ -211,18 +179,18 @@ pub trait LocalIteratorLauncher: InnerArray { /// Additonaly functionality can be found in the [IndexedLocalIterator] trait: /// these methods are only available for local iterators where the number of elements is known in advance (e.g. after invoking `filter` these methods would be unavailable) pub trait LocalIterator: SyncSend + IterClone + 'static { - /// The type of item this distributed iterator produces + /// The type of item this local iterator produces type Item: Send; - /// The array to which this distributed iterator was created from + /// The array to which this local iterator was created from type Array: LocalIteratorLauncher; - /// Internal method used to initalize this distributed iterator to the correct element and correct length. + /// Internal method used to initalize this local iterator to the correct element and correct length. /// /// Because we know the number of elements of the array on each PE we can specify the index to start from. 
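For reference, the `consumer_impl!` macro above collapses what were previously four hand-written launcher methods per consumer into a single declaration. Judging by the call sites later in this patch, each invocation appears to expand to the plain launcher, a `_with_schedule` variant, and the two new `blocking_` variants; the exact generated names and signatures below are an assumption inferred from those call sites, shown here for `for_each` only:

```
// Sketch only: an approximate hand-expansion of
//   consumer_impl!(for_each(iter: &I, op: F); [LocalIterForEachHandle]; [...]; []);
// The generated names (for_each_with_schedule, blocking_for_each,
// blocking_for_each_with_schedule) are inferred from later call sites.
fn for_each<I, F>(&self, iter: &I, op: F) -> LocalIterForEachHandle
where
    I: LocalIterator + 'static,
    F: Fn(I::Item) + SyncSend + Clone + 'static,
{
    // future-returning flavor: forward to the inner array implementation
    self.as_inner().for_each(iter, op)
}

fn for_each_with_schedule<I, F>(&self, sched: Schedule, iter: &I, op: F) -> LocalIterForEachHandle
where
    I: LocalIterator + 'static,
    F: Fn(I::Item) + SyncSend + Clone + 'static,
{
    self.as_inner().for_each_with_schedule(sched, iter, op)
}

fn blocking_for_each<I, F>(&self, iter: &I, op: F)
where
    I: LocalIterator + 'static,
    F: Fn(I::Item) + SyncSend + Clone + 'static,
{
    // blocking flavor: returns only once the iteration has completed
    self.as_inner().blocking_for_each(iter, op)
}

fn blocking_for_each_with_schedule<I, F>(&self, sched: Schedule, iter: &I, op: F)
where
    I: LocalIterator + 'static,
    F: Fn(I::Item) + SyncSend + Clone + 'static,
{
    self.as_inner().blocking_for_each_with_schedule(sched, iter, op)
}
```

Under that reading, the macro keeps the future-returning and blocking flavors in lockstep for every consumer (for_each, for_each_async, reduce, collect, collect_async, count, sum) without repeating the trait bounds by hand.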
fn init(&self, start_i: usize, cnt: usize) -> Self; - /// Return the original array this distributed iterator belongs too + /// Return the original array this local iterator belongs too fn array(&self) -> Self::Array; /// Return the next element in the iterator, otherwise return None @@ -389,6 +357,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// .for_each(move |elem| println!("{:?} {elem}",std::thread::current().id())) /// ); ///``` + #[must_use] fn for_each(&self, op: F) -> LocalIterForEachHandle where F: Fn(Self::Item) + SyncSend + Clone + 'static, @@ -396,7 +365,33 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { self.array().for_each(self, op) } - /// Calls a closure on each element of a Local Iterator in parallel on the calling PE (the PE must have some local data of the array) using the specififed schedule policy. + /// Calls a closure on each element of a Local Iterator in parallel on the calling PE (the PE must have some local data of the array). + /// + /// This call utilizes the [Schedule::Static][crate::array::iterator::Schedule] policy. + /// + /// The iteration will be complete upon return from this function + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// + /// + /// array + /// .local_iter() + /// .blocking_for_each(move |elem| println!("{:?} {elem}",std::thread::current().id())); + /// + ///``` + fn blocking_for_each(&self, op: F) + where + F: Fn(Self::Item) + SyncSend + Clone + 'static, + { + self.array().blocking_for_each(self, op) + } + + /// Calls a closure on each element of a Local Iterator in parallel on the calling PE (the PE must have some local data of the array) using the specififed [Scehedule][crate::array::iterator::Schedule] policy. /// /// This function returns a future which can be used to poll for completion of the iteration. /// Note calling this function launches the iteration regardless of if the returned future is used or not. @@ -411,6 +406,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// array.local_iter().for_each_with_schedule(Schedule::WorkStealing, |elem| println!("{:?} {elem}",std::thread::current().id())); /// array.wait_all(); ///``` + #[must_use] fn for_each_with_schedule(&self, sched: Schedule, op: F) -> LocalIterForEachHandle where F: Fn(Self::Item) + SyncSend + Clone + 'static, @@ -418,6 +414,26 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { self.array().for_each_with_schedule(sched, self, op) } + /// Calls a closure on each element of a Local Iterator in parallel on the calling PE (the PE must have some local data of the array) using the specififed [Scehedule][crate::array::iterator::Schedule] policy. 
+ /// + /// The iteration will be complete upon return from this function + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// + /// array.local_iter().blocking_for_each_with_schedule(Schedule::WorkStealing, |elem| println!("{:?} {elem}",std::thread::current().id())); + ///``` + fn blocking_for_each_with_schedule(&self, sched: Schedule, op: F) + where + F: Fn(Self::Item) + SyncSend + Clone + 'static, + { + self.array().blocking_for_each_with_schedule(sched, self, op) + } + /// Calls a closure and immediately awaits the result on each element of a Local Iterator in parallel on the calling PE (the PE must have some local data of the array). /// /// This call utilizes the [Schedule::Static][crate::array::iterator::Schedule] policy. @@ -448,6 +464,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// fut.await; /// } ///``` + #[must_use] fn for_each_async(&self, op: F) -> LocalIterForEachHandle where F: Fn(Self::Item) -> Fut + SyncSend + Clone + 'static, @@ -456,7 +473,44 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { self.array().for_each_async(self, op) } - /// Calls a closure on each element of a Local Iterator in parallel on the calling PE (the PE must have some local data of the array) using the specififed schedule policy. + /// Calls a closure and immediately awaits the result on each element of a Local Iterator in parallel on the calling PE (the PE must have some local data of the array). + /// + /// This call utilizes the [Schedule::Static][crate::array::iterator::Schedule] policy. + /// + /// The supplied closure must return a future. + /// + /// Each thread will only drive a single future at a time. + /// + /// The iteration will have been completed by the time this function returns + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// + /// array.local_iter().blocking_for_each_async(|elem| async move { + /// async_std::task::yield_now().await; + /// println!("{:?} {elem}",std::thread::current().id()) + /// }); + /// ``` + /// essentially the for_each_async call gets converted into (on each thread) + ///```ignore + /// for fut in array.iter(){ + /// fut.await; + /// } + ///``` + fn blocking_for_each_async(&self, op: F) + where + F: Fn(Self::Item) -> Fut + SyncSend + Clone + 'static, + Fut: Future + Send + 'static, + { + self.array().blocking_for_each_async(self, op) + } + + + /// Calls a closure on each element of a Local Iterator in parallel on the calling PE (the PE must have some local data of the array) using the specififed [Schedule][crate::array::iterator::Schedule] policy. /// /// The supplied closure must return a future. 
/// @@ -478,6 +532,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// }); /// array.wait_all(); ///``` + #[must_use] fn for_each_async_with_schedule(&self, sched: Schedule, op: F) -> LocalIterForEachHandle where F: Fn(Self::Item) -> Fut + SyncSend + Clone + 'static, @@ -486,9 +541,37 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { self.array().for_each_async_with_schedule(sched, self, op) } + /// Calls a closure on each element of a Local Iterator in parallel on the calling PE (the PE must have some local data of the array) using the specififed [Schedule][crate::array::iterator::Schedule] policy. + /// + /// The supplied closure must return a future. + /// + /// Each thread will only drive a single future at a time. + /// + /// The iteration will have been completed by the time this function returns + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// + /// array.local_iter().blocking_for_each_async_with_schedule(Schedule::Chunk(10),|elem| async move { + /// async_std::task::yield_now().await; + /// println!("{:?} {elem}",std::thread::current().id()) + /// }); + ///``` + fn blocking_for_each_async_with_schedule(&self, sched: Schedule, op: F) + where + F: Fn(Self::Item) -> Fut + SyncSend + Clone + 'static, + Fut: Future + Send + 'static, + { + self.array().blocking_for_each_async_with_schedule(sched, self, op) + } + /// Reduces the elements of the local iterator using the provided closure /// - /// This function returns a future which needs to be driven to completion to retrieve the new container. + /// This function returns a future which needs to be driven to completion to retrieve the reduced value. /// /// # Examples ///``` @@ -500,10 +583,11 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// let req = array.local_iter().reduce(|acc,elem| acc+elem); /// let sum = array.block_on(req); //wait on the collect request to get the new array ///``` + #[must_use] fn reduce(&self, op: F) -> LocalIterReduceHandle where // &'static Self: LocalIterator + 'static, - Self::Item: SyncSend, + Self::Item: SyncSend + Copy, F: Fn(Self::Item, Self::Item) -> Self::Item + SyncSend + Clone + 'static, { self.array().reduce(self, op) @@ -511,7 +595,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// Reduces the elements of the local iterator using the provided closure /// - /// This function returns a future which needs to be driven to completion to retrieve the new container. 
+ /// This function returns the reduced value /// /// # Examples ///``` @@ -520,9 +604,32 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// let world = LamellarWorldBuilder::new().build(); /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); /// - /// let req = array.local_iter().reduce(|acc,elem| acc+elem); + /// let sum = array.blocking_local_iter().reduce(|acc,elem| acc+elem); + ///``` + fn blocking_reduce(&self, op: F) -> Option + where + // &'static Self: LocalIterator + 'static, + Self::Item: SyncSend + Copy, + F: Fn(Self::Item, Self::Item) -> Self::Item + SyncSend + Clone + 'static, + { + self.array().blocking_reduce(self, op) + } + + /// Reduces the elements of the local iterator using the provided closure and specififed [Schedule][crate::array::iterator::Schedule] policy + /// + /// This function returns a future which needs to be driven to completion to retrieve the reduced value. + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// + /// let req = array.local_iter().reduce_with_schedule(Schedule::Chunk(10),|acc,elem| acc+elem); /// let sum = array.block_on(req); //wait on the collect request to get the new array ///``` + #[must_use] fn reduce_with_schedule( &self, sched: Schedule, @@ -530,35 +637,37 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { ) -> LocalIterReduceHandle where // &'static Self: LocalIterator + 'static, - Self::Item: SyncSend, + Self::Item: SyncSend + Copy, F: Fn(Self::Item, Self::Item) -> Self::Item + SyncSend + Clone + 'static, { self.array().reduce_with_schedule(sched, self, op) } - // /// Reduces the elements of the local iterator using the provided closure - // /// - // /// This function returns a future which needs to be driven to completion to retrieve the new container. 
- // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// - // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - // /// - // /// let req = array.local_iter().reduce(|acc,elem| acc+elem); - // /// let sum = array.block_on(req); //wait on the collect request to get the new array - // ///``` - // fn reduce_async(&self, op: F) -> Pin + Send>> - // where - // // &'static Self: LocalIterator + 'static, - // Self::Item: SyncSend, - // F: Fn(Self::Item,Self::Item) -> Fut + SyncSend + Clone + 'static, - // Fut: Future + SyncSend + Clone + 'static, - // { - // self.array().reduce_async(self, op) - // } + /// Reduces the elements of the local iterator using the provided closure and specififed [Schedule][crate::array::iterator::Schedule] policy + /// + /// This function returns the reduced value + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// + /// let sum = array.local_iter().blocking_reduce_with_schedule(Schedule::Chunk(10),|acc,elem| acc+elem); + ///``` + fn blocking_reduce_with_schedule( + &self, + sched: Schedule, + op: F, + ) -> Option + where + // &'static Self: LocalIterator + 'static, + Self::Item: SyncSend + Copy, + F: Fn(Self::Item, Self::Item) -> Self::Item + SyncSend + Clone + 'static, + { + self.array().blocking_reduce_with_schedule(sched, self, op) + } /// Collects the elements of the local iterator into the specified container type /// @@ -575,6 +684,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// let req = array.local_iter().map(elem.load()).filter(|elem| elem % 2 == 0).collect::>(Distribution::Cyclic); /// let new_array = array.block_on(req); ///``` + #[must_use] fn collect(&self, d: Distribution) -> LocalIterCollectHandle where // &'static Self: LocalIterator + 'static, @@ -586,6 +696,29 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// Collects the elements of the local iterator into the specified container type /// + /// This function returns the new container + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); + /// + /// let array_clone = array.clone(); + /// let new_array = array.local_iter().map(elem.load()).filter(|elem| elem % 2 == 0).blocking_collect::>(Distribution::Cyclic); + ///``` + fn blocking_collect(&self, d: Distribution) ->A + where + // &'static Self: LocalIterator + 'static, + Self::Item: Dist + ArrayOps, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, + { + self.array().blocking_collect(self, d) + } + + /// Collects the elements of the local iterator into the specified container type using the specified [Schedule][crate::array::iterator::Schedule] policy + /// /// This function returns a future which needs to be driven to completion to retrieve the new container. 
/// /// # Examples @@ -599,6 +732,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// let req = array.local_iter().map(elem.load()).filter(|elem| elem % 2 == 0).collect_with_schedule::>(Scheduler::WorkStealing,Distribution::Cyclic); /// let new_array = array.block_on(req); ///``` + #[must_use] fn collect_with_schedule( &self, sched: Schedule, @@ -612,36 +746,219 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { self.array().collect_with_schedule(sched, self, d) } - // /// Collects the elements of the local iterator into the specified container type - // /// Each element from the iterator must return a Future - // /// - // /// Each thread will only drive a single future at a time. - // /// - // /// This function returns a future which needs to be driven to completion to retrieve the new container. - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// - // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); - // /// - // /// let array_clone = array.clone(); - // /// let req = array.dist_iter().map(|elem| array_clone.fetch_add(elem,1000)).collect_async::>(Distribution::Cyclic); - // /// let new_vec = array.block_on(req); - // fn collect_async(&self, d: Distribution) -> Pin + Send>> - // where - // // &'static Self: LocalIterator + 'static, - // T: Dist, - // Self::Item: Future + SyncSend + Clone + 'static, - // A: From::Output>> + SyncSend + Clone + 'static, - // { - // self.array().collect_async(self, d) - // } + /// Collects the elements of the local iterator into the specified container type using the specified [Schedule][crate::array::iterator::Schedule] policy + /// + /// This function returns the new container + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); + /// + /// let array_clone = array.clone(); + /// let new_array = array.local_iter().map(elem.load()).filter(|elem| elem % 2 == 0).blocking_collect_with_schedule::>(Scheduler::WorkStealing,Distribution::Cyclic); + /// + ///`` + fn blocking_collect_with_schedule( + &self, + sched: Schedule, + d: Distribution, + ) -> A + where + // &'static Self: LocalIterator + 'static, + Self::Item: Dist + ArrayOps, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, + { + self.array().blocking_collect_with_schedule(sched, self, d) + } + + /// Collects the awaited elements of the local iterator into a new LamellarArray + /// + /// Calling this function invokes an implicit barrier across all PEs in the Array. + /// + /// Each element from the iterator must return a Future + /// + /// Each thread will only drive a single future at a time. + /// + /// This function returns a future which needs to be driven to completion to retrieve the new LamellarArray. + /// Calling await on the future will invoke an implicit barrier (allocating the resources for a new array). + /// + /// Creating the new array potentially results in data transfers depending on the distribution mode and the fact there is no gaurantee + /// that each PE will contribute an equal number of elements to the new array, and currently LamellarArrays + /// distribute data across the PEs as evenly as possible. 
+ /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// // initialize a world and an atomic array + /// let world = LamellarWorldBuilder::new().build(); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); + /// + /// // clone the array; this doesn't duplicate the underlying + /// // data but it does create a second pointer that we can + /// // discard when necessary + /// let array_clone = array.clone(); + /// + /// // run collect + /// let req + /// = array_clone.local_iter().map( + /// move |elem| + /// array_clone + /// .fetch_add(elem.load(),1000)) + /// .collect_async::,_>(Distribution::Cyclic); + /// let _new_array = array.block_on(req); + ///``` + #[must_use] + fn collect_async(&self, d: Distribution) -> LocalIterCollectHandle + where + // &'static Self: DistributedIterator + 'static, + T: Dist + ArrayOps, + Self::Item: Future + Send + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, + { + self.array().collect_async(self, d) + } + + /// Collects the awaited elements of the local iterator into a new LamellarArray + /// + /// Calling this function invokes an implicit barrier across all PEs in the Array. + /// + /// Each element from the iterator must return a Future + /// + /// Each thread will only drive a single future at a time. + /// + /// The function returns the new LamellarArray upon completion. + /// + /// Creating the new array potentially results in data transfers depending on the distribution mode and the fact there is no gaurantee + /// that each PE will contribute an equal number of elements to the new array, and currently LamellarArrays + /// distribute data across the PEs as evenly as possible. + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// // initialize a world and an atomic array + /// let world = LamellarWorldBuilder::new().build(); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); + /// + /// // clone the array; this doesn't duplicate the underlying + /// // data but it does create a second pointer that we can + /// // discard when necessary + /// let array_clone = array.clone(); + /// + /// // run collect + /// let _new_array + /// = array_clone.local_iter().map( + /// move |elem| + /// array_clone + /// .fetch_add(elem.load(),1000)) + /// .blocking_collect_async::,_>(Distribution::Cyclic); + ///``` + fn blocking_collect_async(&self, d: Distribution) -> A + where + // &'static Self: DistributedIterator + 'static, + T: Dist + ArrayOps, + Self::Item: Future + Send + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, + { + self.array().blocking_collect_async(self, d) + } + + /// Collects the awaited elements of the local iterator into a new LamellarArray, using the provided [Schedule][crate::array::iterator::Schedule] policy + /// + /// Calling this function invokes an implicit barrier across all PEs in the Array. + /// + /// Each element from the iterator must return a Future + /// + /// Each thread will only drive a single future at a time. + /// + /// This function returns a future which needs to be driven to completion to retrieve the new LamellarArray. + /// Calling await on the future will invoke an implicit barrier (allocating the resources for a new array). 
+ /// + /// Creating the new array potentially results in data transfers depending on the distribution mode and the fact there is no gaurantee + /// that each PE will contribute an equal number of elements to the new array, and currently LamellarArrays + /// distribute data across the PEs as evenly as possible. + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// // initialize a world and an atomic array + /// let world = LamellarWorldBuilder::new().build(); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); + /// + /// // clone the array; this doesn't duplicate the underlying + /// // data but it does create a second pointer that we can + /// // discard when necessary + /// let array_clone = array.clone(); + /// + /// // run collect + /// let req + /// = array_clone.local_iter().map( + /// move |elem| + /// array_clone + /// .fetch_add(elem.load(),1000)) + /// .collect_async_with_schedule::,_>(Scheduler::Dynamic, Distribution::Cyclic); + /// let _new_array = array.block_on(req); + ///``` + #[must_use] + fn collect_async_with_schedule(&self, sched: Schedule, d: Distribution) -> LocalIterCollectHandle + where + // &'static Self: DistributedIterator + 'static, + T: Dist + ArrayOps, + Self::Item: Future + Send + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, + { + self.array().collect_async_with_schedule(sched, self, d) + } + + /// Collects the awaited elements of the local iterator into a new LamellarArray,using the provided [Schedule][crate::array::iterator::Schedule] policy + /// + /// Calling this function invokes an implicit barrier across all PEs in the Array. + /// + /// Each element from the iterator must return a Future + /// + /// Each thread will only drive a single future at a time. + /// + /// The function returns the new LamellarArray upon completion. + /// + /// Creating the new array potentially results in data transfers depending on the distribution mode and the fact there is no gaurantee + /// that each PE will contribute an equal number of elements to the new array, and currently LamellarArrays + /// distribute data across the PEs as evenly as possible. + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// // initialize a world and an atomic array + /// let world = LamellarWorldBuilder::new().build(); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); + /// + /// // clone the array; this doesn't duplicate the underlying + /// // data but it does create a second pointer that we can + /// // discard when necessary + /// let array_clone = array.clone(); + /// + /// // run collect + /// let _new_array + /// = array_clone.local_iter().map( + /// move |elem| + /// array_clone + /// .fetch_add(elem.load(),1000)) + /// .blocking_collect_async::,_>(Distribution::Cyclic); + ///``` + fn blocking_collect_async_with_schedule(&self, sched: Schedule, d: Distribution) -> A + where + // &'static Self: DistributedIterator + 'static, + T: Dist + ArrayOps, + Self::Item: Future + Send + 'static, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, + { + self.array().blocking_collect_async_with_schedule(sched,self, d) + } /// Counts the number of the elements of the local iterator /// - /// This function returns a future which needs to be driven to completion to retrieve the new container. 
+ /// This function returns a future which needs to be driven to completion to retrieve the number of elements in the local iterator /// /// # Examples ///``` @@ -653,13 +970,31 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// let req = array.local_iter().count(); /// let cnt = array.block_on(req); ///``` + #[must_use] fn count(&self) -> LocalIterCountHandle { self.array().count(self) } /// Counts the number of the elements of the local iterator /// - /// This function returns a future which needs to be driven to completion to retrieve the new container. + /// This returns the number of elements in the local iterator + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// + /// let cnt = array.local_iter().blocking_count(); + ///``` + fn blocking_count(&self) -> usize { + self.array().blocking_count(self) + } + + /// Counts the number of the elements of the local iterator using the provided [Schedule][crate::array::iterator::Schedule] policy + /// + /// This function returns a future which needs to be driven to completion to retrieve the number of elements in the local iterator /// /// # Examples ///``` @@ -668,13 +1003,32 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// let world = LamellarWorldBuilder::new().build(); /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); /// - /// let req = array.local_iter().count_with_schedule(Scheduler::Dynamic); + /// let req = array.local_iter().count_with_schedule(Schedule::Dynamic); /// let cnt = array.block_on(req); ///``` + #[must_use] fn count_with_schedule(&self, sched: Schedule) -> LocalIterCountHandle { self.array().count_with_schedule(sched, self) } + /// Counts the number of the elements of the local iterator using the provided [Schedule][crate::array::iterator::Schedule] policy + /// + /// This returns the number of elements in the local iterator + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// + /// let cnt = array.local_iter().blocking_count_with_schedule(Schedule::Dynamic); + ///``` + fn blocking_count_with_schedule(&self, sched: Schedule) -> usize { + self.array().blocking_count_with_schedule(sched, self) + } + + /// Sums the elements of the local iterator. /// /// Takes each element, adds them together, and returns the result. @@ -693,14 +1047,39 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// let req = array.local_iter().sum(); /// let sum = array.block_on(req); //wait on the collect request to get the new array ///``` + #[must_use] fn sum(&self) -> LocalIterSumHandle where - Self::Item: SyncSend + std::iter::Sum, + Self::Item: SyncSend + std::iter::Sum + for<'a> std::iter::Sum<&'a Self::Item>, { self.array().sum(self) } - /// Sums the elements of the local iterator, using the specified schedule + /// Sums the elements of the local iterator. + /// + /// Takes each element, adds them together, and returns the result. + /// + /// An empty iterator returns the zero value of the type. + /// + /// This function the sum upon completion. 
+ /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// + /// let req = array.local_iter().blocking_sum(); + ///``` + fn blocking_sum(&self) -> Self::Item + where + Self::Item: SyncSend + std::iter::Sum + for<'a> std::iter::Sum<&'a Self::Item>, + { + self.array().blocking_sum(self) + } + + /// Sums the elements of the local iterator, using the specified [Schedule][crate::array::iterator::Schedule] policy /// /// Takes each element, adds them together, and returns the result. /// @@ -718,12 +1097,37 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// let req = array.local_iter().sum_with_schedule(Schedule::Guided); /// let sum = array.block_on(req); ///``` + #[must_use] fn sum_with_schedule(&self, sched: Schedule) -> LocalIterSumHandle where - Self::Item: SyncSend + std::iter::Sum, + Self::Item: SyncSend + std::iter::Sum + for<'a> std::iter::Sum<&'a Self::Item>, { self.array().sum_with_schedule(sched, self) } + + /// Sums the elements of the local iterator, using the specified [Schedule][crate::array::iterator::Schedule] policy + /// + /// Takes each element, adds them together, and returns the result. + /// + /// An empty iterator returns the zero value of the type. + /// + /// This function returns the sum upon completion. + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// + /// let sum = array.local_iter().blocking_sum_with_schedule(Schedule::Guided); + ///``` + fn blocking_sum_with_schedule(&self, sched: Schedule) -> Self::Item + where + Self::Item: SyncSend + std::iter::Sum + for<'a> std::iter::Sum<&'a Self::Item>, + { + self.array().blocking_sum_with_schedule(sched, self) + } } /// An interface for dealing with local iterators which are indexable, meaning it returns an iterator of known length @@ -929,7 +1333,7 @@ pub trait IndexedLocalIterator: LocalIterator + SyncSend + IterClone + 'static { /// let my_pe = world.my_pe(); /// /// //initalize array_B - /// array_B.dist_iter_mut().enumerate().for_each(move|(i,elem)| *elem = i); + /// array_B.local_iter_mut().enumerate().for_each(move|(i,elem)| *elem = i); /// array_B.wait_all(); /// /// array_A.local_iter().zip(array_B.local_iter()).for_each(move|(elem_A,elem_B)| println!("PE: {my_pe} A: {elem_A} B: {elem_B}")); diff --git a/src/array/iterator/local_iterator/consumer/collect.rs b/src/array/iterator/local_iterator/consumer/collect.rs index 221747d2..0c0e35da 100644 --- a/src/array/iterator/local_iterator/consumer/collect.rs +++ b/src/array/iterator/local_iterator/consumer/collect.rs @@ -3,6 +3,7 @@ use crate::array::iterator::consumer::*; use crate::array::iterator::local_iterator::{LocalIterator, Monotonic}; use crate::array::iterator::private::*; use crate::array::operations::ArrayOps; +use crate::array::r#unsafe::private::UnsafeArrayInner; use crate::array::{AsyncTeamFrom, AsyncTeamInto, Distribution, TeamInto}; use crate::lamellar_request::LamellarRequest; use crate::lamellar_task_group::TaskGroupLocalAmHandle; @@ -43,7 +44,7 @@ where type AmOutput = Vec<(usize, I::Item)>; type Output = A; type Item = (usize, I::Item); - type Handle = LocalIterCollectHandle; + type Handle = InnerLocalIterCollectHandle; fn init(&self, start: usize, cnt: usize) -> Self { Collect { iter: 
self.iter.init(start, cnt), @@ -66,11 +67,11 @@ where team: Pin>, reqs: VecDeque>, ) -> Self::Handle { - LocalIterCollectHandle { + InnerLocalIterCollectHandle { reqs, distribution: self.distribution, team, - state: State::ReqsPending(Vec::new()), + state: InnerState::ReqsPending(Vec::new()), } } fn max_elems(&self, in_elems: usize) -> usize { @@ -78,25 +79,99 @@ where } } +#[derive(Debug)] +pub(crate) struct CollectAsync { + pub(crate) iter: Monotonic, + pub(crate) distribution: Distribution, + pub(crate) _phantom: PhantomData<(A, B)>, +} + +impl IterClone for CollectAsync { + fn iter_clone(&self, _: Sealed) -> Self { + CollectAsync { + iter: self.iter.iter_clone(Sealed), + distribution: self.distribution.clone(), + _phantom: self._phantom.clone(), + } + } +} + +impl IterConsumer for CollectAsync +where + I: LocalIterator, + I::Item: Future + Send + 'static, + B: Dist + ArrayOps, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, +{ + type AmOutput = Vec<(usize, B)>; + type Output = A; + type Item = (usize, I::Item); + type Handle = InnerLocalIterCollectHandle; + fn init(&self, start: usize, cnt: usize) -> Self { + CollectAsync { + iter: self.iter.init(start, cnt), + distribution: self.distribution.clone(), + _phantom: self._phantom.clone(), + } + } + fn next(&mut self) -> Option { + self.iter.next() + } + fn into_am(&self, schedule: IterSchedule) -> LamellarArcLocalAm { + Arc::new(CollectAsyncAm { + iter: self.iter_clone(Sealed), + schedule, + }) + } + fn create_handle( + self, + team: Pin>, + reqs: VecDeque>, + ) -> Self::Handle { + InnerLocalIterCollectHandle { + reqs, + distribution: self.distribution, + team, + state: InnerState::ReqsPending(Vec::new()), + } + } + fn max_elems(&self, in_elems: usize) -> usize { + self.iter.elems(in_elems) + } +} + +impl Clone for CollectAsync +where + I: LocalIterator + Clone, + I::Item: Future + Send + 'static, + B: Dist + ArrayOps, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, +{ + fn clone(&self) -> Self { + CollectAsync { + iter: self.iter.clone(), + distribution: self.distribution.clone(), + _phantom: self._phantom.clone(), + } + } +} + //#[doc(hidden)] #[pin_project] -pub struct LocalIterCollectHandle< - T: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend, -> { +pub(crate) struct InnerLocalIterCollectHandle { pub(crate) reqs: VecDeque>>, pub(crate) distribution: Distribution, pub(crate) team: Pin>, - state: State, + state: InnerState, } -enum State, Distribution)> + SyncSend> { +enum InnerState { ReqsPending(Vec<(usize, T)>), - Collecting(Pin>>), + Collecting(Pin + Send>>), } impl, Distribution)> + SyncSend + 'static> - LocalIterCollectHandle + InnerLocalIterCollectHandle { async fn async_create_array( local_vals: Vec, @@ -113,13 +188,13 @@ impl, Distribution)> + SyncSend + ' } impl, Distribution)> + SyncSend + 'static> Future - for LocalIterCollectHandle + for InnerLocalIterCollectHandle { type Output = A; fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { let mut this = self.project(); match &mut this.state { - State::ReqsPending(vals) => { + InnerState::ReqsPending(vals) => { while let Some(mut req) = this.reqs.pop_front() { if !req.ready_or_set_waker(cx.waker()) { this.reqs.push_front(req); @@ -140,12 +215,12 @@ impl, Distribution)> + SyncSend + ' return Poll::Ready(a); } Poll::Pending => { - *this.state = State::Collecting(collect); + *this.state = InnerState::Collecting(collect); return Poll::Pending; } } } - State::Collecting(collect) => { + 
InnerState::Collecting(collect) => { let a = ready!(Future::poll(collect.as_mut(), cx)); return Poll::Ready(a); } @@ -154,7 +229,7 @@ impl, Distribution)> + SyncSend + ' } impl, Distribution)> + SyncSend + 'static> - LamellarRequest for LocalIterCollectHandle + LamellarRequest for InnerLocalIterCollectHandle { fn blocking_wait(mut self) -> Self::Output { // let mut num_local_vals = 0; @@ -188,6 +263,91 @@ impl, Distribution)> + SyncSend + ' self.create_array(local_vals) } } +#[pin_project] +pub struct LocalIterCollectHandle { + team: Pin>, + #[pin] + state: State, +} + +impl LocalIterCollectHandle +where + T: Dist + ArrayOps, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + 'static, +{ + pub(crate) fn new( + inner: Pin> + Send>>, + array: &UnsafeArrayInner, + ) -> Self { + Self { + team: array.data.team.clone(), + state: State::Init(inner), + } + } +} + +#[pin_project(project = StateProj)] +enum State { + Init(Pin> + Send>>), + Reqs(#[pin] InnerLocalIterCollectHandle), +} +impl Future for LocalIterCollectHandle +where + T: Dist + ArrayOps, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + 'static, +{ + type Output = A; + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let mut this = self.project(); + match this.state.as_mut().project() { + StateProj::Init(inner) => { + let mut inner = ready!(Future::poll(inner.as_mut(), cx)); + match Pin::new(&mut inner).poll(cx) { + Poll::Ready(val) => Poll::Ready(val), + Poll::Pending => { + *this.state = State::Reqs(inner); + Poll::Pending + } + } + } + StateProj::Reqs(inner) => { + let val = ready!(inner.poll(cx)); + Poll::Ready(val) + } + } + } +} + +//#[doc(hidden)] +impl LamellarRequest for LocalIterCollectHandle +where + T: Dist + ArrayOps, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + 'static, +{ + fn blocking_wait(self) -> Self::Output { + match self.state { + State::Init(reqs) => self.team.block_on(reqs).blocking_wait(), + State::Reqs(inner) => inner.blocking_wait(), + } + } + fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { + match &mut self.state { + State::Init(_) => { + waker.wake_by_ref(); + false + } + State::Reqs(inner) => inner.ready_or_set_waker(waker), + } + } + fn val(&self) -> Self::Output { + match &self.state { + State::Init(_reqs) => { + unreachable!("should never be in init state when val is called"); + } + State::Reqs(inner) => inner.val(), + } + } +} #[lamellar_impl::AmLocalDataRT(Clone)] pub(crate) struct CollectAm { @@ -217,27 +377,32 @@ where } } -// #[lamellar_impl::AmLocalDataRT(Clone)] -// pub(crate) struct CollectAsyncAm -// where -// I: LocalIterator, -// { -// pub(crate) iter: I, -// pub(crate) schedule: IterSchedule, -// } - -// #[lamellar_impl::rt_am_local] -// impl LamellarAm for CollectAsyncAm -// where -// I: LocalIterator + 'static, -// I::Item: Sync, -// { -// async fn exec(&self) -> Vec<(usize,I::Item)> { -// let mut iter = self.schedule.monotonic_iter(self.iter.iter_clone(Sealed)); -// let mut res = vec![]; -// while let Some(elem) = iter.next(){ -// res.push(elem.await); -// } -// res -// } -// } +#[lamellar_impl::AmLocalDataRT(Clone)] +pub(crate) struct CollectAsyncAm +where + I: LocalIterator, + I::Item: Future + Send + 'static, + B: Dist + ArrayOps, + A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, +{ + pub(crate) iter: CollectAsync, + pub(crate) schedule: IterSchedule, +} + +#[lamellar_impl::rt_am_local] +impl LamellarAm for CollectAsyncAm +where + I: LocalIterator, + I::Item: Future + Send + 'static, + B: Dist + ArrayOps, + A: AsyncTeamFrom<(Vec, 
Distribution)> + SyncSend + Clone + 'static, +{ + async fn exec(&self) -> Vec<(usize, B)> { + let mut iter = self.schedule.init_iter(self.iter.iter_clone(Sealed)); + let mut res = vec![]; + while let Some((index, elem)) = iter.next() { + res.push((index, elem.await)); + } + res + } +} diff --git a/src/array/iterator/local_iterator/consumer/count.rs b/src/array/iterator/local_iterator/consumer/count.rs index a1f8d191..cde28ba8 100644 --- a/src/array/iterator/local_iterator/consumer/count.rs +++ b/src/array/iterator/local_iterator/consumer/count.rs @@ -1,11 +1,12 @@ use crate::active_messaging::LamellarArcLocalAm; use crate::array::iterator::local_iterator::LocalIterator; use crate::array::iterator::{consumer::*, private::*}; +use crate::array::r#unsafe::private::UnsafeArrayInner; use crate::lamellar_request::LamellarRequest; use crate::lamellar_task_group::TaskGroupLocalAmHandle; use crate::lamellar_team::LamellarTeamRT; -use futures_util::Future; +use futures_util::{ready, Future}; use pin_project::pin_project; use std::collections::VecDeque; use std::pin::Pin; @@ -32,7 +33,7 @@ where type AmOutput = usize; type Output = usize; type Item = I::Item; - type Handle = LocalIterCountHandle; + type Handle = InnerLocalIterCountHandle; fn init(&self, start: usize, cnt: usize) -> Self { Count { iter: self.iter.init(start, cnt), @@ -51,10 +52,10 @@ where self, _team: Pin>, reqs: VecDeque>, - ) -> LocalIterCountHandle { - LocalIterCountHandle { + ) -> InnerLocalIterCountHandle { + InnerLocalIterCountHandle { reqs, - state: State::ReqsPending(0), + state: InnerState::ReqsPending(0), } } fn max_elems(&self, in_elems: usize) -> usize { @@ -64,21 +65,21 @@ where //#[doc(hidden)] #[pin_project] -pub struct LocalIterCountHandle { +pub(crate) struct InnerLocalIterCountHandle { pub(crate) reqs: VecDeque>, - state: State, + state: InnerState, } -enum State { +enum InnerState { ReqsPending(usize), } -impl Future for LocalIterCountHandle { +impl Future for InnerLocalIterCountHandle { type Output = usize; fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { let mut this = self.project(); match &mut this.state { - State::ReqsPending(cnt) => { + InnerState::ReqsPending(cnt) => { while let Some(mut req) = this.reqs.pop_front() { if !req.ready_or_set_waker(cx.waker()) { this.reqs.push_front(req); @@ -93,7 +94,7 @@ impl Future for LocalIterCountHandle { } //#[doc(hidden)] -impl LamellarRequest for LocalIterCountHandle { +impl LamellarRequest for InnerLocalIterCountHandle { fn blocking_wait(mut self) -> Self::Output { self.reqs .drain(..) 
@@ -119,6 +120,79 @@ impl LamellarRequest for LocalIterCountHandle { } } +#[pin_project] +pub struct LocalIterCountHandle { + team: Pin>, + #[pin] + state: State, +} + +impl LocalIterCountHandle { + pub(crate) fn new( + inner: Pin + Send>>, + array: &UnsafeArrayInner, + ) -> Self { + Self { + team: array.data.team.clone(), + state: State::Init(inner), + } + } +} + +#[pin_project(project = StateProj)] +enum State { + Init(Pin + Send>>), + Reqs(#[pin] InnerLocalIterCountHandle), +} +impl Future for LocalIterCountHandle { + type Output = usize; + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let mut this = self.project(); + match this.state.as_mut().project() { + StateProj::Init(inner) => { + let mut inner = ready!(Future::poll(inner.as_mut(), cx)); + match Pin::new(&mut inner).poll(cx) { + Poll::Ready(val) => Poll::Ready(val), + Poll::Pending => { + *this.state = State::Reqs(inner); + Poll::Pending + } + } + } + StateProj::Reqs(inner) => { + let val = ready!(inner.poll(cx)); + Poll::Ready(val) + } + } + } +} + +//#[doc(hidden)] +impl LamellarRequest for LocalIterCountHandle { + fn blocking_wait(self) -> Self::Output { + match self.state { + State::Init(reqs) => self.team.block_on(reqs).blocking_wait(), + State::Reqs(inner) => inner.blocking_wait(), + } + } + fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { + match &mut self.state { + State::Init(_reqs) => { + waker.wake_by_ref(); + false + } + State::Reqs(inner) => inner.ready_or_set_waker(waker), + } + } + fn val(&self) -> Self::Output { + match &self.state { + State::Init(_reqs) => { + unreachable!("should never be in init state when val is called"); + } + State::Reqs(inner) => inner.val(), + } + } +} #[lamellar_impl::AmLocalDataRT(Clone)] pub(crate) struct CountAm { pub(crate) iter: Count, diff --git a/src/array/iterator/local_iterator/consumer/for_each.rs b/src/array/iterator/local_iterator/consumer/for_each.rs index 013bfab2..3fbfd437 100644 --- a/src/array/iterator/local_iterator/consumer/for_each.rs +++ b/src/array/iterator/local_iterator/consumer/for_each.rs @@ -2,11 +2,13 @@ use crate::active_messaging::{LamellarArcLocalAm, SyncSend}; use crate::array::iterator::consumer::*; use crate::array::iterator::local_iterator::LocalIterator; use crate::array::iterator::private::*; +use crate::array::r#unsafe::private::UnsafeArrayInner; use crate::lamellar_request::LamellarRequest; use crate::lamellar_task_group::TaskGroupLocalAmHandle; use crate::lamellar_team::LamellarTeamRT; -use futures_util::Future; +use futures_util::{ready, Future}; +use pin_project::pin_project; use std::collections::VecDeque; use std::pin::Pin; use std::sync::Arc; @@ -43,7 +45,7 @@ where type AmOutput = (); type Output = (); type Item = I::Item; - type Handle = LocalIterForEachHandle; + type Handle = InnerLocalIterForEachHandle; fn init(&self, start: usize, cnt: usize) -> Self { // println!("ForEach before init start {:?} cnt {:?}", start,cnt); let iter = ForEach { @@ -68,7 +70,7 @@ where _team: Pin>, reqs: VecDeque>, ) -> Self::Handle { - LocalIterForEachHandle { reqs } + InnerLocalIterForEachHandle { reqs } } fn max_elems(&self, in_elems: usize) -> usize { self.iter.elems(in_elems) @@ -110,7 +112,7 @@ where type AmOutput = (); type Output = (); type Item = I::Item; - type Handle = LocalIterForEachHandle; + type Handle = InnerLocalIterForEachHandle; fn init(&self, start: usize, cnt: usize) -> Self { ForEachAsync { iter: self.iter.init(start, cnt), @@ -133,7 +135,7 @@ where _team: Pin>, reqs: VecDeque>, ) -> Self::Handle { - 
LocalIterForEachHandle { reqs } + InnerLocalIterForEachHandle { reqs } } fn max_elems(&self, in_elems: usize) -> usize { self.iter.elems(in_elems) @@ -155,11 +157,11 @@ where } //#[doc(hidden)] -pub struct LocalIterForEachHandle { +pub(crate) struct InnerLocalIterForEachHandle { pub(crate) reqs: VecDeque>, } -impl Future for LocalIterForEachHandle { +impl Future for InnerLocalIterForEachHandle { type Output = (); fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { while let Some(mut req) = self.reqs.pop_front() { @@ -173,7 +175,7 @@ impl Future for LocalIterForEachHandle { } //#[doc(hidden)] -impl LamellarRequest for LocalIterForEachHandle { +impl LamellarRequest for InnerLocalIterForEachHandle { fn blocking_wait(mut self) -> Self::Output { for req in self.reqs.drain(..) { req.blocking_wait(); @@ -195,6 +197,86 @@ impl LamellarRequest for LocalIterForEachHandle { } } +//#[doc(hidden)] +#[pin_project] +pub struct LocalIterForEachHandle { + // pub(crate) reqs: VecDeque>, + team: Pin>, + #[pin] + state: State, +} + +impl LocalIterForEachHandle { + pub(crate) fn new( + reqs: Pin + Send>>, + array: &UnsafeArrayInner, + ) -> Self { + LocalIterForEachHandle { + team: array.data.team.clone(), + state: State::Init(reqs), + } + } +} + +#[pin_project(project = StateProj)] +enum State { + Init(Pin + Send>>), + Reqs(#[pin] InnerLocalIterForEachHandle), +} +impl Future for LocalIterForEachHandle { + type Output = (); + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let mut this = self.project(); + match this.state.as_mut().project() { + StateProj::Init(inner) => { + let mut inner = ready!(Future::poll(inner.as_mut(), cx)); + match Pin::new(&mut inner).poll(cx) { + Poll::Ready(()) => Poll::Ready(()), + Poll::Pending => { + *this.state = State::Reqs(inner); + Poll::Pending + } + } + } + StateProj::Reqs(inner) => { + ready!(inner.poll(cx)); + Poll::Ready(()) + } + } + } +} + +//#[doc(hidden)] +impl LamellarRequest for LocalIterForEachHandle { + fn blocking_wait(self) -> Self::Output { + match self.state { + State::Init(reqs) => { + self.team.block_on(reqs).blocking_wait(); + } + State::Reqs(inner) => { + inner.blocking_wait(); + } + } + } + fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { + match &mut self.state { + State::Init(_reqs) => { + waker.wake_by_ref(); + false + } + State::Reqs(inner) => inner.ready_or_set_waker(waker), + } + } + fn val(&self) -> Self::Output { + match &self.state { + State::Init(_reqs) => { + unreachable!("should never be in barrier state when val is called"); + } + State::Reqs(inner) => inner.val(), + } + } +} + #[lamellar_impl::AmLocalDataRT(Clone)] pub(crate) struct ForEachAm where @@ -248,320 +330,3 @@ where } } } - -// #[lamellar_impl::AmLocalDataRT(Clone)] -// pub(crate) struct ForEachStatic -// where -// I: LocalIterator, -// F: Fn(I::Item), -// { -// pub(crate) op: F, -// pub(crate) data: I, -// pub(crate) start_i: usize, -// pub(crate) end_i: usize, -// } - -// #[lamellar_impl::rt_am_local] -// impl LamellarAm for ForEachStatic -// where -// I: LocalIterator + 'static, -// F: Fn(I::Item) + SyncSend + 'static, -// { -// async fn exec(&self) { -// let mut iter = self.data.init(self.start_i, self.end_i - self.start_i); -// // println!("for each static thread {:?} {} {} {}",std::thread::current().id(),self.start_i, self.end_i, self.end_i - self.start_i); -// // let mut cnt = 0; -// while let Some(elem) = iter.next() { -// (&self.op)(elem); -// // cnt += 1; -// } -// // println!("thread {:?} elems processed 
{:?}",std::thread::current().id(), cnt); -// } -// } - -// #[lamellar_impl::AmLocalDataRT(Clone, Debug)] -// pub(crate) struct ForEachDynamic -// where -// I: LocalIterator, -// F: Fn(I::Item), -// { -// pub(crate) op: F, -// pub(crate) data: I, -// pub(crate) cur_i: Arc, -// pub(crate) max_i: usize, -// } - -// #[lamellar_impl::rt_am_local] -// impl LamellarAm for ForEachDynamic -// where -// I: LocalIterator + 'static, -// F: Fn(I::Item) + SyncSend + 'static, -// { -// async fn exec(&self) { -// // println!("in for each {:?} {:?}", self.start_i, self.end_i); -// let mut cur_i = self.cur_i.fetch_add(1, Ordering::Relaxed); - -// while cur_i < self.max_i { -// // println!("in for each {:?} {:?} {:?}", range_i, start_i, end_i); -// let mut iter = self.data.init(cur_i, 1); -// while let Some(item) = iter.next() { -// (self.op)(item); -// } -// cur_i = self.cur_i.fetch_add(1, Ordering::Relaxed); -// } -// // println!("done in for each"); -// } -// } - -// #[lamellar_impl::AmLocalDataRT(Clone, Debug)] -// pub(crate) struct ForEachChunk -// where -// I: LocalIterator, -// F: Fn(I::Item), -// { -// pub(crate) op: F, -// pub(crate) data: I, -// pub(crate) ranges: Vec<(usize, usize)>, -// pub(crate) range_i: Arc, -// } - -// #[lamellar_impl::rt_am_local] -// impl LamellarAm for ForEachChunk -// where -// I: LocalIterator + 'static, -// F: Fn(I::Item) + SyncSend + 'static, -// { -// async fn exec(&self) { -// // println!("in for each {:?} {:?}", self.start_i, self.end_i); -// let mut range_i = self.range_i.fetch_add(1, Ordering::Relaxed); -// while range_i < self.ranges.len() { -// let (start_i, end_i) = self.ranges[range_i]; -// // println!("in for each {:?} {:?} {:?}", range_i, start_i, end_i); -// let mut iter = self.data.init(start_i, end_i - start_i); -// while let Some(item) = iter.next() { -// (self.op)(item); -// } -// range_i = self.range_i.fetch_add(1, Ordering::Relaxed); -// } -// // println!("done in for each"); -// } -// } - -// #[lamellar_impl::AmLocalDataRT(Clone, Debug)] -// pub(crate) struct ForEachWorkStealing -// where -// I: LocalIterator, -// F: Fn(I::Item), -// { -// pub(crate) op: F, -// pub(crate) data: I, -// pub(crate) range: IterWorkStealer, -// // pub(crate) ranges: Vec<(usize, usize)>, -// // pub(crate) range_i: Arc, -// pub(crate) siblings: Vec, -// } -// #[lamellar_impl::rt_am_local] -// impl LamellarAm for ForEachWorkStealing -// where -// I: LocalIterator + 'static, -// F: Fn(I::Item) + SyncSend + 'static, -// { -// async fn exec(&self) { -// let (start, end) = *self.range.range.lock(); -// // println!("{:?} ForEachWorkStealing {:?} {:?}",std::thread::current().id(), start, end); -// let mut iter = self.data.init(start, end - start); -// while self.range.next().is_some() { -// if let Some(elem) = iter.next() { -// (&self.op)(elem); -// } else { -// self.range.set_done(); -// } -// } -// // println!("{:?} ForEachWorkStealing done with my range",std::thread::current().id()); -// let mut rng = thread_rng(); -// let mut workers = (0..self.siblings.len()).collect::>(); -// workers.shuffle(&mut rng); -// while let Some(worker) = workers.pop() { -// // println!("{:?} ForEachWorkStealing stealing from sibling",std::thread::current().id()); -// if let Some((start, end)) = self.siblings[worker].steal() { -// let mut iter = self.data.init(start, end - start); -// self.range.set_range(start, end); -// while self.range.next().is_some() { -// if let Some(elem) = iter.next() { -// (&self.op)(elem); -// } else { -// self.range.set_done(); -// } -// } -// workers = 
(0..self.siblings.len()).collect::>(); -// workers.shuffle(&mut rng); -// } -// } -// // println!("{:?} ForEachWorkStealing done",std::thread::current().id()); -// } -// } - -//-------------------------async for each------------------------------- - -// #[lamellar_impl::AmLocalDataRT(Clone)] -// pub(crate) struct ForEachAsyncStatic -// where -// I: LocalIterator, -// F: Fn(I::Item) -> Fut + SyncSend + Clone, -// Fut: Future + Send, -// { -// pub(crate) op: F, -// pub(crate) data: I, -// pub(crate) start_i: usize, -// pub(crate) end_i: usize, -// } - -// impl std::fmt::Debug for ForEachAsyncStatic -// where -// I: LocalIterator, -// F: Fn(I::Item) -> Fut + SyncSend + Clone, -// Fut: Future + Send, -// { -// fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { -// write!( -// f, -// "ForEachAsync {{ start_i: {:?}, end_i: {:?} }}", -// self.start_i, self.end_i -// ) -// } -// } - -// #[lamellar_impl::rt_am_local] -// impl LamellarAm for ForEachAsyncStatic -// where -// I: LocalIterator + 'static, -// F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, -// Fut: Future + Send + 'static, -// { -// async fn exec(&self) { -// let mut iter = self.data.init(self.start_i, self.end_i - self.start_i); -// while let Some(elem) = iter.next() { -// (&self.op)(elem).await; -// } -// } -// } - -// #[lamellar_impl::AmLocalDataRT(Clone, Debug)] -// pub(crate) struct ForEachAsyncDynamic -// where -// I: LocalIterator, -// F: Fn(I::Item) -> Fut + SyncSend + Clone, -// Fut: Future + Send, -// { -// pub(crate) op: F, -// pub(crate) data: I, -// pub(crate) cur_i: Arc, -// pub(crate) max_i: usize, -// } - -// #[lamellar_impl::rt_am_local] -// impl LamellarAm for ForEachAsyncDynamic -// where -// I: LocalIterator + 'static, -// F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, -// Fut: Future + Send + 'static, -// { -// async fn exec(&self) { -// // println!("in for each {:?} {:?}", self.start_i, self.end_i); -// let mut cur_i = self.cur_i.fetch_add(1, Ordering::Relaxed); - -// while cur_i < self.max_i { -// // println!("in for each {:?} {:?} {:?}", range_i, start_i, end_i); -// let mut iter = self.data.init(cur_i, 1); -// while let Some(item) = iter.next() { -// (self.op)(item).await; -// } -// cur_i = self.cur_i.fetch_add(1, Ordering::Relaxed); -// } -// // println!("done in for each"); -// } -// } - -// #[lamellar_impl::AmLocalDataRT(Clone, Debug)] -// pub(crate) struct ForEachAsyncChunk -// where -// I: LocalIterator, -// F: Fn(I::Item) -> Fut + SyncSend + Clone, -// Fut: Future + Send, -// { -// pub(crate) op: F, -// pub(crate) data: I, -// pub(crate) ranges: Vec<(usize, usize)>, -// pub(crate) range_i: Arc, -// } - -// #[lamellar_impl::rt_am_local] -// impl LamellarAm for ForEachAsyncChunk -// where -// I: LocalIterator + 'static, -// F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, -// Fut: Future + Send + 'static, -// { -// async fn exec(&self) { -// // println!("in for each {:?} {:?}", self.start_i, self.end_i); -// let mut range_i = self.range_i.fetch_add(1, Ordering::Relaxed); -// while range_i < self.ranges.len() { -// let (start_i, end_i) = self.ranges[range_i]; -// // println!("in for each {:?} {:?} {:?}", range_i, start_i, end_i); -// let mut iter = self.data.init(start_i, end_i - start_i); -// while let Some(item) = iter.next() { -// (self.op)(item).await; -// } -// range_i = self.range_i.fetch_add(1, Ordering::Relaxed); -// } -// // println!("done in for each"); -// } -// } - -// #[lamellar_impl::AmLocalDataRT(Clone, Debug)] -// pub(crate) struct ForEachAsyncWorkStealing -// 
where -// I: LocalIterator, -// F: Fn(I::Item) -> Fut + SyncSend + Clone, -// Fut: Future + Send, -// { -// pub(crate) op: F, -// pub(crate) data: I, -// pub(crate) range: IterWorkStealer, -// pub(crate) siblings: Vec, -// } -// #[lamellar_impl::rt_am_local] -// impl LamellarAm for ForEachAsyncWorkStealing -// where -// I: LocalIterator + 'static, -// F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, -// Fut: Future + Send + 'static, -// { -// async fn exec(&self) { -// // println!("in for each {:?} {:?}", self.start_i, self.end_i); -// let (start, end) = *self.range.range.lock(); -// let mut iter = self.data.init(start, end - start); -// while self.range.next().is_some() { -// if let Some(elem) = iter.next() { -// (&self.op)(elem); -// } -// } -// // let mut rng = thread_rng().gen(); -// let mut workers = (0..self.siblings.len()).collect::>(); -// workers.shuffle(&mut thread_rng()); -// while let Some(worker) = workers.pop() { -// if let Some((start, end)) = self.siblings[worker].steal() { -// let mut iter = self.data.init(start, end - start); -// self.range.set_range(start, end); -// while self.range.next().is_some() { -// if let Some(elem) = iter.next() { -// (&self.op)(elem).await; -// } -// } -// workers = (0..self.siblings.len()).collect::>(); -// workers.shuffle(&mut thread_rng()); -// } -// } -// // println!("done in for each"); -// } -// } diff --git a/src/array/iterator/local_iterator/consumer/reduce.rs b/src/array/iterator/local_iterator/consumer/reduce.rs index 305921bf..dd44db36 100644 --- a/src/array/iterator/local_iterator/consumer/reduce.rs +++ b/src/array/iterator/local_iterator/consumer/reduce.rs @@ -2,11 +2,12 @@ use crate::active_messaging::{LamellarArcLocalAm, SyncSend}; use crate::array::iterator::consumer::*; use crate::array::iterator::local_iterator::LocalIterator; use crate::array::iterator::private::*; +use crate::array::r#unsafe::private::UnsafeArrayInner; use crate::lamellar_request::LamellarRequest; use crate::lamellar_task_group::TaskGroupLocalAmHandle; use crate::lamellar_team::LamellarTeamRT; -use futures_util::Future; +use futures_util::{ready, Future}; use pin_project::pin_project; use std::collections::VecDeque; use std::pin::Pin; @@ -31,13 +32,13 @@ impl IterClone for Reduce { impl IterConsumer for Reduce where I: LocalIterator + 'static, - I::Item: SyncSend, + I::Item: SyncSend + Copy, F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, { type AmOutput = Option; type Output = Option; type Item = I::Item; - type Handle = LocalIterReduceHandle; + type Handle = InnerLocalIterReduceHandle; fn init(&self, start: usize, cnt: usize) -> Self { Reduce { iter: self.iter.init(start, cnt), @@ -59,10 +60,10 @@ where _team: Pin>, reqs: VecDeque>, ) -> Self::Handle { - LocalIterReduceHandle { + InnerLocalIterReduceHandle { op: self.op, reqs, - state: State::ReqsPending(None), + state: InnerState::ReqsPending(None), } } fn max_elems(&self, in_elems: usize) -> usize { @@ -72,17 +73,17 @@ where //#[doc(hidden)] #[pin_project] -pub struct LocalIterReduceHandle { +pub(crate) struct InnerLocalIterReduceHandle { pub(crate) reqs: VecDeque>>, pub(crate) op: F, - state: State, + state: InnerState, } -enum State { +enum InnerState { ReqsPending(Option), } -impl Future for LocalIterReduceHandle +impl Future for InnerLocalIterReduceHandle where T: SyncSend + Copy + 'static, F: Fn(T, T) -> T + SyncSend + 'static, @@ -91,7 +92,7 @@ where fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { let mut this = self.project(); match &mut this.state { - 
State::ReqsPending(val) => { + InnerState::ReqsPending(val) => { while let Some(mut req) = this.reqs.pop_front() { if !req.ready_or_set_waker(cx.waker()) { this.reqs.push_front(req); @@ -113,7 +114,7 @@ where } //#[doc(hidden)] -impl LamellarRequest for LocalIterReduceHandle +impl LamellarRequest for InnerLocalIterReduceHandle where T: SyncSend + Copy + 'static, F: Fn(T, T) -> T + SyncSend + Clone + 'static, @@ -142,6 +143,88 @@ where } } +#[pin_project] +pub struct LocalIterReduceHandle { + team: Pin>, + #[pin] + state: State, +} + +impl LocalIterReduceHandle { + pub(crate) fn new( + reqs: Pin> + Send>>, + array: &UnsafeArrayInner, + ) -> Self { + Self { + team: array.data.team.clone(), + state: State::Init(reqs), + } + } +} + +#[pin_project(project = StateProj)] +enum State { + Init(Pin> + Send>>), + Reqs(#[pin] InnerLocalIterReduceHandle), +} +impl Future for LocalIterReduceHandle +where + T: SyncSend + Copy + 'static, + F: Fn(T, T) -> T + SyncSend + Clone + 'static, +{ + type Output = Option; + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let mut this = self.project(); + match this.state.as_mut().project() { + StateProj::Init(inner) => { + let mut inner = ready!(Future::poll(inner.as_mut(), cx)); + match Pin::new(&mut inner).poll(cx) { + Poll::Ready(val) => Poll::Ready(val), + Poll::Pending => { + *this.state = State::Reqs(inner); + Poll::Pending + } + } + } + StateProj::Reqs(inner) => { + let val = ready!(inner.poll(cx)); + Poll::Ready(val) + } + } + } +} + +//#[doc(hidden)] +impl LamellarRequest for LocalIterReduceHandle +where + T: SyncSend + Copy + 'static, + F: Fn(T, T) -> T + SyncSend + Clone + 'static, +{ + fn blocking_wait(self) -> Self::Output { + match self.state { + State::Init(reqs) => self.team.block_on(reqs).blocking_wait(), + State::Reqs(inner) => inner.blocking_wait(), + } + } + fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { + match &mut self.state { + State::Init(_) => { + waker.wake_by_ref(); + false + } + State::Reqs(inner) => inner.ready_or_set_waker(waker), + } + } + fn val(&self) -> Self::Output { + match &self.state { + State::Init(_reqs) => { + unreachable!("should never be in init state when val is called"); + } + State::Reqs(inner) => inner.val(), + } + } +} + #[lamellar_impl::AmLocalDataRT(Clone)] pub(crate) struct ReduceAm { pub(crate) op: F, @@ -163,7 +246,7 @@ impl IterClone for ReduceAm { impl LamellarAm for ReduceAm where I: LocalIterator + 'static, - I::Item: SyncSend, + I::Item: SyncSend + Copy, F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, { async fn exec(&self) -> Option { @@ -179,31 +262,3 @@ where } } } - -// #[lamellar_impl::AmLocalDataRT(Clone)] -// pub(crate) struct ReduceAsyncAm { -// pub(crate) op: F, -// pub(crate) iter: I, -// pub(crate) schedule: IterSchedule, -// pub(crate) _phantom: PhantomData -// } - -// #[lamellar_impl::rt_am_local] -// impl LamellarAm for ReduceAsyncAm -// where -// I: LocalIterator + 'static, -// I::Item: SyncSend, -// F: Fn(I::Item, I::Item) -> Fut + SyncSend + Clone + 'static, -// Fut: Future + SyncSend + Clone + 'static, -// { -// async fn exec(&self) -> Option { -// let mut iter = self.schedule.init_iter(self.iter.iter_clone(Sealed)); -// let mut accum = iter.next(); -// while let Some(elem) = iter.next() { -// accum = Some((self.op)(accum.unwrap(), elem).await); -// // cnt += 1; -// } -// accum -// // println!("thread {:?} elems processed {:?}",std::thread::current().id(), cnt); -// } -// } diff --git a/src/array/iterator/local_iterator/consumer/sum.rs 
b/src/array/iterator/local_iterator/consumer/sum.rs index d1ca2bbe..d85d924c 100644 --- a/src/array/iterator/local_iterator/consumer/sum.rs +++ b/src/array/iterator/local_iterator/consumer/sum.rs @@ -2,11 +2,12 @@ use crate::active_messaging::{LamellarArcLocalAm, SyncSend}; use crate::array::iterator::consumer::*; use crate::array::iterator::local_iterator::LocalIterator; use crate::array::iterator::private::*; +use crate::array::r#unsafe::private::UnsafeArrayInner; use crate::lamellar_request::LamellarRequest; use crate::lamellar_task_group::TaskGroupLocalAmHandle; use crate::lamellar_team::LamellarTeamRT; -use futures_util::Future; +use futures_util::{ready, Future}; use pin_project::pin_project; use std::collections::VecDeque; use std::pin::Pin; @@ -34,7 +35,7 @@ where type AmOutput = I::Item; type Output = I::Item; type Item = I::Item; - type Handle = LocalIterSumHandle; + type Handle = InnerLocalIterSumHandle; fn init(&self, start: usize, cnt: usize) -> Self { Sum { iter: self.iter.init(start, cnt), @@ -54,9 +55,9 @@ where _team: Pin>, reqs: VecDeque>, ) -> Self::Handle { - LocalIterSumHandle { + InnerLocalIterSumHandle { reqs, - state: State::ReqsPending(None), + state: InnerState::ReqsPending(None), } } fn max_elems(&self, in_elems: usize) -> usize { @@ -66,16 +67,16 @@ where //#[doc(hidden)] #[pin_project] -pub struct LocalIterSumHandle { +pub(crate) struct InnerLocalIterSumHandle { pub(crate) reqs: VecDeque>, - state: State, + state: InnerState, } -enum State { +enum InnerState { ReqsPending(Option), } -impl Future for LocalIterSumHandle +impl Future for InnerLocalIterSumHandle where T: SyncSend + std::iter::Sum + for<'a> std::iter::Sum<&'a T> + 'static, { @@ -83,7 +84,7 @@ where fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { let mut this = self.project(); match &mut this.state { - State::ReqsPending(local_sum) => { + InnerState::ReqsPending(local_sum) => { while let Some(mut req) = this.reqs.pop_front() { if !req.ready_or_set_waker(cx.waker()) { this.reqs.push_front(req); @@ -106,7 +107,7 @@ where } //#[doc(hidden)] -impl LamellarRequest for LocalIterSumHandle +impl LamellarRequest for InnerLocalIterSumHandle where T: SyncSend + std::iter::Sum + for<'a> std::iter::Sum<&'a T> + 'static, { @@ -132,6 +133,86 @@ where } } +#[pin_project] +pub struct LocalIterSumHandle { + team: Pin>, + #[pin] + state: State, +} + +impl LocalIterSumHandle { + pub(crate) fn new( + inner: Pin> + Send>>, + array: &UnsafeArrayInner, + ) -> Self { + Self { + team: array.data.team.clone(), + state: State::Init(inner), + } + } +} + +#[pin_project(project = StateProj)] +enum State { + Init(Pin> + Send>>), + Reqs(#[pin] InnerLocalIterSumHandle), +} +impl Future for LocalIterSumHandle +where + T: SyncSend + std::iter::Sum + for<'a> std::iter::Sum<&'a T> + 'static, +{ + type Output = T; + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let mut this = self.project(); + match this.state.as_mut().project() { + StateProj::Init(inner) => { + let mut inner = ready!(Future::poll(inner.as_mut(), cx)); + match Pin::new(&mut inner).poll(cx) { + Poll::Ready(val) => Poll::Ready(val), + Poll::Pending => { + *this.state = State::Reqs(inner); + Poll::Pending + } + } + } + StateProj::Reqs(inner) => { + let val = ready!(inner.poll(cx)); + Poll::Ready(val) + } + } + } +} + +//#[doc(hidden)] +impl LamellarRequest for LocalIterSumHandle +where + T: SyncSend + std::iter::Sum + for<'a> std::iter::Sum<&'a T> + 'static, +{ + fn blocking_wait(self) -> Self::Output { + match self.state { + State::Init(reqs) 
=> self.team.block_on(reqs).blocking_wait(), + State::Reqs(inner) => inner.blocking_wait(), + } + } + fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { + match &mut self.state { + State::Init(_) => { + waker.wake_by_ref(); + false + } + State::Reqs(inner) => inner.ready_or_set_waker(waker), + } + } + fn val(&self) -> Self::Output { + match &self.state { + State::Init(_reqs) => { + unreachable!("should never be in init state when val is called"); + } + State::Reqs(inner) => inner.val(), + } + } +} + #[lamellar_impl::AmLocalDataRT(Clone)] pub(crate) struct SumAm { pub(crate) iter: Sum, @@ -158,28 +239,3 @@ where iter.sum::() } } - -// #[lamellar_impl::rt_am_local] -// impl LamellarAm for Sum -// where -// I: LocalIterator + 'static, -// I::Item: std::iter::Sum + SyncSend, -// { -// async fn exec(&self) -> I::Item { -// let mut iter = self.iter.init(self.start_i, self.end_i - self.start_i); -// // println!("for each static thread {:?} {} {} {}",std::thread::current().id(),self.start_i, self.end_i, self.end_i - self.start_i); -// // let mut cnt = 0; -// let mut sum; -// if let Some(elem) = iter.next() { -// sum = elem; -// while let Some(elem) = iter.next() { -// sum += elem; -// } -// } -// else { -// // sum = I::Item::default(); -// } -// sum -// // println!("thread {:?} elems processed {:?}",std::thread::current().id(), cnt); -// } -// } diff --git a/src/array/unsafe/iteration/distributed.rs b/src/array/unsafe/iteration/distributed.rs index 3cae74c5..4e8e06f8 100644 --- a/src/array/unsafe/iteration/distributed.rs +++ b/src/array/unsafe/iteration/distributed.rs @@ -4,6 +4,7 @@ use crate::array::iterator::private::Sealed; use crate::array::r#unsafe::{UnsafeArray, UnsafeArrayInner}; use crate::array::{ArrayOps, AsyncTeamFrom, Distribution, InnerArray}; use crate::lamellar_request::LamellarRequest; +use crate::env_var::config; use crate::array::iterator::Schedule; use crate::lamellar_team::LamellarTeamRT; @@ -79,6 +80,20 @@ macro_rules! consumer_impl { where $($bounds)+ { + if std::thread::current().id() != *crate::MAIN_THREAD { + let name = stringify!{$name}; + let msg = format!(" + [LAMELLAR WARNING] You are calling `blocking_{name}[_with_schedule]` from within an async context which may lead to deadlock, it is recommended that you use `{name}[_with_schedule]().await;` instead! 
+ Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + ); + if let Some(val) = config().blocking_call_warning { + if val { + println!("{msg}"); + } + } else { + println!("{msg}"); + } + } let am = $($am)*; self.data.team.barrier.tasking_barrier(); let inner = self.clone(); @@ -139,72 +154,6 @@ impl DistIteratorLauncher for UnsafeArrayInner { [] ); - // fn for_each(&self, iter: &I, op: F) -> DistIterForEachHandle - // where - // I: DistributedIterator + 'static, - // F: Fn(I::Item) + SyncSend + Clone + 'static, - // { - // self.for_each_with_schedule(Schedule::Static, iter, op) - // } - - // fn for_each_with_schedule( - // &self, - // sched: Schedule, - // iter: &I, - // op: F, - // ) -> DistIterForEachHandle - // where - // I: DistributedIterator + 'static, - // F: Fn(I::Item) + SyncSend + Clone + 'static, - // { - // let for_each = ForEach { - // iter: iter.iter_clone(Sealed), - // op, - // }; - // self.team().barrier(); - // match sched { - // Schedule::Static => self.sched_static(for_each), - // Schedule::Dynamic => self.sched_dynamic(for_each), - // Schedule::Chunk(size) => self.sched_chunk(for_each, size), - // Schedule::Guided => self.sched_guided(for_each), - // Schedule::WorkStealing => self.sched_work_stealing(for_each), - // } - // } - - // fn for_each_async(&self, iter: &I, op: F) -> DistIterForEachHandle - // where - // I: DistributedIterator + 'static, - // F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, - // Fut: Future + Send + 'static, - // { - // self.for_each_async_with_schedule(Schedule::Static, iter, op) - // } - - // fn for_each_async_with_schedule( - // &self, - // sched: Schedule, - // iter: &I, - // op: F, - // ) -> DistIterForEachHandle - // where - // I: DistributedIterator + 'static, - // F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, - // Fut: Future + Send + 'static, - // { - // let for_each = ForEachAsync { - // iter: iter.iter_clone(Sealed), - // op, - // }; - // self.team().barrier(); - // match sched { - // Schedule::Static => self.sched_static(for_each), - // Schedule::Dynamic => self.sched_dynamic(for_each), - // Schedule::Chunk(size) => self.sched_chunk(for_each, size), - // Schedule::Guided => self.sched_guided(for_each), - // Schedule::WorkStealing => self.sched_work_stealing(for_each), - // } - // } - consumer_impl!( reduce( iter: &I, op: F); [DistIterReduceHandle]; @@ -217,52 +166,6 @@ impl DistIteratorLauncher for UnsafeArrayInner { ]; [-> Option]); - // consumer_impl!( - // reduce_async( iter: &I, op: F); - // [DistIterReduceHandle]; - // [I: DistributedIterator + 'static, I::Item: Future + Send + 'static, T: Dist + Send + ArrayOps, F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static,]; - // [ - // ReduceAsync { - // iter: iter.iter_clone(Sealed), - // op, - // // _phantom: PhantomData, - // } - // ]; - // [-> Option]); - - // fn reduce(&self, iter: &I, op: F) -> DistIterReduceHandle - // where - // I: DistributedIterator + 'static, - // I::Item: Dist + ArrayOps, - // F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, - // { - // self.reduce_with_schedule(Schedule::Static, iter, op) - // } - - // fn reduce_with_schedule( - // &self, - // sched: Schedule, - // iter: &I, - // op: F, - // ) -> DistIterReduceHandle - // where - // I: DistributedIterator + 'static, - // I::Item: Dist + ArrayOps, - // F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, - // { - // let reduce = Reduce { - // 
iter: iter.iter_clone(Sealed), - // op, - // }; - // match sched { - // Schedule::Static => self.sched_static(reduce), - // Schedule::Dynamic => self.sched_dynamic(reduce), - // Schedule::Chunk(size) => self.sched_chunk(reduce, size), - // Schedule::Guided => self.sched_guided(reduce), - // Schedule::WorkStealing => self.sched_work_stealing(reduce), - // } - // } - consumer_impl!( collect( iter: &I, d: Distribution); [DistIterCollectHandle]; @@ -288,75 +191,6 @@ impl DistIteratorLauncher for UnsafeArrayInner { ]; [-> A]); - // fn collect(&self, iter: &I, d: Distribution) -> DistIterCollectHandle - // where - // I: DistributedIterator + 'static, - // I::Item: Dist + ArrayOps, - // A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - // { - // self.collect_with_schedule(Schedule::Static, iter, d) - // } - - // fn collect_with_schedule( - // &self, - // sched: Schedule, - // iter: &I, - // d: Distribution, - // ) -> DistIterCollectHandle - // where - // I: DistributedIterator + 'static, - // I::Item: Dist + ArrayOps, - // A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - // { - // let collect = Collect { - // iter: iter.iter_clone(Sealed).monotonic(), - // distribution: d, - // _phantom: PhantomData, - // }; - // match sched { - // Schedule::Static => self.sched_static(collect), - // Schedule::Dynamic => self.sched_dynamic(collect), - // Schedule::Chunk(size) => self.sched_chunk(collect, size), - // Schedule::Guided => self.sched_guided(collect), - // Schedule::WorkStealing => self.sched_work_stealing(collect), - // } - // } - - // fn collect_async(&self, iter: &I, d: Distribution) -> DistIterCollectHandle - // where - // I: DistributedIterator, - // I::Item: Future + Send + 'static, - // B: Dist + ArrayOps, - // A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - // { - // self.collect_async_with_schedule(Schedule::Static, iter, d) - // } - - // fn collect_async_with_schedule( - // &self, - // sched: Schedule, - // iter: &I, - // d: Distribution, - // ) -> DistIterCollectHandle - // where - // I: DistributedIterator, - // I::Item: Future + Send + 'static, - // B: Dist + ArrayOps, - // A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - // { - // let collect = CollectAsync { - // iter: iter.iter_clone(Sealed).monotonic(), - // distribution: d, - // _phantom: PhantomData, - // }; - // match sched { - // Schedule::Static => self.sched_static(collect), - // Schedule::Dynamic => self.sched_dynamic(collect), - // Schedule::Chunk(size) => self.sched_chunk(collect, size), - // Schedule::Guided => self.sched_guided(collect), - // Schedule::WorkStealing => self.sched_work_stealing(collect), - // } - // } consumer_impl!( count( iter: &I); [DistIterCountHandle]; @@ -367,28 +201,6 @@ impl DistIteratorLauncher for UnsafeArrayInner { } ]; [-> usize]); - // fn count(&self, iter: &I) -> DistIterCountHandle - // where - // I: DistributedIterator + 'static, - // { - // self.count_with_schedule(Schedule::Static, iter) - // } - - // fn count_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterCountHandle - // where - // I: DistributedIterator + 'static, - // { - // let count = Count { - // iter: iter.iter_clone(Sealed), - // }; - // match sched { - // Schedule::Static => self.sched_static(count), - // Schedule::Dynamic => self.sched_dynamic(count), - // Schedule::Chunk(size) => self.sched_chunk(count, size), - // Schedule::Guided => self.sched_guided(count), - // Schedule::WorkStealing => self.sched_work_stealing(count), - // } - // 
} consumer_impl!( sum(iter: &I); @@ -401,31 +213,6 @@ impl DistIteratorLauncher for UnsafeArrayInner { ]; [-> I::Item]); - // fn sum(&self, iter: &I) -> DistIterSumHandle - // where - // I: DistributedIterator + 'static, - // I::Item: Dist + ArrayOps + std::iter::Sum, - // { - // self.sum_with_schedule(Schedule::Static, iter) - // } - - // fn sum_with_schedule(&self, sched: Schedule, iter: &I) -> DistIterSumHandle - // where - // I: DistributedIterator + 'static, - // I::Item: Dist + ArrayOps + std::iter::Sum, - // { - // let sum = Sum { - // iter: iter.iter_clone(Sealed), - // }; - // match sched { - // Schedule::Static => self.sched_static(sum), - // Schedule::Dynamic => self.sched_dynamic(sum), - // Schedule::Chunk(size) => self.sched_chunk(sum, size), - // Schedule::Guided => self.sched_guided(sum), - // Schedule::WorkStealing => self.sched_work_stealing(sum), - // } - // } - fn team(&self) -> Pin> { self.data.team.clone() } diff --git a/src/array/unsafe/iteration/local.rs b/src/array/unsafe/iteration/local.rs index 15b88162..8e5ed072 100644 --- a/src/array/unsafe/iteration/local.rs +++ b/src/array/unsafe/iteration/local.rs @@ -3,6 +3,8 @@ use crate::array::iterator::local_iterator::*; use crate::array::iterator::private::*; use crate::array::r#unsafe::{UnsafeArray, UnsafeArrayInner}; use crate::array::{ArrayOps, AsyncTeamFrom, Distribution}; +use crate::lamellar_request::LamellarRequest; +use crate::env_var::config; use crate::array::iterator::Schedule; use crate::lamellar_team::LamellarTeamRT; @@ -12,9 +14,88 @@ use core::marker::PhantomData; use futures_util::Future; use std::pin::Pin; use std::sync::Arc; +use paste::paste; impl LocalIteratorLauncher for UnsafeArray {} +macro_rules! consumer_impl { + ($name:ident<$($generics:ident),*>($($arg:ident : $arg_ty:ty),*); [$return_type:ident$(<$($ret_gen:ty),*>)?]; [$($bounds:tt)+]; [$($am:tt)*]; [$(-> $($blocking_ret:tt)*)?] ) => { + paste! { + fn $name<$($generics),*>(&self, $($arg : $arg_ty),*) -> $return_type$(<$($ret_gen),*>)? + where + $($bounds)+ + { + + self.[<$name _with_schedule>](Schedule::Static, $($arg),*) + } + + + fn [<$name _with_schedule >]<$($generics),*>( + &self, + sched: Schedule, + $($arg : $arg_ty),* + ) -> $return_type$(<$($ret_gen),*>)? + where + $($bounds)+ + { + let am = $($am)*; + let inner = self.clone(); + let reqs_future = Box::pin(async move{match sched { + Schedule::Static => inner.sched_static(am), + Schedule::Dynamic => inner.sched_dynamic(am), + Schedule::Chunk(size) => inner.sched_chunk(am,size), + Schedule::Guided => inner.sched_guided(am), + Schedule::WorkStealing => inner.sched_work_stealing(am), + }}); + $return_type::new(reqs_future,self) + } + + fn []<$($generics),*>(&self, $($arg : $arg_ty),*) $(-> $($blocking_ret)*)? + where + $($bounds)+ + { + + self.[](Schedule::Static, $($arg),*) + } + + + fn []<$($generics),*>( + &self, + sched: Schedule, + $($arg : $arg_ty),* + ) $(-> $($blocking_ret)*)? + where + $($bounds)+ + { + if std::thread::current().id() != *crate::MAIN_THREAD { + let name = stringify!{$name}; + let msg = format!(" + [LAMELLAR WARNING] You are calling `blocking_{name}[_with_schedule]` from within an async context which may lead to deadlock, it is recommended that you use `{name}[_with_schedule]().await;` instead! 
+ Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + ); + if let Some(val) = config().blocking_call_warning { + if val { + println!("{msg}"); + } + } else { + println!("{msg}"); + } + } + let am = $($am)*; + let inner = self.clone(); + let reqs = match sched { + Schedule::Static => inner.sched_static(am), + Schedule::Dynamic => inner.sched_dynamic(am), + Schedule::Chunk(size) => inner.sched_chunk(am,size), + Schedule::Guided => inner.sched_guided(am), + Schedule::WorkStealing => inner.sched_work_stealing(am), + }; + reqs.blocking_wait() + } + } + }; +} + impl LocalIteratorLauncher for UnsafeArrayInner { fn local_global_index_from_local(&self, index: usize, chunk_size: usize) -> Option { // println!("global index cs:{:?}",chunk_size); @@ -33,184 +114,96 @@ impl LocalIteratorLauncher for UnsafeArrayInner { } } - fn for_each(&self, iter: &I, op: F) -> LocalIterForEachHandle - where - I: LocalIterator + 'static, - F: Fn(I::Item) + SyncSend + Clone + 'static, - { - self.for_each_with_schedule(Schedule::Static, iter, op) - } - - fn for_each_with_schedule( - &self, - sched: Schedule, - iter: &I, - op: F, - ) -> LocalIterForEachHandle - where - I: LocalIterator + 'static, - F: Fn(I::Item) + SyncSend + Clone + 'static, - { - let for_each = ForEach { - iter: iter.iter_clone(Sealed), - op, - }; - match sched { - Schedule::Static => self.sched_static(for_each), - Schedule::Dynamic => self.sched_dynamic(for_each), - Schedule::Chunk(size) => self.sched_chunk(for_each, size), - Schedule::Guided => self.sched_guided(for_each), - Schedule::WorkStealing => self.sched_work_stealing(for_each), - } - } - - fn for_each_async(&self, iter: &I, op: F) -> LocalIterForEachHandle - where - I: LocalIterator + 'static, - F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, - Fut: Future + Send + 'static, - { - self.for_each_async_with_schedule(Schedule::Static, iter, op) - } - - fn for_each_async_with_schedule( - &self, - sched: Schedule, - iter: &I, - op: F, - ) -> LocalIterForEachHandle - where - I: LocalIterator + 'static, - F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, - Fut: Future + Send + 'static, - { - let for_each = ForEachAsync { - iter: iter.iter_clone(Sealed), - op: op.clone(), - }; - match sched { - Schedule::Static => self.sched_static(for_each), - Schedule::Dynamic => self.sched_dynamic(for_each), - Schedule::Chunk(size) => self.sched_chunk(for_each, size), - Schedule::Guided => self.sched_guided(for_each), - Schedule::WorkStealing => self.sched_work_stealing(for_each), - } - } - - fn reduce(&self, iter: &I, op: F) -> LocalIterReduceHandle - where - I: LocalIterator + 'static, - I::Item: SyncSend, - F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, - { - self.reduce_with_schedule(Schedule::Static, iter, op) - } - - fn reduce_with_schedule( - &self, - sched: Schedule, - iter: &I, - op: F, - ) -> LocalIterReduceHandle - where - I: LocalIterator + 'static, - I::Item: SyncSend, - F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, - { - let reduce = Reduce { - iter: iter.iter_clone(Sealed), - op, - }; - match sched { - Schedule::Static => self.sched_static(reduce), - Schedule::Dynamic => self.sched_dynamic(reduce), - Schedule::Chunk(size) => self.sched_chunk(reduce, size), - Schedule::Guided => self.sched_guided(reduce), - Schedule::WorkStealing => self.sched_work_stealing(reduce), - } - } - - fn collect(&self, iter: &I, d: Distribution) -> 
LocalIterCollectHandle - where - I: LocalIterator + 'static, - I::Item: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - self.collect_with_schedule(Schedule::Static, iter, d) - } - - fn collect_with_schedule( - &self, - sched: Schedule, - iter: &I, - d: Distribution, - ) -> LocalIterCollectHandle - where - I: LocalIterator + 'static, - I::Item: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - let collect = Collect { - iter: iter.iter_clone(Sealed).monotonic(), - distribution: d, - _phantom: PhantomData, - }; - match sched { - Schedule::Static => self.sched_static(collect), - Schedule::Dynamic => self.sched_dynamic(collect), - Schedule::Chunk(size) => self.sched_chunk(collect, size), - Schedule::Guided => self.sched_guided(collect), - Schedule::WorkStealing => self.sched_work_stealing(collect), - } - } - - fn count(&self, iter: &I) -> LocalIterCountHandle - where - I: LocalIterator + 'static, - { - self.count_with_schedule(Schedule::Static, iter) - } - - fn count_with_schedule(&self, sched: Schedule, iter: &I) -> LocalIterCountHandle - where - I: LocalIterator + 'static, - { - let count = Count { - iter: iter.iter_clone(Sealed), - }; - match sched { - Schedule::Static => self.sched_static(count), - Schedule::Dynamic => self.sched_dynamic(count), - Schedule::Chunk(size) => self.sched_chunk(count, size), - Schedule::Guided => self.sched_guided(count), - Schedule::WorkStealing => self.sched_work_stealing(count), - } - } - - fn sum(&self, iter: &I) -> LocalIterSumHandle - where - I: LocalIterator + 'static, - I::Item: SyncSend + std::iter::Sum, - { - self.sum_with_schedule(Schedule::Static, iter) - } - - fn sum_with_schedule(&self, sched: Schedule, iter: &I) -> LocalIterSumHandle - where - I: LocalIterator + 'static, - I::Item: SyncSend + std::iter::Sum, - { - let sum = Sum { - iter: iter.iter_clone(Sealed), - }; - match sched { - Schedule::Static => self.sched_static(sum), - Schedule::Dynamic => self.sched_dynamic(sum), - Schedule::Chunk(size) => self.sched_chunk(sum, size), - Schedule::Guided => self.sched_guided(sum), - Schedule::WorkStealing => self.sched_work_stealing(sum), - } - } + consumer_impl!( + for_each(iter: &I, op: F); + [LocalIterForEachHandle]; + [I: LocalIterator + 'static, F: Fn(I::Item) + SyncSend + Clone + 'static]; + [ + ForEach { + iter: iter.iter_clone(Sealed), + op, + } + ]; + [] + ); + + consumer_impl!( + for_each_async(iter: &I, op: F); + [LocalIterForEachHandle]; + [I: LocalIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, Fut: Future + Send + 'static]; + [ + ForEachAsync { + iter: iter.iter_clone(Sealed), + op, + } + ]; + [] + ); + + + consumer_impl!( + reduce( iter: &I, op: F); + [LocalIterReduceHandle]; + [I: LocalIterator + 'static, I::Item: SyncSend + Copy, F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static]; + [ + Reduce { + iter: iter.iter_clone(Sealed), + op, + } + ]; + [-> Option] + ); + + consumer_impl!( + collect( iter: &I, d: Distribution); + [LocalIterCollectHandle]; + [I: LocalIterator + 'static, I::Item: Dist + ArrayOps, A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static,]; + [ + Collect { + iter: iter.iter_clone(Sealed).monotonic(), + distribution: d, + _phantom: PhantomData, + } + ]; + [-> A] + ); + + consumer_impl!( + collect_async( iter: &I, d: Distribution); + [LocalIterCollectHandle]; + [I: LocalIterator + 'static, I::Item: Future + Send + 'static,B: Dist + ArrayOps,A: AsyncTeamFrom<(Vec, Distribution)> + 
SyncSend + Clone + 'static,]; + [ + CollectAsync { + iter: iter.iter_clone(Sealed).monotonic(), + distribution: d, + _phantom: PhantomData, + } + ]; + [-> A] + ); + + consumer_impl!( + count( iter: &I); + [LocalIterCountHandle]; + [I: LocalIterator + 'static ]; + [ + Count { + iter: iter.iter_clone(Sealed), + } + ]; + [-> usize] + ); + + consumer_impl!( + sum(iter: &I); + [LocalIterSumHandle]; + [I: LocalIterator + 'static, I::Item: SyncSend + std::iter::Sum + for<'a> std::iter::Sum<&'a I::Item> , ]; + [ + Sum { + iter: iter.iter_clone(Sealed), + } + ]; + [-> I::Item]); fn team(&self) -> Pin> { self.data.team.clone() From 7f04aa571d71939b7952ea5420e5af6a20a56699 Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Wed, 24 Jul 2024 11:25:52 -0700 Subject: [PATCH 053/116] updating examples for "blocking" array iteration API --- .../array_consumer_schedules.rs | 11 +- examples/array_examples/array_ops.rs | 56 ++--- examples/array_examples/array_put_get.rs | 2 +- .../array_examples/atomic_compare_exchange.rs | 36 ++-- examples/array_examples/dist_array_reduce.rs | 25 +-- .../array_examples/distributed_iteration.rs | 34 --- examples/array_examples/global_lock_array.rs | 19 +- examples/array_examples/local_iteration.rs | 111 ++-------- examples/bandwidths/atomic_array_get_bw.rs | 4 +- examples/bandwidths/atomic_array_put_bw.rs | 7 +- .../global_lock_atomic_array_get_bw.rs | 4 +- .../global_lock_atomic_array_put_bw.rs | 8 +- .../local_lock_atomic_array_get_bw.rs | 4 +- .../local_lock_atomic_array_put_bw.rs | 8 +- examples/bandwidths/readonly_array_get_bw.rs | 4 +- .../readonly_array_get_unchecked_bw.rs | 4 +- examples/bandwidths/unsafe_array_get_bw.rs | 4 +- .../unsafe_array_get_unchecked_bw.rs | 4 +- examples/bandwidths/unsafe_array_store_bw.rs | 4 +- .../hello_world_array_iteration.rs | 7 +- examples/kernels/dft_proxy.rs | 194 +++++++++-------- examples/kernels/parallel_array_gemm.rs | 31 +-- .../kernels/parallel_blocked_array_gemm.rs | 203 +++++++++--------- .../safe_parallel_blocked_array_gemm.rs | 154 +++++++------ examples/kernels/serial_array_gemm.rs | 30 +-- 25 files changed, 434 insertions(+), 534 deletions(-) diff --git a/examples/array_examples/array_consumer_schedules.rs b/examples/array_examples/array_consumer_schedules.rs index ae94fa65..6f8ff658 100644 --- a/examples/array_examples/array_consumer_schedules.rs +++ b/examples/array_examples/array_consumer_schedules.rs @@ -14,14 +14,13 @@ fn for_each_with_schedule( ) { let timer = Instant::now(); let tc = thread_cnts.clone(); - let _ = array + array .local_iter() .filter(|e| e.load() % 2 == 0) - .for_each_with_schedule(schedule, move |e| { + .blocking_for_each_with_schedule(schedule, move |e| { std::thread::sleep(Duration::from_millis((e.load() * 1) as u64)); *tc.lock().entry(std::thread::current().id()).or_insert(0) += 1; }); - array.wait_all(); array.barrier(); println!("elapsed time {:?}", timer.elapsed().as_secs_f64()); println!("counts {:?}", thread_cnts.lock()); @@ -96,7 +95,7 @@ fn sum_with_schedule( let result = array.block_on( array .local_iter() - .map(|e| e.load() ) + .map(|e| e.load()) .filter(|e| e % 2 == 0) .sum_with_schedule(schedule), ); @@ -110,10 +109,10 @@ fn main() { let _my_pe = world.my_pe(); let _num_pes = world.num_pes(); let block_array = AtomicArray::::new(world.team(), ARRAY_LEN, Distribution::Block); - let _ = block_array + block_array .dist_iter_mut() .enumerate() - .for_each(move |(i, e)| e.store(i)); + .blocking_for_each(move |(i, e)| e.store(i)); world.wait_all(); block_array.print(); diff --git 
a/examples/array_examples/array_ops.rs b/examples/array_examples/array_ops.rs index f5932a5a..98a8f760 100644 --- a/examples/array_examples/array_ops.rs +++ b/examples/array_examples/array_ops.rs @@ -84,9 +84,9 @@ fn test_add( init_val: T, add_val: T, ) { - let _ = array + array .dist_iter_mut() - .for_each(move |elem| elem.store(init_val)); + .blocking_for_each(move |elem| elem.store(init_val)); array.wait_all(); array.barrier(); array.print(); @@ -121,9 +121,9 @@ fn test_sub( init_val: T, sub_val: T, ) { - let _ = array + array .dist_iter_mut() - .for_each(move |elem| elem.store(init_val)); + .blocking_for_each(move |elem| elem.store(init_val)); array.wait_all(); array.barrier(); array.print(); @@ -152,9 +152,9 @@ fn test_mul( init_val: T, mul_val: T, ) { - let _ = array + array .dist_iter_mut() - .for_each(move |elem| elem.store(init_val)); + .blocking_for_each(move |elem| elem.store(init_val)); array.wait_all(); array.barrier(); array.print(); @@ -183,9 +183,9 @@ fn test_div( init_val: T, div_val: T, ) { - let _ = array + array .dist_iter_mut() - .for_each(move |elem| elem.store(init_val)); + .blocking_for_each(move |elem| elem.store(init_val)); array.wait_all(); array.barrier(); array.print(); @@ -214,9 +214,9 @@ fn test_rem( init_val: T, rem_val: T, ) { - let _ = array + array .dist_iter_mut() - .for_each(move |elem| elem.store(init_val)); + .blocking_for_each(move |elem| elem.store(init_val)); array.wait_all(); array.barrier(); array.print(); @@ -245,9 +245,9 @@ fn test_and( init_val: T, or_val: T, ) { - let _ = array + array .dist_iter_mut() - .for_each(move |elem| elem.store(init_val)); + .blocking_for_each(move |elem| elem.store(init_val)); array.wait_all(); array.barrier(); array.print(); @@ -295,9 +295,9 @@ fn test_or( array.barrier(); array.print(); array.barrier(); - let _ = array + array .dist_iter_mut() - .for_each(move |elem| elem.store(init_val)); + .blocking_for_each(move |elem| elem.store(init_val)); array.wait_all(); array.barrier(); let mut reqs = vec![]; @@ -317,9 +317,9 @@ fn test_xor( init_val: T, xor_val: T, ) { - let _ = array + array .dist_iter_mut() - .for_each(move |elem| elem.store(init_val)); + .blocking_for_each(move |elem| elem.store(init_val)); array.wait_all(); array.barrier(); array.print(); @@ -331,9 +331,9 @@ fn test_xor( array.barrier(); array.print(); array.barrier(); - let _ = array + array .dist_iter_mut() - .for_each(move |elem| elem.store(init_val)); + .blocking_for_each(move |elem| elem.store(init_val)); array.wait_all(); array.barrier(); let mut reqs = vec![]; @@ -355,9 +355,9 @@ fn test_store_load( my_pe: usize, num_pes: usize, ) { - let _ = array + array .dist_iter_mut() - .for_each(move |elem| elem.store(init_val)); + .blocking_for_each(move |elem| elem.store(init_val)); array.wait_all(); array.barrier(); array.print(); @@ -387,9 +387,9 @@ fn test_shl( init_val: T, shl_val: T, ) { - let _ = array + array .dist_iter_mut() - .for_each(move |elem| elem.store(init_val)); + .blocking_for_each(move |elem| elem.store(init_val)); array.wait_all(); array.barrier(); array.print(); @@ -418,9 +418,9 @@ fn test_shr( init_val: T, shr_val: T, ) { - let _ = array + array .dist_iter_mut() - .for_each(move |elem| elem.store(init_val)); + .blocking_for_each(move |elem| elem.store(init_val)); array.wait_all(); array.barrier(); array.print(); diff --git a/examples/array_examples/array_put_get.rs b/examples/array_examples/array_put_get.rs index 2160a0cd..11fa7694 100644 --- a/examples/array_examples/array_put_get.rs +++ b/examples/array_examples/array_put_get.rs 
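// --------------------------------------------------------------------------
// A minimal sketch (not a hunk of this patch) of the calling pattern this
// commit migrates the examples to. It assumes the `blocking_for_each` variant
// added to the iterator API earlier in this series; the world/array setup is
// borrowed from the surrounding examples and the array type is illustrative.
use lamellar::array::prelude::*;

fn main() {
    let world = lamellar::LamellarWorldBuilder::new().build();
    let my_pe = world.my_pe();
    let array = AtomicArray::<usize>::new(world.team(), 100, Distribution::Block);

    // previous pattern: launch the iteration, then explicitly wait on the array
    //     let _ = array.dist_iter_mut().for_each(move |elem| elem.store(my_pe));
    //     array.wait_all();

    // updated pattern: blocking_for_each drives the returned handle to
    // completion internally, so the separate wait_all() call is dropped
    array
        .dist_iter_mut()
        .blocking_for_each(move |elem| elem.store(my_pe));
    array.barrier();
}
// --------------------------------------------------------------------------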
@@ -2,7 +2,7 @@ use lamellar::array::prelude::*; use lamellar::memregion::prelude::*; fn initialize_array(array: &UnsafeArray) { - let _ = unsafe { array.dist_iter_mut().for_each(|x| *x = 0) }; + unsafe { array.dist_iter_mut().blocking_for_each(|x| *x = 0) }; array.wait_all(); array.barrier(); } diff --git a/examples/array_examples/atomic_compare_exchange.rs b/examples/array_examples/atomic_compare_exchange.rs index 94612ad1..c8804116 100644 --- a/examples/array_examples/atomic_compare_exchange.rs +++ b/examples/array_examples/atomic_compare_exchange.rs @@ -24,7 +24,7 @@ fn main() { let my_pe = world.my_pe(); let array = AtomicArray::::new(world.team(), num_pes * 2, Distribution::Block); - let _ = array.dist_iter_mut().for_each(|x| x.store(0)); //initialize array -- use atomic store + array.dist_iter_mut().blocking_for_each(|x| x.store(0)); //initialize array -- use atomic store array.wait_all(); array.barrier(); @@ -46,7 +46,7 @@ fn main() { array.print(); let array_2 = AtomicArray::::new(world.team(), num_pes * 100000, Distribution::Cyclic); - let _ = array_2.dist_iter_mut().for_each(|x| x.store(0.0)); + array_2.dist_iter_mut().blocking_for_each(|x| x.store(0.0)); array_2.wait_all(); array_2.barrier(); @@ -82,23 +82,25 @@ fn main() { } }); - let l = array.dist_iter().enumerate().for_each_async(move |(i, e)| { - let a2c = array_2.clone(); - async move { - let res = a2c - .compare_exchange_epsilon(i, e.load() as f32, 0.0, epsilon) - .await; - match res { - Ok(_) => { - println!("success"); - } - Err(_) => { - println!("failed"); + array + .dist_iter() + .enumerate() + .blocking_for_each_async(move |(i, e)| { + let a2c = array_2.clone(); + async move { + let res = a2c + .compare_exchange_epsilon(i, e.load() as f32, 0.0, epsilon) + .await; + match res { + Ok(_) => { + println!("success"); + } + Err(_) => { + println!("failed"); + } } } - } - }); - world.block_on(l); + }); println!("num_failed {num_failed} num_ok {num_ok}"); // array2.print(); } diff --git a/examples/array_examples/dist_array_reduce.rs b/examples/array_examples/dist_array_reduce.rs index 470208b9..d2fb8e6a 100644 --- a/examples/array_examples/dist_array_reduce.rs +++ b/examples/array_examples/dist_array_reduce.rs @@ -132,32 +132,23 @@ fn main() { "cyclic_sum {:?} cyclic time {:?}, block_sum {:?} block time {:?}", cyclic_sum, cyclic_dist_time, block_sum, block_dist_time ); - // for i in 0..total_len { - // block_array.add(i, 10); - // } - // block_array.for_each_mut(|x| *x += *x); - world.block_on(unsafe { cyclic_array.dist_iter_mut().for_each(|x| *x += *x) }); - world.block_on(unsafe { + unsafe { cyclic_array.dist_iter_mut().blocking_for_each(|x| *x += *x) }; + unsafe { cyclic_array .dist_iter() .enumerate() - .for_each(|x| println!("x: {:?}", x)) - }); - - // cyclic_array.dist_iter().for_each(|x| println!("x: {:?}", x)); + .blocking_for_each(|x| println!("x: {:?}", x)); + } - world.block_on(unsafe { + unsafe { block_array .dist_iter() .enumerate() - .for_each(|x| println!("x: {:?}", x)) - }); + .blocking_for_each(|x| println!("x: {:?}", x)) + }; let block_array = block_array.into_read_only(); - let _ = block_array.sum(); - // block_array.dist_iter().for_each(|x| println!("x: {:?}", x)); - // block_array.for_each(|x| println!("x: {:?}", x)); - // cyclic_array.for_each_mut(|x| *x += *x); + let _ = block_array.blocking_sum(); let one_elem_array = UnsafeArray::::new(world.team(), 1, Distribution::Block); let min = unsafe { one_elem_array.min() }; diff --git a/examples/array_examples/distributed_iteration.rs 
b/examples/array_examples/distributed_iteration.rs index 65fb6af1..86fea09c 100644 --- a/examples/array_examples/distributed_iteration.rs +++ b/examples/array_examples/distributed_iteration.rs @@ -63,23 +63,6 @@ fn main() { ) }); - // println!("zip "); - // block_array - // .dist_iter() - // .zip(cyclic_array.dist_iter()) - // .skip(2) - // .enumerate() - // .chunks(4) - // .step_by(3) - // .for_each(move |chunk| { - // println!("[pe({:?})-{:?}]", my_pe, std::thread::current().id(),); - // for (i, elem) in chunk { - // println!("i: {:?} {:?}", i, elem) - // } - // }); - // block_array.wait_all(); - // block_array.barrier(); - println!("--------------------------------------------------------"); println!("cyclic skip enumerate"); @@ -99,23 +82,6 @@ fn main() { println!("--------------------------------------------------------"); - // block_array - // .dist_iter() - // .chunks(7) - // .enumerate() - // .for_each(move |(i, chunk)| { - // let data = chunk.collect::>(); - // println!( - // "[pe({:?})-{:?}] chunk {:?} {:?}", - // my_pe, - // std::thread::current().id(), - // i, - // data - // ) - // }); - // block_array.wait_all(); - // block_array.barrier(); - println!("--------------------------------------------------------"); println!("cyclic enumerate map async for each"); cyclic_array.print(); diff --git a/examples/array_examples/global_lock_array.rs b/examples/array_examples/global_lock_array.rs index 6521432d..9d0fef63 100644 --- a/examples/array_examples/global_lock_array.rs +++ b/examples/array_examples/global_lock_array.rs @@ -47,14 +47,17 @@ fn main() { array.print(); println!("PE{my_pe} time: {:?} done", s.elapsed().as_secs_f64()); - let task = array.blocking_read_lock().dist_iter().enumerate().for_each(move |(i, elem)| { - println!( - "{my_pe}, {:?}: {i} {:?}", - std::thread::current().id(), - *elem - ) - }); - world.block_on(task); + array + .blocking_read_lock() + .dist_iter() + .enumerate() + .blocking_for_each(move |(i, elem)| { + println!( + "{my_pe}, {:?}: {i} {:?}", + std::thread::current().id(), + *elem + ) + }); world.barrier(); let task = array diff --git a/examples/array_examples/local_iteration.rs b/examples/array_examples/local_iteration.rs index a65df59a..5be210ad 100644 --- a/examples/array_examples/local_iteration.rs +++ b/examples/array_examples/local_iteration.rs @@ -21,16 +21,10 @@ fn main() { // we currently provide the "for_each" driver which will execute a closure on every element in the distributed array (concurrently) //for example lets initialize our arrays, where we store the value of my_pe to each local element a pe owns - let _ = block_local_iter + block_local_iter .enumerate() - .for_each(move |(i, elem)| elem.store(i)); - let _ = cyclic_local_iter.for_each(move |elem| elem.store(my_pe)); - //for_each is asynchronous so we must wait on the array for the operations to complete - // we are working on providing a request handle which can be used to check for completion - block_array.wait_all(); - block_array.barrier(); - cyclic_array.wait_all(); - cyclic_array.barrier(); + .blocking_for_each(move |(i, elem)| elem.store(i)); + cyclic_local_iter.blocking_for_each(move |elem| elem.store(my_pe)); // let block_array = block_array.into_read_only(); block_array.print(); @@ -41,12 +35,12 @@ fn main() { println!("--------------------------------------------------------"); println!("block skip enumerate step_by"); - let _ = block_array + block_array .local_iter() .skip(2) .enumerate() .step_by(3) - .for_each(move |(i, elem)| { + .blocking_for_each(move |(i, elem)| { 
println!( "[pe({:?})-{:?}] i: {:?} {:?}", my_pe, @@ -55,34 +49,16 @@ fn main() { elem ) }); - block_array.wait_all(); block_array.barrier(); - // println!("zip "); - // block_array - // .local_iter() - // .zip(cyclic_array.local_iter()) - // .skip(2) - // .enumerate() - // .chunks(4) - // .step_by(3) - // .for_each(move |chunk| { - // println!("[pe({:?})-{:?}]", my_pe, std::thread::current().id(),); - // for (i, elem) in chunk { - // println!("i: {:?} {:?}", i, elem) - // } - // }); - // block_array.wait_all(); - // block_array.barrier(); - println!("--------------------------------------------------------"); println!("cyclic skip enumerate"); - let _ = cyclic_array + cyclic_array .local_iter() .enumerate() .skip(2) - .for_each(move |(i, elem)| { + .blocking_for_each(move |(i, elem)| { println!( "[pe({:?})-{:?}] i: {:?} {:?}", my_pe, @@ -91,33 +67,15 @@ fn main() { elem ) }); - cyclic_array.wait_all(); cyclic_array.barrier(); println!("--------------------------------------------------------"); - // block_array - // .local_iter() - // .chunks(7) - // .enumerate() - // .for_each(move |(i, chunk)| { - // let data = chunk.collect::>(); - // println!( - // "[pe({:?})-{:?}] chunk {:?} {:?}", - // my_pe, - // std::thread::current().id(), - // i, - // data - // ) - // }); - // block_array.wait_all(); - // block_array.barrier(); - println!("--------------------------------------------------------"); println!("cyclic enumerate map async for each"); cyclic_array.print(); let barray = block_array.clone(); - let _ = cyclic_array + cyclic_array .local_iter() .enumerate() .map(move |(i, elem)| { @@ -131,7 +89,7 @@ fn main() { ); async move { (i, elem.load(), barray.load(i).await + elem.load()) } }) - .for_each_async(move |i| async move { + .blocking_for_each_async(move |i| async move { println!( "[pe({:?})-{:?}] {:?}", my_pe, @@ -139,33 +97,12 @@ fn main() { i.await ); }); - cyclic_array.wait_all(); cyclic_array.barrier(); block_array.print(); - // println!("--------------------------------------------------------"); - // println!("cyclic enumerate map async collect"); - // let barray = block_array.clone(); - // let new_array = world.block_on( - // cyclic_array - // .local_iter() - // .enumerate() - // .map(move |(i, elem)| { - // let barray = barray.clone(); - // async move { - // barray.add(i, *elem).await; - // barray.fetch_sub(i, *elem).await - // } - // }) - // .collect_async::, _>(Distribution::Block), - // ); - // cyclic_array.barrier(); - // new_array.print(); - // block_array.print(); - println!("--------------------------------------------------------"); println!("block enumerate filter"); - let _ = block_array + block_array .local_iter() .enumerate() .filter(|(_, elem)| { @@ -177,7 +114,7 @@ fn main() { ); elem.load() % 4 == 0 }) - .for_each(move |(i, elem)| { + .blocking_for_each(move |(i, elem)| { println!( "[pe({:?})-{:?}] i: {:?} {:?}", my_pe, @@ -186,12 +123,11 @@ fn main() { elem ) }); - block_array.wait_all(); block_array.barrier(); println!("--------------------------------------------------------"); println!("block enumerate filter_map"); - let _ = block_array + block_array .local_iter() .enumerate() .filter_map(|(i, elem)| { @@ -201,7 +137,7 @@ fn main() { None } }) - .for_each(move |(i, elem)| { + .blocking_for_each(move |(i, elem)| { println!( "[pe({:?})-{:?}] i: {:?} {:?}", my_pe, @@ -210,7 +146,6 @@ fn main() { elem ) }); - block_array.wait_all(); block_array.barrier(); // println!("--------------------------------------------------------"); // println!("filter_map collect"); @@ 
-226,11 +161,11 @@ fn main() { println!("--------------------------------------------------------"); println!("block skip enumerate"); - let _ = block_array + block_array .local_iter() .skip(10) .enumerate() - .for_each(move |(i, elem)| { + .blocking_for_each(move |(i, elem)| { println!( "[pe({:?})-{:?}] i: {:?} {:?}", my_pe, @@ -240,17 +175,16 @@ fn main() { ) }); - block_array.wait_all(); block_array.barrier(); println!("--------------------------------------------------------"); println!("block skip step_by enumerate"); - let _ = block_array + block_array .local_iter() .skip(10) .step_by(3) .enumerate() - .for_each(move |(i, elem)| { + .blocking_for_each(move |(i, elem)| { println!( "[pe({:?})-{:?}] i: {:?} {:?}", my_pe, @@ -260,17 +194,16 @@ fn main() { ) }); - block_array.wait_all(); block_array.barrier(); println!("--------------------------------------------------------"); println!("block take skip enumerate"); - let _ = block_array + block_array .local_iter() .take(60) .skip(10) .enumerate() - .for_each(move |(i, elem)| { + .blocking_for_each(move |(i, elem)| { println!( "[pe({:?})-{:?}] i: {:?} {:?}", my_pe, @@ -280,18 +213,17 @@ fn main() { ) }); - block_array.wait_all(); block_array.barrier(); println!("--------------------------------------------------------"); println!("block take skip take enumerate"); - let _ = block_array + block_array .local_iter() .take(60) .skip(10) .take(30) .enumerate() - .for_each(move |(i, elem)| { + .blocking_for_each(move |(i, elem)| { println!( "[pe({:?})-{:?}] i: {:?} {:?}", my_pe, @@ -301,7 +233,6 @@ fn main() { ) }); - block_array.wait_all(); block_array.barrier(); println!("--------------------------------------------------------"); diff --git a/examples/bandwidths/atomic_array_get_bw.rs b/examples/bandwidths/atomic_array_get_bw.rs index 9d607594..8d26bc4e 100644 --- a/examples/bandwidths/atomic_array_get_bw.rs +++ b/examples/bandwidths/atomic_array_get_bw.rs @@ -20,9 +20,9 @@ fn main() { *i = my_pe as u8; } } - let _ = array + array .local_iter_mut() - .for_each(move |elem| *elem = num_pes as u8); //this is pretty slow for atomic arrays as we perform an atomic store for 2^30 elements, so use locallock for initializiation + .blocking_for_each(move |elem| *elem = num_pes as u8); //this is pretty slow for atomic arrays as we perform an atomic store for 2^30 elements, so use locallock for initializiation let array = array.into_atomic(); //this enforces a wait_all and barrier // array.wait_all(); // array.barrier(); diff --git a/examples/bandwidths/atomic_array_put_bw.rs b/examples/bandwidths/atomic_array_put_bw.rs index f621abb1..f05a24b3 100644 --- a/examples/bandwidths/atomic_array_put_bw.rs +++ b/examples/bandwidths/atomic_array_put_bw.rs @@ -20,9 +20,9 @@ fn main() { *i = my_pe as u8; } } - let _ = array + array .dist_iter_mut() - .for_each(move |elem| *elem = 255 as u8); //this is can be pretty slow for atomic arrays as we perform an atomic store for 2^30 elements, local lock tends to perform better + .blocking_for_each(move |elem| *elem = 255 as u8); //this is can be pretty slow for atomic arrays as we perform an atomic store for 2^30 elements, local lock tends to perform better let mut array = array.into_atomic(); //so we simply convert the LocalLockArray array to atomic after initalization world.barrier(); @@ -103,7 +103,8 @@ fn main() { // } // }; let temp = array.into_local_lock(); - let _ = temp.dist_iter_mut().for_each(move |elem| *elem = 255 as u8); //this is pretty slow for atomic arrays as we perform an atomic store for 2^30 
elements + temp.dist_iter_mut() + .blocking_for_each(move |elem| *elem = 255 as u8); //this is pretty slow for atomic arrays as we perform an atomic store for 2^30 elements array = temp.into_atomic(); world.barrier(); } diff --git a/examples/bandwidths/global_lock_atomic_array_get_bw.rs b/examples/bandwidths/global_lock_atomic_array_get_bw.rs index 496be0ef..9f1d1231 100644 --- a/examples/bandwidths/global_lock_atomic_array_get_bw.rs +++ b/examples/bandwidths/global_lock_atomic_array_get_bw.rs @@ -24,9 +24,9 @@ fn main() { // *i = num_pes as u8; // } } - let _ = array + array .dist_iter_mut() - .for_each(move |elem| *elem = num_pes as u8); + .blocking_for_each(move |elem| *elem = num_pes as u8); array.wait_all(); array.barrier(); diff --git a/examples/bandwidths/global_lock_atomic_array_put_bw.rs b/examples/bandwidths/global_lock_atomic_array_put_bw.rs index 2c072a7e..e7f33500 100644 --- a/examples/bandwidths/global_lock_atomic_array_put_bw.rs +++ b/examples/bandwidths/global_lock_atomic_array_put_bw.rs @@ -21,9 +21,9 @@ fn main() { *i = my_pe as u8; } } - let _ = array + array .dist_iter_mut() - .for_each(move |elem| *elem = 255 as u8); + .blocking_for_each(move |elem| *elem = 255 as u8); array.wait_all(); array.barrier(); @@ -96,9 +96,9 @@ fn main() { ); } bws.push((sum as f64 / 1048576.0) / cur_t); - let _ = array + array .dist_iter_mut() - .for_each(move |elem| *elem = 255 as u8); + .blocking_for_each(move |elem| *elem = 255 as u8); array.wait_all(); array.barrier(); } diff --git a/examples/bandwidths/local_lock_atomic_array_get_bw.rs b/examples/bandwidths/local_lock_atomic_array_get_bw.rs index 94686541..c160c541 100644 --- a/examples/bandwidths/local_lock_atomic_array_get_bw.rs +++ b/examples/bandwidths/local_lock_atomic_array_get_bw.rs @@ -24,9 +24,9 @@ fn main() { // *i = num_pes as u8; // } } - let _ = array + array .dist_iter_mut() - .for_each(move |elem| *elem = num_pes as u8); + .blocking_for_each(move |elem| *elem = num_pes as u8); array.wait_all(); array.barrier(); diff --git a/examples/bandwidths/local_lock_atomic_array_put_bw.rs b/examples/bandwidths/local_lock_atomic_array_put_bw.rs index fe4861f9..ce376976 100644 --- a/examples/bandwidths/local_lock_atomic_array_put_bw.rs +++ b/examples/bandwidths/local_lock_atomic_array_put_bw.rs @@ -21,9 +21,9 @@ fn main() { *i = my_pe as u8; } } - let _ = array + array .dist_iter_mut() - .for_each(move |elem| *elem = 255 as u8); + .blocking_for_each(move |elem| *elem = 255 as u8); array.wait_all(); array.barrier(); @@ -96,9 +96,9 @@ fn main() { ); } bws.push((sum as f64 / 1048576.0) / cur_t); - let _ = array + array .dist_iter_mut() - .for_each(move |elem| *elem = 255 as u8); + .blocking_for_each(move |elem| *elem = 255 as u8); array.wait_all(); array.barrier(); } diff --git a/examples/bandwidths/readonly_array_get_bw.rs b/examples/bandwidths/readonly_array_get_bw.rs index e3c6d53a..adeaecbc 100644 --- a/examples/bandwidths/readonly_array_get_bw.rs +++ b/examples/bandwidths/readonly_array_get_bw.rs @@ -19,9 +19,9 @@ fn main() { for i in data.as_mut_slice().unwrap() { *i = my_pe as u8; } - let _ = array + array .dist_iter_mut() - .for_each(move |elem| *elem = num_pes as u8); + .blocking_for_each(move |elem| *elem = num_pes as u8); } array.wait_all(); diff --git a/examples/bandwidths/readonly_array_get_unchecked_bw.rs b/examples/bandwidths/readonly_array_get_unchecked_bw.rs index dbc52559..3163808a 100644 --- a/examples/bandwidths/readonly_array_get_unchecked_bw.rs +++ b/examples/bandwidths/readonly_array_get_unchecked_bw.rs @@ -23,9 
+23,9 @@ fn main() { // } } unsafe { - let _ = array + array .local_iter_mut() - .for_each(move |elem| *elem = num_pes as u8); + .blocking_for_each(move |elem| *elem = num_pes as u8); } array.wait_all(); array.barrier(); diff --git a/examples/bandwidths/unsafe_array_get_bw.rs b/examples/bandwidths/unsafe_array_get_bw.rs index 3de3afa4..0cf35d0d 100644 --- a/examples/bandwidths/unsafe_array_get_bw.rs +++ b/examples/bandwidths/unsafe_array_get_bw.rs @@ -20,9 +20,9 @@ fn main() { *i = my_pe as u8; } - let _ = array + array .local_iter_mut() - .for_each(move |elem| *elem = num_pes as u8); + .blocking_for_each(move |elem| *elem = num_pes as u8); } array.wait_all(); array.barrier(); diff --git a/examples/bandwidths/unsafe_array_get_unchecked_bw.rs b/examples/bandwidths/unsafe_array_get_unchecked_bw.rs index 7985560b..d64c7999 100644 --- a/examples/bandwidths/unsafe_array_get_unchecked_bw.rs +++ b/examples/bandwidths/unsafe_array_get_unchecked_bw.rs @@ -18,9 +18,9 @@ fn main() { for i in data.as_mut_slice().unwrap() { *i = my_pe as u8; } - let _ = array + array .dist_iter_mut() - .for_each(move |elem| *elem = num_pes as u8); + .blocking_for_each(move |elem| *elem = num_pes as u8); } array.wait_all(); diff --git a/examples/bandwidths/unsafe_array_store_bw.rs b/examples/bandwidths/unsafe_array_store_bw.rs index ca6e84c4..65aec25c 100644 --- a/examples/bandwidths/unsafe_array_store_bw.rs +++ b/examples/bandwidths/unsafe_array_store_bw.rs @@ -19,9 +19,9 @@ fn main() { for i in data.as_mut_slice().unwrap() { *i = my_pe as u8; } - let _ = array + array .dist_iter_mut() - .for_each(move |elem| *elem = num_pes as u8); + .blocking_for_each(move |elem| *elem = num_pes as u8); } array.wait_all(); array.barrier(); diff --git a/examples/hello_world/hello_world_array_iteration.rs b/examples/hello_world/hello_world_array_iteration.rs index 2cc4d97a..80eff712 100644 --- a/examples/hello_world/hello_world_array_iteration.rs +++ b/examples/hello_world/hello_world_array_iteration.rs @@ -25,10 +25,10 @@ fn main() { world.barrier(); //wait for PE 0 to finish printing //initialize array, each PE will set its local elements equal to its ID - let request = array + array .dist_iter_mut() //create a mutable distributed iterator (i.e. 
data parallel iteration, similar to Rayon par_iter()) .enumerate() // enumeration with respect to the global array - .for_each(move |(i, elem)| { + .blocking_for_each(move |(i, elem)| { println!( "PE {:?} setting array[{:?}]={:?} using thread {:?}", my_pe, @@ -39,9 +39,6 @@ fn main() { elem.store(my_pe); //"store" because this is an AtomicArray }); - //wait for local iteration to complete - world.block_on(request); - //wait for all pes to finish world.barrier(); diff --git a/examples/kernels/dft_proxy.rs b/examples/kernels/dft_proxy.rs index ac761824..02711900 100644 --- a/examples/kernels/dft_proxy.rs +++ b/examples/kernels/dft_proxy.rs @@ -335,10 +335,10 @@ fn dft_lamellar_array(signal: UnsafeArray, spectrum: UnsafeArray) -> f let timer = Instant::now(); let signal_clone = signal.clone(); unsafe { - let _ = spectrum + spectrum .dist_iter_mut() .enumerate() - .for_each(move |(k, spec_bin)| { + .blocking_for_each(move |(k, spec_bin)| { let mut sum = 0f64; for (i, &x) in signal_clone .buffered_onesided_iter(1000) @@ -353,7 +353,6 @@ fn dft_lamellar_array(signal: UnsafeArray, spectrum: UnsafeArray) -> f *spec_bin = sum }); } - spectrum.wait_all(); spectrum.barrier(); timer.elapsed().as_secs_f64() } @@ -369,7 +368,7 @@ fn dft_lamellar_array_2(signal: ReadOnlyArray, spectrum: AtomicArray) let _ = spectrum .dist_iter_mut() .enumerate() - .for_each(move |(k, spec_bin)| { + .blocking_for_each(move |(k, spec_bin)| { let mut sum = 0f64; for (i, &x) in signal_clone .buffered_onesided_iter(1000) @@ -383,7 +382,6 @@ fn dft_lamellar_array_2(signal: ReadOnlyArray, spectrum: AtomicArray) } spec_bin.store(sum); }); - spectrum.wait_all(); spectrum.barrier(); timer.elapsed().as_secs_f64() } @@ -393,22 +391,25 @@ fn dft_lamellar_array_swapped(signal: UnsafeArray, spectrum: UnsafeArray f64 { let timer = Instant::now(); let sig_len = signal.len(); + let mut reqs = vec![]; unsafe { signal .onesided_iter() @@ -432,28 +434,30 @@ fn dft_lamellar_array_opt( .enumerate() .for_each(|(i, chunk)| { let signal = chunk.clone(); - let _ = spectrum - .dist_iter_mut() - .enumerate() - .for_each(move |(k, spec_bin)| { - let mut sum = 0f64; - for (j, &x) in signal - .iter() - .enumerate() - .map(|(j, x)| (j + i * buf_size, x)) - { - let angle = -1f64 * (j * k) as f64 * 2f64 * std::f64::consts::PI - / sig_len as f64; - let twiddle = angle * (angle.cos() + angle * angle.sin()); - sum = sum + twiddle * x; - } - - // let _lock = LOCK.lock(); - *spec_bin += sum; - }); + reqs.push( + spectrum + .dist_iter_mut() + .enumerate() + .for_each(move |(k, spec_bin)| { + let mut sum = 0f64; + for (j, &x) in signal + .iter() + .enumerate() + .map(|(j, x)| (j + i * buf_size, x)) + { + let angle = -1f64 * (j * k) as f64 * 2f64 * std::f64::consts::PI + / sig_len as f64; + let twiddle = angle * (angle.cos() + angle * angle.sin()); + sum = sum + twiddle * x; + } + + // let _lock = LOCK.lock(); + *spec_bin += sum; + }), + ); }); } - spectrum.wait_all(); + spectrum.block_on_all(reqs); spectrum.barrier(); timer.elapsed().as_secs_f64() } @@ -465,6 +469,7 @@ fn dft_lamellar_array_opt_test( ) -> f64 { let timer = Instant::now(); let sig_len = signal.len(); + let mut reqs = vec![]; unsafe { signal .onesided_iter() @@ -474,7 +479,7 @@ fn dft_lamellar_array_opt_test( .enumerate() .for_each(|(i, chunk)| { let signal = chunk.clone(); - let _ = spectrum.dist_iter_mut().enumerate().for_each_with_schedule( + reqs.push(spectrum.dist_iter_mut().enumerate().for_each_with_schedule( Schedule::Dynamic, move |(k, spec_bin)| { let mut sum = 0f64; @@ -492,10 +497,10 @@ 
fn dft_lamellar_array_opt_test( // let _lock = LOCK.lock(); *spec_bin += sum; }, - ); + )); }); } - spectrum.wait_all(); + spectrum.block_on_all(reqs); spectrum.barrier(); timer.elapsed().as_secs_f64() } @@ -508,6 +513,7 @@ fn dft_lamellar_array_opt_2( ) -> f64 { let timer = Instant::now(); let sig_len = signal.len(); + let mut reqs = vec![]; signal .onesided_iter() .chunks(buf_size) @@ -516,27 +522,29 @@ fn dft_lamellar_array_opt_2( .enumerate() .for_each(|(i, chunk)| { let signal = chunk.clone(); - let _ = spectrum - .dist_iter_mut() - .enumerate() - .for_each(move |(k, mut spec_bin)| { - let mut sum = 0f64; - unsafe { - for (j, &x) in signal - .iter() - .enumerate() - .map(|(j, x)| (j + i * buf_size, x)) - { - let angle = -1f64 * (j * k) as f64 * 2f64 * std::f64::consts::PI - / sig_len as f64; - let twiddle = angle * (angle.cos() + angle * angle.sin()); - sum = sum + twiddle * x; + reqs.push( + spectrum + .dist_iter_mut() + .enumerate() + .for_each(move |(k, mut spec_bin)| { + let mut sum = 0f64; + unsafe { + for (j, &x) in signal + .iter() + .enumerate() + .map(|(j, x)| (j + i * buf_size, x)) + { + let angle = -1f64 * (j * k) as f64 * 2f64 * std::f64::consts::PI + / sig_len as f64; + let twiddle = angle * (angle.cos() + angle * angle.sin()); + sum = sum + twiddle * x; + } } - } - spec_bin += sum; - }); + spec_bin += sum; + }), + ); }); - spectrum.wait_all(); + spectrum.block_on_all(reqs); spectrum.barrier(); timer.elapsed().as_secs_f64() } @@ -549,6 +557,7 @@ fn dft_lamellar_array_opt_3( ) -> f64 { let timer = Instant::now(); let sig_len = signal.len(); + let mut reqs = vec![]; signal .onesided_iter() .chunks(buf_size) @@ -557,28 +566,30 @@ fn dft_lamellar_array_opt_3( .enumerate() .for_each(|(i, chunk)| { let signal = chunk.clone(); - let _ = spectrum - .dist_iter_mut() //this locks the LocalLockArray - .enumerate() - .for_each(move |(k, spec_bin)| { - //we are accessing each element independently so free to mutate - let mut sum = 0f64; - unsafe { - for (j, &x) in signal - .iter() - .enumerate() - .map(|(j, x)| (j + i * buf_size, x)) - { - let angle = -1f64 * (j * k) as f64 * 2f64 * std::f64::consts::PI - / sig_len as f64; - let twiddle = angle * (angle.cos() + angle * angle.sin()); - sum = sum + twiddle * x; + reqs.push( + spectrum + .dist_iter_mut() //this locks the LocalLockArray + .enumerate() + .for_each(move |(k, spec_bin)| { + //we are accessing each element independently so free to mutate + let mut sum = 0f64; + unsafe { + for (j, &x) in signal + .iter() + .enumerate() + .map(|(j, x)| (j + i * buf_size, x)) + { + let angle = -1f64 * (j * k) as f64 * 2f64 * std::f64::consts::PI + / sig_len as f64; + let twiddle = angle * (angle.cos() + angle * angle.sin()); + sum = sum + twiddle * x; + } } - } - *spec_bin += sum; - }); + *spec_bin += sum; + }), + ); }); - spectrum.wait_all(); + spectrum.block_on_all(reqs); spectrum.barrier(); timer.elapsed().as_secs_f64() } @@ -639,11 +650,10 @@ fn main() { *i = rng.gen_range(0.0..1.0); } let full_signal_clone = full_signal.clone(); - let _ = full_signal_array + full_signal_array .dist_iter_mut() .enumerate() - .for_each(move |(i, x)| *x = full_signal_clone.as_mut_slice().unwrap()[i]); - full_signal_array.wait_all(); + .blocking_for_each(move |(i, x)| *x = full_signal_clone.as_mut_slice().unwrap()[i]); full_signal_array.barrier(); partial_spectrum.put(my_pe, 0, full_spectrum.sub_region(0..array_len)); @@ -756,9 +766,9 @@ fn main() { //--------------lamellar array-------------------------- unsafe { - let _ = full_spectrum_array + 
full_spectrum_array .dist_iter_mut() - .for_each(|elem| *elem = 0.0); + .blocking_for_each(|elem| *elem = 0.0); } full_spectrum_array.wait_all(); full_spectrum_array.barrier(); @@ -804,9 +814,9 @@ fn main() { //------------optimized lamellar array---------------- unsafe { - let _ = full_spectrum_array + full_spectrum_array .dist_iter_mut() - .for_each(|elem| *elem = 0.0); + .blocking_for_each(|elem| *elem = 0.0); } full_spectrum_array.wait_all(); full_spectrum_array.barrier(); @@ -823,9 +833,9 @@ fn main() { //--------------lamellar array-------------------------- unsafe { - let _ = full_spectrum_array + full_spectrum_array .dist_iter_mut() - .for_each(|elem| *elem = 0.0); + .blocking_for_each(|elem| *elem = 0.0); } full_spectrum_array.wait_all(); full_spectrum_array.barrier(); @@ -871,9 +881,9 @@ fn main() { // )); world.barrier(); - let _ = full_spectrum_array + full_spectrum_array .dist_iter_mut() - .for_each(|elem| elem.store(0.0)); + .blocking_for_each(|elem| elem.store(0.0)); full_spectrum_array.wait_all(); full_spectrum_array.barrier(); // let timer = Instant::now(); @@ -898,9 +908,9 @@ fn main() { )); world.barrier(); - let _ = full_spectrum_array + full_spectrum_array .dist_iter_mut() - .for_each(|elem| *elem = 0.0); + .blocking_for_each(|elem| *elem = 0.0); full_spectrum_array.wait_all(); full_spectrum_array.barrier(); if my_pe == 0 { diff --git a/examples/kernels/parallel_array_gemm.rs b/examples/kernels/parallel_array_gemm.rs index 8cc2085f..6f027b5f 100644 --- a/examples/kernels/parallel_array_gemm.rs +++ b/examples/kernels/parallel_array_gemm.rs @@ -33,22 +33,23 @@ fn main() { let c = AtomicArray::::new(&world, m * p, Distribution::Block); //row major //initialize matrices - let _ = a - .dist_iter_mut() + a.dist_iter_mut() .enumerate() - .for_each(|(i, x)| *x = i as f32); - let _ = b.dist_iter_mut().enumerate().for_each(move |(i, x)| { - //need global index so use dist_iter - //identity matrix - let row = i / dim; - let col = i % dim; - if row == col { - *x = 1 as f32 - } else { - *x = 0 as f32; - } - }); - let _ = c.dist_iter_mut().for_each(|x| x.store(0.0)); + .blocking_for_each(|(i, x)| *x = i as f32); + b.dist_iter_mut() + .enumerate() + .blocking_for_each(move |(i, x)| { + //need global index so use dist_iter + //identity matrix + let row = i / dim; + let col = i % dim; + if row == col { + *x = 1 as f32 + } else { + *x = 0 as f32; + } + }); + c.dist_iter_mut().blocking_for_each(|x| x.store(0.0)); world.wait_all(); world.barrier(); diff --git a/examples/kernels/parallel_blocked_array_gemm.rs b/examples/kernels/parallel_blocked_array_gemm.rs index 1c8382a2..3f948b36 100644 --- a/examples/kernels/parallel_blocked_array_gemm.rs +++ b/examples/kernels/parallel_blocked_array_gemm.rs @@ -40,22 +40,22 @@ fn main() { let b = LocalLockArray::::new(&world, n * p, Distribution::Block); //col major let c = AtomicArray::::new(&world, m * p, Distribution::Block); //row major //initialize - let _ = a - .dist_iter_mut() + a.dist_iter_mut() .enumerate() - .for_each(|(i, x)| *x = i as f32); - let _ = b.dist_iter_mut().enumerate().for_each(move |(i, x)| { - //identity matrix - let row = i / dim; - let col = i % dim; - if row == col { - *x = 1 as f32 - } else { - *x = 0 as f32; - } - }); - let _ = c.dist_iter_mut().for_each(|x| x.store(0.0)); - world.wait_all(); + .blocking_for_each(|(i, x)| *x = i as f32); + b.dist_iter_mut() + .enumerate() + .blocking_for_each(move |(i, x)| { + //identity matrix + let row = i / dim; + let col = i % dim; + if row == col { + *x = 1 as f32 + } else { + *x = 0 
as f32; + } + }); + c.dist_iter_mut().blocking_for_each(|x| x.store(0.0)); world.barrier(); let a = a.into_read_only(); let b = b.into_read_only(); @@ -74,104 +74,105 @@ fn main() { // we can then call dist_iter() on this array to iterate over the range in parallel on each PE let nblks_array = LocalLockArray::::new(&world, (n_blks * n_blks) * num_pes, Distribution::Block); - world.block_on( - nblks_array - .dist_iter_mut() - .enumerate() - .for_each(move |(g_i, x)| { - let i = g_i % (n_blks * n_blks); - x.j = i / n_blks; - x.k = i % n_blks - }), - ); + + nblks_array + .dist_iter_mut() + .enumerate() + .blocking_for_each(move |(g_i, x)| { + let i = g_i % (n_blks * n_blks); + x.j = i / n_blks; + x.k = i % n_blks + }); let nblks_array = nblks_array.into_read_only(); let start = std::time::Instant::now(); let a_clone = a.clone(); let b_clone = b.clone(); let c_clone = c.clone(); - let _ = nblks_array.dist_iter().for_each_async(move |block| { - let b = b_clone.clone(); - let a: ReadOnlyArray = a_clone.clone(); - let c = c_clone.clone(); - async move { - //iterate over the submatrix cols of b, use dist_iter() so that we can launch transfers in parallel - // iterate over submatrix rows of b - let j_blk = block.j; - let k_blk = block.k; - // println!("j_blk: {}, k_blk: {}", j_blk, k_blk); - let b_block = b - .onesided_iter() // OneSidedIterator (each pe will iterate through entirety of b) - .chunks(blocksize) //chunks columns by blocksize -- manages efficent transfer and placement of data into a local memory region - .skip(k_blk * n_blks * blocksize + j_blk) // skip previously transfered submatrices - .step_by(n_blks) //grab chunk from next column in submatrix - // .buffered(100) - .into_stream() // convert to normal rust iterator - .take(blocksize) // we only need to take blocksize columns - .collect::>() - .await; //gather local memory regions containing each columns data - // println!("here"); - //need to store the submatrix in a contiguous memory segment for use with the MatrixMultiply library - let mut b_block_vec = vec![0.0; blocksize * blocksize]; - for (j, col) in b_block.iter().enumerate() { - //(index, LocalMemRegion) - let b_block_col = &mut b_block_vec[j * blocksize..(j + 1) * blocksize]; - b_block_col.copy_from_slice(unsafe { col.as_slice().unwrap() }); - } - let b_block_vec = Arc::new(b_block_vec); //we will be sharing this submatrix in multiple tasks - //-------------- - - for i_blk in 0..m_blks_pe { - // iterate of the local submatrix rows of a - let b_block_vec = b_block_vec.clone(); - let a_vec = a - .local_as_slice() - .chunks(blocksize) - .skip(i_blk * m_blks * blocksize + k_blk) //skip previously visited submatrices - .step_by(m_blks) //grab chunk from the next row in submatrix - .take(blocksize) //we only need to take blocksize rows - .flatten() - .copied() //get values instead of references - .collect::>(); - // ------------------------------- - let mut c_vec = vec![0.0; blocksize * blocksize]; // MatrixMultiple lib stores result in a contiguous memory segment - unsafe { - sgemm( - blocksize, - blocksize, - blocksize, - 1.0, - a_vec.as_ptr(), - blocksize as isize, - 1, - b_block_vec.as_ptr(), - 1, - blocksize as isize, - 0.0, - c_vec.as_mut_ptr(), - blocksize as isize, - 1, - ); + nblks_array + .dist_iter() + .blocking_for_each_async(move |block| { + let b = b_clone.clone(); + let a: ReadOnlyArray = a_clone.clone(); + let c = c_clone.clone(); + async move { + //iterate over the submatrix cols of b, use dist_iter() so that we can launch transfers in parallel + // iterate over 
submatrix rows of b + let j_blk = block.j; + let k_blk = block.k; + // println!("j_blk: {}, k_blk: {}", j_blk, k_blk); + let b_block = b + .onesided_iter() // OneSidedIterator (each pe will iterate through entirety of b) + .chunks(blocksize) //chunks columns by blocksize -- manages efficent transfer and placement of data into a local memory region + .skip(k_blk * n_blks * blocksize + j_blk) // skip previously transfered submatrices + .step_by(n_blks) //grab chunk from next column in submatrix + // .buffered(100) + .into_stream() // convert to normal rust iterator + .take(blocksize) // we only need to take blocksize columns + .collect::>() + .await; //gather local memory regions containing each columns data + // println!("here"); + //need to store the submatrix in a contiguous memory segment for use with the MatrixMultiply library + let mut b_block_vec = vec![0.0; blocksize * blocksize]; + for (j, col) in b_block.iter().enumerate() { + //(index, LocalMemRegion) + let b_block_col = &mut b_block_vec[j * blocksize..(j + 1) * blocksize]; + b_block_col.copy_from_slice(unsafe { col.as_slice().unwrap() }); } + let b_block_vec = Arc::new(b_block_vec); //we will be sharing this submatrix in multiple tasks + //-------------- + + for i_blk in 0..m_blks_pe { + // iterate of the local submatrix rows of a + let b_block_vec = b_block_vec.clone(); + let a_vec = a + .local_as_slice() + .chunks(blocksize) + .skip(i_blk * m_blks * blocksize + k_blk) //skip previously visited submatrices + .step_by(m_blks) //grab chunk from the next row in submatrix + .take(blocksize) //we only need to take blocksize rows + .flatten() + .copied() //get values instead of references + .collect::>(); + // ------------------------------- + let mut c_vec = vec![0.0; blocksize * blocksize]; // MatrixMultiple lib stores result in a contiguous memory segment + unsafe { + sgemm( + blocksize, + blocksize, + blocksize, + 1.0, + a_vec.as_ptr(), + blocksize as isize, + 1, + b_block_vec.as_ptr(), + 1, + blocksize as isize, + 0.0, + c_vec.as_mut_ptr(), + blocksize as isize, + 1, + ); + } - let c_slice = c.mut_local_data(); + let c_slice = c.mut_local_data(); - for row in 0..blocksize { - let row_offset = (i_blk * blocksize + row) * n; - for col in 0..blocksize { - let col_offset = j_blk * blocksize + col; - c_slice - .at(row_offset + col_offset) - .fetch_add(c_vec[row * blocksize + col]); - //we know all updates to c are local so directly update the raw data - // we could use the array.add interface by calculating the global index: let g_i_blk = i_blk + my_pe *m_blks_pe; and replacing it in row_offset - // c.add(row_offset+col_offset,c_vec[row*blocksize + col]); -- but some overheads are introduce from PGAS calculations performed by the runtime, and since its all local updates we can avoid them + for row in 0..blocksize { + let row_offset = (i_blk * blocksize + row) * n; + for col in 0..blocksize { + let col_offset = j_blk * blocksize + col; + c_slice + .at(row_offset + col_offset) + .fetch_add(c_vec[row * blocksize + col]); + //we know all updates to c are local so directly update the raw data + // we could use the array.add interface by calculating the global index: let g_i_blk = i_blk + my_pe *m_blks_pe; and replacing it in row_offset + // c.add(row_offset+col_offset,c_vec[row*blocksize + col]); -- but some overheads are introduce from PGAS calculations performed by the runtime, and since its all local updates we can avoid them + } } } } - } - // } - }); + // } + }); world.wait_all(); world.barrier(); let elapsed = 
start.elapsed().as_secs_f64(); diff --git a/examples/kernels/safe_parallel_blocked_array_gemm.rs b/examples/kernels/safe_parallel_blocked_array_gemm.rs index 6c7b6054..a85f5ed7 100644 --- a/examples/kernels/safe_parallel_blocked_array_gemm.rs +++ b/examples/kernels/safe_parallel_blocked_array_gemm.rs @@ -1,3 +1,4 @@ +use futures_util::stream::StreamExt; use lamellar::array::prelude::*; /// ----------------Lamellar Parallel Blocked Array GEMM--------------------------------------------------- /// This performs a distributed GEMM by partitioning the global matrices (stored in LamellarArrya) @@ -9,7 +10,6 @@ use lamellar::array::prelude::*; /// to the C matrix are only performed locally, requiring no additional data transfer. ///---------------------------------------------------------------------------------- use matrixmultiply::sgemm; -use futures_util::stream::StreamExt; fn main() { let args: Vec = std::env::args().collect(); @@ -50,12 +50,8 @@ fn main() { let c_init = c.dist_iter_mut().for_each(|x| *x = 0.0); let a = a.into_read_only(); let b = b.into_read_only(); - c.block_on(async move { - a_init.await; - b_init.await; - c_init.await; - }); - c.barrier(); + world.block_on_all((a, b, c)); + worldc.barrier(); let num_gops = ((2 * dim * dim * dim) - dim * dim) as f64 / 1_000_000_000.0; // accurate for square matrices let blocksize = std::cmp::min(1000000, dim / num_pes); // / 32; @@ -72,19 +68,18 @@ fn main() { // we construct a global array where each pe will contain the sequence (0..n_blks) // we can then call dist_iter() on this array to iterate over the range in parallel on each PE let nblks_array = LocalLockArray::new(&world, n_blks * num_pes, Distribution::Block); - world.block_on( - nblks_array - .dist_iter_mut() - .enumerate() - .for_each(move |(i, x)| *x = i % n_blks), - ); + + nblks_array + .dist_iter_mut() + .enumerate() + .blocking_for_each(move |(i, x)| *x = i % n_blks); + let m_blks_pe_array = LocalLockArray::new(&world, m_blks_pe * num_pes, Distribution::Block); - world.block_on( - m_blks_pe_array - .dist_iter_mut() - .enumerate() - .for_each(move |(i, x)| *x = i % m_blks_pe), - ); + + m_blks_pe_array + .dist_iter_mut() + .enumerate() + .blocking_for_each(move |(i, x)| *x = i % m_blks_pe); world.barrier(); let nblks_array = nblks_array.into_read_only(); let m_blks_pe_array = m_blks_pe_array.into_read_only(); @@ -99,7 +94,7 @@ fn main() { let b = b.clone(); let c_clone = c_clone.clone(); let m_blks_pe_array = m_blks_pe_array.clone(); - async move{ + async move { // println!("[{:?}] kblk {k_blk}", my_pe); //iterate over the submatrix cols of b, use dist_iter() so that we can launch transfers in parallel let my_p_blks = (p_blks_pe * my_pe..p_blks).chain(0..p_blks_pe * my_pe); //start with the local block then proceed in round robin fashion (should hopefully help all PEs requesting data from the same PE at the same time) @@ -116,69 +111,72 @@ fn main() { .take(blocksize) // we only need to take blocksize columns .fold(Vec::new(), |mut vec, x| { vec.extend_from_slice(unsafe { x.as_slice().unwrap() }); - async move{ - vec - } - }).await; + async move { vec } + }) + .await; //-------------- let a = a.clone(); let c_clone = c_clone.clone(); - let _inner_gemm = m_blks_pe_array.local_iter().for_each_async_with_schedule( - Schedule::Chunk(m_blks_pe_array.len()), - move |i_blk| { - // println!("\t\tiblk {i_blk}"); - // iterate of the local submatrix rows of a - - let c = c_clone.clone(); - let b_block_vec = b_block.clone(); - let a_vec: Vec = a - .local_as_slice() - .chunks(blocksize) 
//chunks rows by blocksize - .skip(i_blk * m_blks * blocksize + *k_blk) //skip previously visited submatrices - .step_by(m_blks) //grab chunk from the next row in submatrix - .take(blocksize) //we only need to take blocksize rows - .fold(Vec::new(), |mut vec, x| { - vec.extend(x); - vec - }); - - let mut c_vec = vec![0.0; blocksize * blocksize]; // MatrixMultiple lib stores result in a contiguous memory segment - unsafe { - sgemm( - blocksize, - blocksize, - blocksize, - 1.0, - a_vec.as_ptr(), - blocksize as isize, - 1, - b_block_vec.as_ptr(), - 1, - blocksize as isize, - 0.0, - c_vec.as_mut_ptr(), - blocksize as isize, - 1, - ); - } - async move { - let mut c_slice = c.write_local_data().await; //this locks the array - - for row in 0..blocksize { - let row_offset = (i_blk * blocksize + row) * n; - for col in 0..blocksize { - let col_offset = j_blk * blocksize + col; - c_slice[row_offset + col_offset] += c_vec[row * blocksize + col]; - //we know all updates to c are local so directly update the raw data - // we could use the array.add interface by calculating the global index: let g_i_blk = i_blk + my_pe *m_blks_pe; and replacing it in row_offset - // c.add(row_offset+col_offset,c_vec[row*blocksize + col]); -- but some overheads are introduce from PGAS calculations performed by the runtime, and since its all local updates we can avoid them + m_blks_pe_array + .local_iter() + .for_each_async_with_schedule( + Schedule::Chunk(m_blks_pe_array.len()), + move |i_blk| { + // println!("\t\tiblk {i_blk}"); + // iterate of the local submatrix rows of a + + let c = c_clone.clone(); + let b_block_vec = b_block.clone(); + let a_vec: Vec = a + .local_as_slice() + .chunks(blocksize) //chunks rows by blocksize + .skip(i_blk * m_blks * blocksize + *k_blk) //skip previously visited submatrices + .step_by(m_blks) //grab chunk from the next row in submatrix + .take(blocksize) //we only need to take blocksize rows + .fold(Vec::new(), |mut vec, x| { + vec.extend(x); + vec + }); + + let mut c_vec = vec![0.0; blocksize * blocksize]; // MatrixMultiple lib stores result in a contiguous memory segment + unsafe { + sgemm( + blocksize, + blocksize, + blocksize, + 1.0, + a_vec.as_ptr(), + blocksize as isize, + 1, + b_block_vec.as_ptr(), + 1, + blocksize as isize, + 0.0, + c_vec.as_mut_ptr(), + blocksize as isize, + 1, + ); + } + async move { + let mut c_slice = c.write_local_data().await; //this locks the array + + for row in 0..blocksize { + let row_offset = (i_blk * blocksize + row) * n; + for col in 0..blocksize { + let col_offset = j_blk * blocksize + col; + c_slice[row_offset + col_offset] += + c_vec[row * blocksize + col]; + //we know all updates to c are local so directly update the raw data + // we could use the array.add interface by calculating the global index: let g_i_blk = i_blk + my_pe *m_blks_pe; and replacing it in row_offset + // c.add(row_offset+col_offset,c_vec[row*blocksize + col]); -- but some overheads are introduce from PGAS calculations performed by the runtime, and since its all local updates we can avoid them + } } } - } - // println!("[{:?}] kblk {k_blk} jblk {j_blk} iblk {i_blk}", my_pe); - // }); - }, - ); + // println!("[{:?}] kblk {k_blk} jblk {j_blk} iblk {i_blk}", my_pe); + // }); + }, + ) + .await; } } // println!( diff --git a/examples/kernels/serial_array_gemm.rs b/examples/kernels/serial_array_gemm.rs index 912a0a46..b98f68da 100644 --- a/examples/kernels/serial_array_gemm.rs +++ b/examples/kernels/serial_array_gemm.rs @@ -28,23 +28,23 @@ fn main() { let c = 
AtomicArray::::new(&world, m * p, Distribution::Block); //row major //initialize matrices - let _ = a - .dist_iter_mut() + a.dist_iter_mut() .enumerate() - .for_each(|(i, x)| *x = i as f32); - let _ = b.dist_iter_mut().enumerate().for_each(move |(i, x)| { - //identity matrix - let row = i / dim; - let col = i % dim; - if row == col { - *x = 1 as f32 - } else { - *x = 0 as f32; - } - }); - let _ = c.dist_iter_mut().for_each(|x| x.store(0.0)); + .blocking_for_each(|(i, x)| *x = i as f32); + b.dist_iter_mut() + .enumerate() + .blocking_for_each(move |(i, x)| { + //identity matrix + let row = i / dim; + let col = i % dim; + if row == col { + *x = 1 as f32 + } else { + *x = 0 as f32; + } + }); + c.dist_iter_mut().blocking_for_each(|x| x.store(0.0)); - world.wait_all(); world.barrier(); let a = a.into_read_only(); From 40dcbf9cb40d950f914fc29a660087fe7cee0c97 Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Wed, 24 Jul 2024 11:27:10 -0700 Subject: [PATCH 054/116] implement block_on_all and spawn_task apis --- src/active_messaging.rs | 105 +++++++++++++++++++++++++++- src/lamellar_task_group.rs | 21 +++++- src/lamellar_team.rs | 25 ++++++- src/lamellar_world.rs | 25 ++++++- src/scheduler.rs | 77 ++++++++++++++++++++ src/scheduler/async_std_executor.rs | 14 +++- src/scheduler/tokio_executor.rs | 14 +++- src/scheduler/work_stealing.rs | 20 +++++- src/scheduler/work_stealing2.rs | 18 ++++- src/scheduler/work_stealing3.rs | 27 ++++++- 10 files changed, 337 insertions(+), 9 deletions(-) diff --git a/src/active_messaging.rs b/src/active_messaging.rs index 75f62497..033870bd 100644 --- a/src/active_messaging.rs +++ b/src/active_messaging.rs @@ -638,7 +638,7 @@ use crate::lamellar_arch::IdError; use crate::lamellar_request::{InternalResult, LamellarRequestResult}; use crate::lamellar_team::{LamellarTeam, LamellarTeamRT}; use crate::memregion::one_sided::NetMemRegionHandle; -use crate::scheduler::{Executor, LamellarExecutor, ReqId}; +use crate::scheduler::{Executor, LamellarExecutor, LamellarTask, ReqId}; use async_trait::async_trait; use futures_util::Future; @@ -1197,6 +1197,57 @@ pub trait ActiveMessaging { ///``` fn async_barrier(&self) -> impl Future + Send; + #[doc(alias("One-sided", "onesided"))] + /// Spawns a future on the worker threadpool + /// + /// This function returns a task handle that can be used to await the spawned future + /// + /// Users can spawn any future, including those returned from lamellar remote operations + /// + /// # One-sided Operation + /// this is not a distributed synchronization primitive and only blocks the calling thread until the given future has completed on the calling PE + /// + /// # Examples + ///```no_run + /// # use lamellar::active_messaging::prelude::*; + /// use async_std::fs::File; + /// use async_std::prelude::*; + /// # #[lamellar::AmData(Debug,Clone)] + /// # struct Am{ + /// # // can contain anything that impls Sync, Send + /// # val: usize, + /// # } + /// # + /// # #[lamellar::am] + /// # impl LamellarAM for Am{ + /// # async fn exec(self) -> usize { //can return nothing or any type that impls Serialize, Deserialize, Sync, Send + /// # //do some remote computation + /// # println!("hello from PE{}",self.val); + /// # lamellar::current_pe //return the executing pe + /// # } + /// # } + /// # + /// # let world = lamellar::LamellarWorldBuilder::new().build(); + /// # let num_pes = world.num_pes(); + /// let request = world.exec_am_all(Am{val: world.my_pe()}); //launch am locally + /// let result = world.block_on(request); //block until am has 
executed + /// // you can also directly pass an async block + /// let world_clone = world.clone(); + /// world.block_on(async move { + /// let mut file = async_std::fs::File::open("a.txt").await.unwrap(); + /// let mut buf = vec![0u8;1000]; + /// for pe in 0..num_pes{ + /// let data = file.read(&mut buf).await.unwrap(); + /// world_clone.exec_am_pe(pe,Am{val: data}).await; + /// } + /// world_clone.exec_am_all(Am{val: buf[0] as usize}).await; + /// }); + ///``` + fn spawn(&self, f: F) -> LamellarTask + where + F: Future + Send + 'static, + F::Output: Send; + #[doc(alias("One-sided", "onesided"))] /// Run a future to completion on the current thread /// @@ -1244,6 +1295,58 @@ pub trait ActiveMessaging { /// }); ///``` fn block_on(&self, f: F) -> F::Output; + + #[doc(alias("One-sided", "onesided"))] + /// Run a collection of futures to completion + /// + /// This function will block the caller until the given future has completed, the future is executed within the Lamellar threadpool + /// + /// Users can await any future, including those returned from lamellar remote operations + /// + /// # One-sided Operation + /// this is not a distributed synchronization primitive and only blocks the calling thread until the given future has completed on the calling PE + /// + /// # Examples + ///```no_run + /// # use lamellar::active_messaging::prelude::*; + /// use async_std::fs::File; + /// use async_std::prelude::*; + /// # #[lamellar::AmData(Debug,Clone)] + /// # struct Am{ + /// # // can contain anything that impls Sync, Send + /// # val: usize, + /// # } + /// # + /// # #[lamellar::am] + /// # impl LamellarAM for Am{ + /// # async fn exec(self) -> usize { //can return nothing or any type that impls Serialize, Deserialize, Sync, Send + /// # //do some remote computation + /// # println!("hello from PE{}",self.val); + /// # lamellar::current_pe //return the executing pe + /// # } + /// # } + /// # + /// # let world = lamellar::LamellarWorldBuilder::new().build(); + /// # let num_pes = world.num_pes(); + /// let request = world.exec_am_all(Am{val: world.my_pe()}); //launch am locally + /// let result = world.block_on(request); //block until am has executed + /// // you can also directly pass an async block + /// let world_clone = world.clone(); + /// world.block_on(async move { + /// let mut file = async_std::fs::File::open("a.txt").await.unwrap(); + /// let mut buf = vec![0u8;1000]; + /// for pe in 0..num_pes{ + /// let data = file.read(&mut buf).await.unwrap(); + /// world_clone.exec_am_pe(pe,Am{val: data}).await; + /// } + /// world_clone.exec_am_all(Am{val: buf[0] as usize}).await; + /// }); + ///``` + fn block_on_all(&self, iter: I) -> Vec<<::Item as Future>::Output> + where + I: IntoIterator, + ::Item: Future + Send + 'static, + <::Item as Future>::Output: Send; } #[async_trait] diff --git a/src/lamellar_task_group.rs b/src/lamellar_task_group.rs index 498a2671..abe0e0ef 100644 --- a/src/lamellar_task_group.rs +++ b/src/lamellar_task_group.rs @@ -7,12 +7,13 @@ use crate::lamellar_request::LamellarRequest; use crate::lamellar_request::*; use crate::lamellar_team::{IntoLamellarTeam, LamellarTeam, LamellarTeamRT}; use crate::memregion::one_sided::MemRegionHandleInner; -use crate::scheduler::{ReqId, Scheduler}; +use crate::scheduler::{LamellarTask, ReqId, Scheduler}; use crate::Darc; // use crossbeam::utils::CachePadded; // use futures_util::StreamExt; +use futures_util::future::join_all; use futures_util::{Future, StreamExt}; use parking_lot::Mutex; use pin_project::{pin_project, pinned_drop}; @@ 
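// Editorial sketch (not part of the patch; `Am` is the example active message
// from the doc comments above, so treat this as an assumed usage pattern, not
// the authoritative API): the new `spawn` and `block_on_all` methods used
// together -- each remote AM is spawned onto the worker threadpool and
// `block_on_all` gathers every result.
use lamellar::active_messaging::prelude::*;

fn run_on_every_pe(world: &lamellar::LamellarWorld) -> Vec<usize> {
    let handles: Vec<_> = (0..world.num_pes())
        .map(|pe| world.spawn(world.exec_am_pe(pe, Am { val: pe })))
        .collect();
    // blocks the calling thread until every spawned task has completed
    world.block_on_all(handles)
}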
-578,6 +579,13 @@ impl ActiveMessaging for LamellarTaskGroup { self.exec_am_local_inner(am) } + fn spawn(&self, task: F) -> LamellarTask + where + F: Future + Send + 'static, + F::Output: Send, + { + self.team.scheduler.spawn_task(task) + } fn block_on(&self, f: F) -> F::Output where F: Future, @@ -586,6 +594,17 @@ impl ActiveMessaging for LamellarTaskGroup { self.team.scheduler.block_on(f) // ) } + fn block_on_all(&self, iter: I) -> Vec<<::Item as Future>::Output> + where + I: IntoIterator, + ::Item: Future + Send + 'static, + <::Item as Future>::Output: Send, + { + self.team.scheduler.block_on(join_all( + iter.into_iter() + .map(|task| self.team.scheduler.spawn_task(task)), + )) + } } impl LamellarTaskGroup { diff --git a/src/lamellar_team.rs b/src/lamellar_team.rs index 2a315ea8..6983750a 100644 --- a/src/lamellar_team.rs +++ b/src/lamellar_team.rs @@ -11,7 +11,7 @@ use crate::memregion::{ one_sided::OneSidedMemoryRegion, shared::SharedMemoryRegion, Dist, LamellarMemoryRegion, MemoryRegion, RemoteMemoryRegion, }; -use crate::scheduler::{ReqId, Scheduler}; +use crate::scheduler::{LamellarTask, ReqId, Scheduler}; #[cfg(feature = "nightly")] use crate::utils::ser_closure; @@ -20,6 +20,7 @@ use std::collections::hash_map::DefaultHasher; use std::hash::{Hash, Hasher}; // use std::any; use core::pin::Pin; +use futures_util::future::join_all; use futures_util::Future; use parking_lot::{Mutex, RwLock}; use std::collections::HashMap; @@ -523,6 +524,15 @@ impl ActiveMessaging for Arc { self.team.async_barrier() } + fn spawn(&self, task: F) -> LamellarTask + where + F: Future + Send + 'static, + F::Output: Send, + { + assert!(self.panic.load(Ordering::SeqCst) == 0); + self.team.scheduler.spawn_task(task) + } + fn block_on(&self, f: F) -> F::Output { assert!(self.panic.load(Ordering::SeqCst) == 0); @@ -530,6 +540,19 @@ impl ActiveMessaging for Arc { self.team.scheduler.block_on(f) // ) } + + fn block_on_all(&self, iter: I) -> Vec<<::Item as Future>::Output> + where + I: IntoIterator, + ::Item: Future + Send + 'static, + <::Item as Future>::Output: Send, + { + assert!(self.panic.load(Ordering::SeqCst) == 0); + self.team.scheduler.block_on(join_all( + iter.into_iter() + .map(|task| self.team.scheduler.spawn_task(task)), + )) + } } impl RemoteMemoryRegion for Arc { diff --git a/src/lamellar_world.rs b/src/lamellar_world.rs index 30df829f..6d91eaf1 100644 --- a/src/lamellar_world.rs +++ b/src/lamellar_world.rs @@ -5,12 +5,13 @@ use crate::lamellar_team::{LamellarTeam, LamellarTeamRT}; use crate::memregion::{ one_sided::OneSidedMemoryRegion, shared::SharedMemoryRegion, Dist, RemoteMemoryRegion, }; -use crate::scheduler::{create_scheduler, ExecutorType}; +use crate::scheduler::{create_scheduler, ExecutorType, LamellarTask}; use crate::{active_messaging::*, config}; // use log::trace; //use tracing::*; +use futures_util::future::join_all; use futures_util::Future; use parking_lot::RwLock; use pin_weak::sync::PinWeak; @@ -84,6 +85,14 @@ impl ActiveMessaging for LamellarWorld { self.team.async_barrier() } + fn spawn(&self, f: F) -> LamellarTask + where + F: Future + Send + 'static, + F::Output: Send, + { + self.team_rt.scheduler.spawn_task(f) + } + fn block_on(&self, f: F) -> F::Output where F: Future, @@ -92,6 +101,20 @@ impl ActiveMessaging for LamellarWorld { self.team_rt.scheduler.block_on(f) // ) } + + fn block_on_all(&self, iter: I) -> Vec<<::Item as Future>::Output> + where + I: IntoIterator, + ::Item: Future + Send + 'static, + <::Item as Future>::Output: Send, + { + // 
trace_span!("block_on_all").in_scope(|| + self.team_rt.scheduler.block_on(join_all( + iter.into_iter() + .map(|task| self.team_rt.scheduler.spawn_task(task)), + )) + // ) + } } impl RemoteMemoryRegion for LamellarWorld { diff --git a/src/scheduler.rs b/src/scheduler.rs index e90f9990..d6cec689 100755 --- a/src/scheduler.rs +++ b/src/scheduler.rs @@ -8,8 +8,11 @@ use crate::lamellae::{Des, Lamellae, SerializedData}; use enum_dispatch::enum_dispatch; use futures_util::Future; +use pin_project::{pin_project, pinned_drop}; +use std::pin::{pin, Pin}; use std::sync::atomic::{AtomicU8, AtomicUsize, Ordering}; use std::sync::Arc; +use std::task::{Context, Poll}; pub(crate) mod work_stealing; use work_stealing::WorkStealing; @@ -90,8 +93,74 @@ pub enum ExecutorType { // Dyn(impl LamellarExecutor), } +#[derive(Debug)] +#[pin_project] +pub struct LamellarTask { + #[pin] + task: LamellarTaskInner, +} + +impl Future for LamellarTask { + type Output = T; + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + self.project().task.poll(cx) + } +} + +#[derive(Debug)] +pub(crate) enum LamellarTaskInner { + LamellarTask(Option>), + AsyncStdTask(async_std::task::JoinHandle), + #[cfg(feature = "tokio-executor")] + TokioTask(tokio::task::JoinHandle), + Dropped, +} + +impl Drop for LamellarTaskInner { + fn drop(self: &mut Self) { + // let mut dropped = LamellarTaskInner::Dropped; + + // std::mem::swap(&mut dropped, self); + match self { + LamellarTaskInner::LamellarTask(task) => { + task.take().expect("task already taken").detach(); + } + LamellarTaskInner::AsyncStdTask(_task) => {} + #[cfg(feature = "tokio-executor")] + LamellarTaskInner::TokioTask(task) => {} + LamellarTaskInner::Dropped => {} + } + } +} + +impl Future for LamellarTaskInner { + type Output = T; + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + unsafe { + match self.get_unchecked_mut() { + LamellarTaskInner::LamellarTask(task) => { + if let Some(task) = task { + Pin::new_unchecked(task).poll(cx) + } else { + unreachable!() + } + } + LamellarTaskInner::AsyncStdTask(task) => Pin::new_unchecked(task).poll(cx), + #[cfg(feature = "tokio-executor")] + LamellarTaskInner::TokioTask(task) => Pin::new_unchecked(task).poll(cx), + LamellarTaskInner::Dropped => unreachable!(), + } + } + } +} + #[enum_dispatch] pub(crate) trait LamellarExecutor { + fn spawn_task(&self, future: F) -> LamellarTask + where + F: Future + Send + 'static, + F::Output: Send; + fn submit_task(&self, future: F) where F: Future + Send + 'static, @@ -298,6 +367,14 @@ impl Scheduler { self.executor.submit_task(am_future); } + pub(crate) fn spawn_task(&self, task: F) -> LamellarTask + where + F: Future + Send + 'static, + F::Output: Send, + { + self.executor.spawn_task(task) + } + pub(crate) fn submit_task(&self, task: F) where F: Future + Send + 'static, diff --git a/src/scheduler/async_std_executor.rs b/src/scheduler/async_std_executor.rs index 89a35909..3067cc78 100644 --- a/src/scheduler/async_std_executor.rs +++ b/src/scheduler/async_std_executor.rs @@ -1,4 +1,4 @@ -use crate::scheduler::LamellarExecutor; +use crate::scheduler::{LamellarExecutor, LamellarTask, LamellarTaskInner}; use async_std::task; @@ -10,6 +10,18 @@ pub(crate) struct AsyncStdRt { } impl LamellarExecutor for AsyncStdRt { + fn spawn_task(&self, task: F) -> LamellarTask + where + F: Future + Send + 'static, + F::Output: Send, + { + // trace_span!("spawn_task").in_scope(|| { + let task = task::spawn(task); + LamellarTask { + task: LamellarTaskInner::AsyncStdTask(task), + } + // }) + } fn 
submit_task(&self, task: F) where F: Future + Send + 'static, diff --git a/src/scheduler/tokio_executor.rs b/src/scheduler/tokio_executor.rs index eafd942b..117b36de 100644 --- a/src/scheduler/tokio_executor.rs +++ b/src/scheduler/tokio_executor.rs @@ -1,4 +1,4 @@ -use crate::scheduler::LamellarExecutor; +use crate::scheduler::{LamellarExecutor, LamellarTask, LamellarTaskInner}; use tokio::runtime::Runtime; @@ -11,6 +11,18 @@ pub(crate) struct TokioRt { } impl LamellarExecutor for TokioRt { + fn spawn_task(&self, task: F) -> LamellarTask + where + F: Future + Send + 'static, + F::Output: Send, + { + // trace_span!("spawn_task").in_scope(|| { + let task = self.rt.spawn(task); + LamellarTask { + task: LamellarTaskInner::TokioTask(task), + } + // }) + } fn submit_task(&self, task: F) where F: Future + Send + 'static, diff --git a/src/scheduler/work_stealing.rs b/src/scheduler/work_stealing.rs index f20a8b56..4b0ca01b 100644 --- a/src/scheduler/work_stealing.rs +++ b/src/scheduler/work_stealing.rs @@ -1,5 +1,5 @@ use crate::env_var::config; -use crate::scheduler::{LamellarExecutor, SchedulerStatus}; +use crate::scheduler::{LamellarExecutor, LamellarTask, LamellarTaskInner, SchedulerStatus}; //use tracing::*; @@ -130,6 +130,24 @@ pub(crate) struct WorkStealing { } impl LamellarExecutor for WorkStealing { + fn spawn_task(&self, task: F) -> LamellarTask + where + F: Future + Send + 'static, + F::Output: Send, + { + // trace_span!("submit_task").in_scope(|| { + let work_inj = self.work_inj.clone(); + let schedule = move |runnable| work_inj.push(runnable); + let (runnable, task) = Builder::new() + .metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)) + .spawn(move |_task_id| async move { task.await }, schedule); + + runnable.schedule(); + LamellarTask { + task: LamellarTaskInner::LamellarTask(Some(task)), + } + // }); + } fn submit_task(&self, task: F) where F: Future + Send + 'static, diff --git a/src/scheduler/work_stealing2.rs b/src/scheduler/work_stealing2.rs index 22b1b4ee..965ca933 100644 --- a/src/scheduler/work_stealing2.rs +++ b/src/scheduler/work_stealing2.rs @@ -1,5 +1,5 @@ use crate::env_var::config; -use crate::scheduler::{LamellarExecutor, SchedulerStatus}; +use crate::scheduler::{LamellarExecutor, LamellarTask, LamellarTaskInner, SchedulerStatus}; use crate::MAIN_THREAD; //use tracing::*; @@ -222,6 +222,22 @@ pub(crate) struct WorkStealing2 { } impl LamellarExecutor for WorkStealing2 { + fn spawn_task(&self, task: F) -> LamellarTask + where + F: Future + Send + 'static, + F::Output: Send, + { + let work_inj = self.get_injector(); + let schedule = move |runnable| work_inj.push(runnable); + let (runnable, task) = Builder::new() + .metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)) + .spawn(move |_task_id| async move { task.await }, schedule); + + runnable.schedule(); + LamellarTask { + task: LamellarTaskInner::LamellarTask(Some(task)), + } + } fn submit_task(&self, task: F) where F: Future + Send + 'static, diff --git a/src/scheduler/work_stealing3.rs b/src/scheduler/work_stealing3.rs index 405de9ff..5264c710 100644 --- a/src/scheduler/work_stealing3.rs +++ b/src/scheduler/work_stealing3.rs @@ -1,5 +1,5 @@ use crate::env_var::config; -use crate::scheduler::{LamellarExecutor, SchedulerStatus}; +use crate::scheduler::{LamellarExecutor, LamellarTask, LamellarTaskInner, SchedulerStatus}; use crate::MAIN_THREAD; //use tracing::*; @@ -142,6 +142,31 @@ pub(crate) struct WorkStealing3 { } impl LamellarExecutor for WorkStealing3 { + fn spawn_task(&self, task: F) -> LamellarTask + where + F: Future 
+ Send + 'static, + F::Output: Send, + { + // trace_span!("submit_task").in_scope(|| { + let work_inj = self.work_inj.clone(); + let schedule = move |runnable| { + // if thread::current().id() == *MAIN_THREAD { + work_inj.push(runnable); + // } else { + // WORK_Q.get().unwrap().push(runnable); + // } + }; + let (runnable, task) = Builder::new() + .metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)) + .spawn(move |_task_id| async move { task.await }, schedule); + + runnable.schedule(); + LamellarTask { + task: LamellarTaskInner::LamellarTask(Some(task)), + } + // }); + } + fn submit_task(&self, task: F) where F: Future + Send + 'static, From 0bf9928f3174089dd28d66e73f016fa2c6f02b8f Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Wed, 24 Jul 2024 17:46:56 -0700 Subject: [PATCH 055/116] switch to .block() and .spawn() api instead of blocking_ and spawn_ iterators --- .../array_consumer_schedules.rs | 10 +- examples/array_examples/array_ops.rs | 67 +- examples/array_examples/array_put_get.rs | 2 +- .../array_examples/atomic_compare_exchange.rs | 9 +- examples/array_examples/dist_array_reduce.rs | 8 +- .../array_examples/distributed_iteration.rs | 52 +- examples/array_examples/global_lock_array.rs | 5 +- examples/array_examples/local_iteration.rs | 52 +- examples/bandwidths/atomic_array_get_bw.rs | 3 +- examples/bandwidths/atomic_array_put_bw.rs | 6 +- .../global_lock_atomic_array_get_bw.rs | 4 +- .../global_lock_atomic_array_put_bw.rs | 8 +- .../local_lock_atomic_array_get_bw.rs | 4 +- .../local_lock_atomic_array_put_bw.rs | 8 +- examples/bandwidths/readonly_array_get_bw.rs | 4 +- .../readonly_array_get_unchecked_bw.rs | 4 +- examples/bandwidths/unsafe_array_get_bw.rs | 4 +- .../unsafe_array_get_unchecked_bw.rs | 5 +- examples/bandwidths/unsafe_array_store_bw.rs | 3 +- .../hello_world_array_iteration.rs | 5 +- examples/kernels/dft_proxy.rs | 201 +-- examples/kernels/parallel_array_gemm.rs | 15 +- .../kernels/parallel_blocked_array_gemm.rs | 20 +- .../safe_parallel_blocked_array_gemm.rs | 11 +- examples/kernels/serial_array_gemm.rs | 10 +- src/array.rs | 32 +- src/array/iterator/distributed_iterator.rs | 1131 ++++++++++------- .../distributed_iterator/consumer/collect.rs | 7 + .../distributed_iterator/consumer/count.rs | 7 + .../distributed_iterator/consumer/for_each.rs | 8 + .../distributed_iterator/consumer/reduce.rs | 15 +- .../distributed_iterator/consumer/sum.rs | 8 + src/array/iterator/local_iterator.rs | 862 +++++++------ .../local_iterator/consumer/collect.rs | 8 + .../iterator/local_iterator/consumer/count.rs | 8 + .../local_iterator/consumer/for_each.rs | 8 + .../local_iterator/consumer/reduce.rs | 14 +- .../iterator/local_iterator/consumer/sum.rs | 13 +- src/array/unsafe/iteration/distributed.rs | 127 +- src/array/unsafe/iteration/local.rs | 86 +- src/scheduler.rs | 5 +- tests/array/arithmetic_ops/add_test.rs | 17 +- tests/array/arithmetic_ops/div_test.rs | 26 +- tests/array/arithmetic_ops/fetch_add_test.rs | 61 +- tests/array/arithmetic_ops/fetch_div_test.rs | 36 +- tests/array/arithmetic_ops/fetch_mul_test.rs | 30 +- tests/array/arithmetic_ops/fetch_rem_test.rs | 36 +- tests/array/arithmetic_ops/fetch_sub_test.rs | 28 +- tests/array/arithmetic_ops/mul_test.rs | 28 +- tests/array/arithmetic_ops/rem_test.rs | 28 +- tests/array/arithmetic_ops/sub_test.rs | 30 +- .../array/atomic_ops/compare_exchange_test.rs | 26 +- tests/array/atomic_ops/load_store_test.rs | 24 +- tests/array/atomic_ops/swap_test.rs | 26 +- tests/array/bitwise_ops/and_test.rs | 28 +- 
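// Editorial sketch of the pattern this patch ("switch to .block() and .spawn()
// api instead of blocking_ and spawn_ iterators") applies throughout the
// examples and tests below; method names are taken from the diffs themselves
// and should be read as an assumed, not authoritative, API.
use lamellar::array::prelude::*;

fn init(array: &AtomicArray<usize>) {
    // drive the iterator consumer to completion on the calling thread,
    // replacing the old `blocking_for_each` form
    array
        .dist_iter_mut()
        .enumerate()
        .for_each(|(i, e)| e.store(i))
        .block();

    // or hand the consumer to the runtime and keep a task handle for later
    let _task = array
        .local_iter()
        .for_each(|e| println!("{}", e.load()))
        .spawn();
}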
tests/array/bitwise_ops/fetch_and_test.rs | 28 +- tests/array/bitwise_ops/fetch_or_test.rs | 28 +- tests/array/bitwise_ops/fetch_xor_test.rs | 28 +- tests/array/bitwise_ops/or_test.rs | 28 +- tests/array/bitwise_ops/xor_test.rs | 25 +- tests/array/rdma/blocking_get_test.rs | 48 +- tests/array/rdma/get_test.rs | 62 +- tests/array/rdma/put_test.rs | 20 +- 63 files changed, 2055 insertions(+), 1495 deletions(-) diff --git a/examples/array_examples/array_consumer_schedules.rs b/examples/array_examples/array_consumer_schedules.rs index 6f8ff658..23040adb 100644 --- a/examples/array_examples/array_consumer_schedules.rs +++ b/examples/array_examples/array_consumer_schedules.rs @@ -17,10 +17,11 @@ fn for_each_with_schedule( array .local_iter() .filter(|e| e.load() % 2 == 0) - .blocking_for_each_with_schedule(schedule, move |e| { + .for_each_with_schedule(schedule, move |e| { std::thread::sleep(Duration::from_millis((e.load() * 1) as u64)); *tc.lock().entry(std::thread::current().id()).or_insert(0) += 1; - }); + }) + .block(); array.barrier(); println!("elapsed time {:?}", timer.elapsed().as_secs_f64()); println!("counts {:?}", thread_cnts.lock()); @@ -109,10 +110,11 @@ fn main() { let _my_pe = world.my_pe(); let _num_pes = world.num_pes(); let block_array = AtomicArray::::new(world.team(), ARRAY_LEN, Distribution::Block); - block_array + let _ = block_array .dist_iter_mut() .enumerate() - .blocking_for_each(move |(i, e)| e.store(i)); + .for_each(move |(i, e)| e.store(i)) + .spawn(); world.wait_all(); block_array.print(); diff --git a/examples/array_examples/array_ops.rs b/examples/array_examples/array_ops.rs index 98a8f760..f0e952f7 100644 --- a/examples/array_examples/array_ops.rs +++ b/examples/array_examples/array_ops.rs @@ -86,8 +86,8 @@ fn test_add( ) { array .dist_iter_mut() - .blocking_for_each(move |elem| elem.store(init_val)); - array.wait_all(); + .for_each(move |elem| elem.store(init_val)) + .block(); array.barrier(); array.print(); array.barrier(); @@ -123,8 +123,8 @@ fn test_sub( ) { array .dist_iter_mut() - .blocking_for_each(move |elem| elem.store(init_val)); - array.wait_all(); + .for_each(move |elem| elem.store(init_val)) + .block(); array.barrier(); array.print(); array.barrier(); @@ -154,8 +154,8 @@ fn test_mul( ) { array .dist_iter_mut() - .blocking_for_each(move |elem| elem.store(init_val)); - array.wait_all(); + .for_each(move |elem| elem.store(init_val)) + .block(); array.barrier(); array.print(); array.barrier(); @@ -185,8 +185,9 @@ fn test_div( ) { array .dist_iter_mut() - .blocking_for_each(move |elem| elem.store(init_val)); - array.wait_all(); + .for_each(move |elem| elem.store(init_val)) + .block(); + array.barrier(); array.print(); array.barrier(); @@ -216,8 +217,9 @@ fn test_rem( ) { array .dist_iter_mut() - .blocking_for_each(move |elem| elem.store(init_val)); - array.wait_all(); + .for_each(move |elem| elem.store(init_val)) + .block(); + array.barrier(); array.print(); array.barrier(); @@ -247,8 +249,9 @@ fn test_and( ) { array .dist_iter_mut() - .blocking_for_each(move |elem| elem.store(init_val)); - array.wait_all(); + .for_each(move |elem| elem.store(init_val)) + .block(); + array.barrier(); array.print(); array.barrier(); @@ -297,8 +302,9 @@ fn test_or( array.barrier(); array .dist_iter_mut() - .blocking_for_each(move |elem| elem.store(init_val)); - array.wait_all(); + .for_each(move |elem| elem.store(init_val)) + .block(); + array.barrier(); let mut reqs = vec![]; for i in 0..array.len() { @@ -319,8 +325,9 @@ fn test_xor( ) { array .dist_iter_mut() - 
.blocking_for_each(move |elem| elem.store(init_val)); - array.wait_all(); + .for_each(move |elem| elem.store(init_val)) + .block(); + array.barrier(); array.print(); array.barrier(); @@ -333,8 +340,9 @@ fn test_xor( array.barrier(); array .dist_iter_mut() - .blocking_for_each(move |elem| elem.store(init_val)); - array.wait_all(); + .for_each(move |elem| elem.store(init_val)) + .block(); + array.barrier(); let mut reqs = vec![]; for i in 0..array.len() { @@ -357,8 +365,9 @@ fn test_store_load( ) { array .dist_iter_mut() - .blocking_for_each(move |elem| elem.store(init_val)); - array.wait_all(); + .for_each(move |elem| elem.store(init_val)) + .block(); + array.barrier(); array.print(); array.barrier(); @@ -389,8 +398,9 @@ fn test_shl( ) { array .dist_iter_mut() - .blocking_for_each(move |elem| elem.store(init_val)); - array.wait_all(); + .for_each(move |elem| elem.store(init_val)) + .block(); + array.barrier(); array.print(); array.barrier(); @@ -420,8 +430,9 @@ fn test_shr( ) { array .dist_iter_mut() - .blocking_for_each(move |elem| elem.store(init_val)); - array.wait_all(); + .for_each(move |elem| elem.store(init_val)) + .block(); + array.barrier(); array.print(); array.barrier(); diff --git a/examples/array_examples/array_put_get.rs b/examples/array_examples/array_put_get.rs index 11fa7694..2a06bcce 100644 --- a/examples/array_examples/array_put_get.rs +++ b/examples/array_examples/array_put_get.rs @@ -2,7 +2,7 @@ use lamellar::array::prelude::*; use lamellar::memregion::prelude::*; fn initialize_array(array: &UnsafeArray) { - unsafe { array.dist_iter_mut().blocking_for_each(|x| *x = 0) }; + unsafe { array.dist_iter_mut().for_each(|x| *x = 0).block() }; array.wait_all(); array.barrier(); } diff --git a/examples/array_examples/atomic_compare_exchange.rs b/examples/array_examples/atomic_compare_exchange.rs index c8804116..2328347e 100644 --- a/examples/array_examples/atomic_compare_exchange.rs +++ b/examples/array_examples/atomic_compare_exchange.rs @@ -24,7 +24,7 @@ fn main() { let my_pe = world.my_pe(); let array = AtomicArray::::new(world.team(), num_pes * 2, Distribution::Block); - array.dist_iter_mut().blocking_for_each(|x| x.store(0)); //initialize array -- use atomic store + array.dist_iter_mut().for_each(|x| x.store(0)).block(); //initialize array -- use atomic store array.wait_all(); array.barrier(); @@ -46,7 +46,7 @@ fn main() { array.print(); let array_2 = AtomicArray::::new(world.team(), num_pes * 100000, Distribution::Cyclic); - array_2.dist_iter_mut().blocking_for_each(|x| x.store(0.0)); + array_2.dist_iter_mut().for_each(|x| x.store(0.0)).spawn(); array_2.wait_all(); array_2.barrier(); @@ -85,7 +85,7 @@ fn main() { array .dist_iter() .enumerate() - .blocking_for_each_async(move |(i, e)| { + .for_each_async(move |(i, e)| { let a2c = array_2.clone(); async move { let res = a2c @@ -100,7 +100,8 @@ fn main() { } } } - }); + }) + .block(); println!("num_failed {num_failed} num_ok {num_ok}"); // array2.print(); } diff --git a/examples/array_examples/dist_array_reduce.rs b/examples/array_examples/dist_array_reduce.rs index d2fb8e6a..18d900fe 100644 --- a/examples/array_examples/dist_array_reduce.rs +++ b/examples/array_examples/dist_array_reduce.rs @@ -133,19 +133,21 @@ fn main() { cyclic_sum, cyclic_dist_time, block_sum, block_dist_time ); - unsafe { cyclic_array.dist_iter_mut().blocking_for_each(|x| *x += *x) }; + unsafe { cyclic_array.dist_iter_mut().for_each(|x| *x += *x).block() }; unsafe { cyclic_array .dist_iter() .enumerate() - .blocking_for_each(|x| println!("x: {:?}", x)); 
+ .for_each(|x| println!("x: {:?}", x)) + .block(); } unsafe { block_array .dist_iter() .enumerate() - .blocking_for_each(|x| println!("x: {:?}", x)) + .for_each(|x| println!("x: {:?}", x)) + .block() }; let block_array = block_array.into_read_only(); let _ = block_array.blocking_sum(); diff --git a/examples/array_examples/distributed_iteration.rs b/examples/array_examples/distributed_iteration.rs index 86fea09c..c14691e3 100644 --- a/examples/array_examples/distributed_iteration.rs +++ b/examples/array_examples/distributed_iteration.rs @@ -23,8 +23,11 @@ fn main() { //for example lets initialize our arrays, where we store the value of my_pe to each local element a pe owns block_dist_iter .enumerate() - .blocking_for_each(move |(i, elem)| elem.store(i)); - cyclic_dist_iter.blocking_for_each(move |elem| elem.store(my_pe)); + .for_each(move |(i, elem)| elem.store(i)) + .block(); + cyclic_dist_iter + .for_each(move |elem| elem.store(my_pe)) + .block(); // let block_array = block_array.into_read_only(); block_array.print(); @@ -53,7 +56,7 @@ fn main() { .skip(2) .enumerate() .step_by(3) - .blocking_for_each(move |(i, elem)| { + .for_each(move |(i, elem)| { println!( "[pe({:?})-{:?}] i: {:?} {:?}", my_pe, @@ -61,7 +64,8 @@ fn main() { i, elem ) - }); + }) + .block(); println!("--------------------------------------------------------"); println!("cyclic skip enumerate"); @@ -70,7 +74,7 @@ fn main() { .dist_iter() .enumerate() .skip(2) - .blocking_for_each(move |(i, elem)| { + .for_each(move |(i, elem)| { println!( "[pe({:?})-{:?}] i: {:?} {:?}", my_pe, @@ -78,7 +82,8 @@ fn main() { i, elem ) - }); + }) + .block(); println!("--------------------------------------------------------"); @@ -100,14 +105,15 @@ fn main() { ); async move { (i, elem, barray.load(i).await) } }) - .blocking_for_each_async(move |i| async move { + .for_each_async(move |i| async move { println!( "[pe({:?})-{:?}] for each {:?}", my_pe, std::thread::current().id(), i.await ); - }); + }) + .block(); block_array.print(); println!("--------------------------------------------------------"); @@ -135,7 +141,7 @@ fn main() { .dist_iter() .enumerate() .filter(|(_, elem)| elem.load() % 4 == 0) - .blocking_for_each(move |(i, elem)| { + .for_each(move |(i, elem)| { println!( "[pe({:?})-{:?}] i: {:?} {:?}", my_pe, @@ -143,7 +149,8 @@ fn main() { i, elem ) - }); + }) + .block(); println!("--------------------------------------------------------"); println!("block enumerate filter_map"); @@ -157,7 +164,7 @@ fn main() { None } }) - .blocking_for_each(move |(i, elem)| { + .for_each(move |(i, elem)| { println!( "[pe({:?})-{:?}] i: {:?} {:?}", my_pe, @@ -165,7 +172,8 @@ fn main() { i, elem ) - }); + }) + .block(); println!("--------------------------------------------------------"); println!("filter_map collect"); let new_block_array = block_array.block_on( @@ -191,7 +199,7 @@ fn main() { .dist_iter() .skip(10) .enumerate() - .blocking_for_each(move |(i, elem)| { + .for_each(move |(i, elem)| { println!( "[pe({:?})-{:?}] i: {:?} {:?}", my_pe, @@ -199,7 +207,8 @@ fn main() { i, elem ) - }); + }) + .block(); println!("--------------------------------------------------------"); println!("block skip step_by enumerate"); @@ -208,7 +217,7 @@ fn main() { .skip(10) .step_by(3) .enumerate() - .blocking_for_each(move |(i, elem)| { + .for_each(move |(i, elem)| { println!( "[pe({:?})-{:?}] i: {:?} {:?}", my_pe, @@ -216,7 +225,8 @@ fn main() { i, elem ) - }); + }) + .block(); println!("--------------------------------------------------------"); 
println!("block take skip enumerate"); @@ -225,7 +235,7 @@ fn main() { .take(60) .skip(10) .enumerate() - .blocking_for_each(move |(i, elem)| { + .for_each(move |(i, elem)| { println!( "[pe({:?})-{:?}] i: {:?} {:?}", my_pe, @@ -233,7 +243,8 @@ fn main() { i, elem ) - }); + }) + .block(); println!("--------------------------------------------------------"); println!("block take skip take enumerate"); @@ -243,7 +254,7 @@ fn main() { .skip(10) .take(30) .enumerate() - .blocking_for_each(move |(i, elem)| { + .for_each(move |(i, elem)| { println!( "[pe({:?})-{:?}] i: {:?} {:?}", my_pe, @@ -251,7 +262,8 @@ fn main() { i, elem ) - }); + }) + .block(); println!("--------------------------------------------------------"); println!("block filter count"); diff --git a/examples/array_examples/global_lock_array.rs b/examples/array_examples/global_lock_array.rs index 9d0fef63..58483778 100644 --- a/examples/array_examples/global_lock_array.rs +++ b/examples/array_examples/global_lock_array.rs @@ -51,13 +51,14 @@ fn main() { .blocking_read_lock() .dist_iter() .enumerate() - .blocking_for_each(move |(i, elem)| { + .for_each(move |(i, elem)| { println!( "{my_pe}, {:?}: {i} {:?}", std::thread::current().id(), *elem ) - }); + }) + .block(); world.barrier(); let task = array diff --git a/examples/array_examples/local_iteration.rs b/examples/array_examples/local_iteration.rs index 5be210ad..c6b8edaf 100644 --- a/examples/array_examples/local_iteration.rs +++ b/examples/array_examples/local_iteration.rs @@ -23,8 +23,11 @@ fn main() { //for example lets initialize our arrays, where we store the value of my_pe to each local element a pe owns block_local_iter .enumerate() - .blocking_for_each(move |(i, elem)| elem.store(i)); - cyclic_local_iter.blocking_for_each(move |elem| elem.store(my_pe)); + .for_each(move |(i, elem)| elem.store(i)) + .block(); + cyclic_local_iter + .for_each(move |elem| elem.store(my_pe)) + .block(); // let block_array = block_array.into_read_only(); block_array.print(); @@ -40,7 +43,7 @@ fn main() { .skip(2) .enumerate() .step_by(3) - .blocking_for_each(move |(i, elem)| { + .for_each(move |(i, elem)| { println!( "[pe({:?})-{:?}] i: {:?} {:?}", my_pe, @@ -48,7 +51,8 @@ fn main() { i, elem ) - }); + }) + .block(); block_array.barrier(); println!("--------------------------------------------------------"); @@ -58,7 +62,7 @@ fn main() { .local_iter() .enumerate() .skip(2) - .blocking_for_each(move |(i, elem)| { + .for_each(move |(i, elem)| { println!( "[pe({:?})-{:?}] i: {:?} {:?}", my_pe, @@ -66,7 +70,8 @@ fn main() { i, elem ) - }); + }) + .block(); cyclic_array.barrier(); println!("--------------------------------------------------------"); @@ -89,14 +94,15 @@ fn main() { ); async move { (i, elem.load(), barray.load(i).await + elem.load()) } }) - .blocking_for_each_async(move |i| async move { + .for_each_async(move |i| async move { println!( "[pe({:?})-{:?}] {:?}", my_pe, std::thread::current().id(), i.await ); - }); + }) + .block(); cyclic_array.barrier(); block_array.print(); @@ -114,7 +120,7 @@ fn main() { ); elem.load() % 4 == 0 }) - .blocking_for_each(move |(i, elem)| { + .for_each(move |(i, elem)| { println!( "[pe({:?})-{:?}] i: {:?} {:?}", my_pe, @@ -122,7 +128,8 @@ fn main() { i, elem ) - }); + }) + .block(); block_array.barrier(); println!("--------------------------------------------------------"); @@ -137,7 +144,7 @@ fn main() { None } }) - .blocking_for_each(move |(i, elem)| { + .for_each(move |(i, elem)| { println!( "[pe({:?})-{:?}] i: {:?} {:?}", my_pe, @@ -145,7 +152,8 @@ fn 
main() { i, elem ) - }); + }) + .block(); block_array.barrier(); // println!("--------------------------------------------------------"); // println!("filter_map collect"); @@ -165,7 +173,7 @@ fn main() { .local_iter() .skip(10) .enumerate() - .blocking_for_each(move |(i, elem)| { + .for_each(move |(i, elem)| { println!( "[pe({:?})-{:?}] i: {:?} {:?}", my_pe, @@ -173,7 +181,8 @@ fn main() { i, elem ) - }); + }) + .block(); block_array.barrier(); @@ -184,7 +193,7 @@ fn main() { .skip(10) .step_by(3) .enumerate() - .blocking_for_each(move |(i, elem)| { + .for_each(move |(i, elem)| { println!( "[pe({:?})-{:?}] i: {:?} {:?}", my_pe, @@ -192,7 +201,8 @@ fn main() { i, elem ) - }); + }) + .block(); block_array.barrier(); @@ -203,7 +213,7 @@ fn main() { .take(60) .skip(10) .enumerate() - .blocking_for_each(move |(i, elem)| { + .for_each(move |(i, elem)| { println!( "[pe({:?})-{:?}] i: {:?} {:?}", my_pe, @@ -211,7 +221,8 @@ fn main() { i, elem ) - }); + }) + .block(); block_array.barrier(); @@ -223,7 +234,7 @@ fn main() { .skip(10) .take(30) .enumerate() - .blocking_for_each(move |(i, elem)| { + .for_each(move |(i, elem)| { println!( "[pe({:?})-{:?}] i: {:?} {:?}", my_pe, @@ -231,7 +242,8 @@ fn main() { i, elem ) - }); + }) + .block(); block_array.barrier(); diff --git a/examples/bandwidths/atomic_array_get_bw.rs b/examples/bandwidths/atomic_array_get_bw.rs index 8d26bc4e..2515ae78 100644 --- a/examples/bandwidths/atomic_array_get_bw.rs +++ b/examples/bandwidths/atomic_array_get_bw.rs @@ -22,7 +22,8 @@ fn main() { } array .local_iter_mut() - .blocking_for_each(move |elem| *elem = num_pes as u8); //this is pretty slow for atomic arrays as we perform an atomic store for 2^30 elements, so use locallock for initializiation + .for_each(move |elem| *elem = num_pes as u8) + .block(); //this is pretty slow for atomic arrays as we perform an atomic store for 2^30 elements, so use locallock for initializiation let array = array.into_atomic(); //this enforces a wait_all and barrier // array.wait_all(); // array.barrier(); diff --git a/examples/bandwidths/atomic_array_put_bw.rs b/examples/bandwidths/atomic_array_put_bw.rs index f05a24b3..26604649 100644 --- a/examples/bandwidths/atomic_array_put_bw.rs +++ b/examples/bandwidths/atomic_array_put_bw.rs @@ -22,7 +22,8 @@ fn main() { } array .dist_iter_mut() - .blocking_for_each(move |elem| *elem = 255 as u8); //this is can be pretty slow for atomic arrays as we perform an atomic store for 2^30 elements, local lock tends to perform better + .for_each(move |elem| *elem = 255 as u8) + .block(); //this is can be pretty slow for atomic arrays as we perform an atomic store for 2^30 elements, local lock tends to perform better let mut array = array.into_atomic(); //so we simply convert the LocalLockArray array to atomic after initalization world.barrier(); @@ -104,7 +105,8 @@ fn main() { // }; let temp = array.into_local_lock(); temp.dist_iter_mut() - .blocking_for_each(move |elem| *elem = 255 as u8); //this is pretty slow for atomic arrays as we perform an atomic store for 2^30 elements + .for_each(move |elem| *elem = 255 as u8) + .block(); //this is pretty slow for atomic arrays as we perform an atomic store for 2^30 elements array = temp.into_atomic(); world.barrier(); } diff --git a/examples/bandwidths/global_lock_atomic_array_get_bw.rs b/examples/bandwidths/global_lock_atomic_array_get_bw.rs index 9f1d1231..ca3110ee 100644 --- a/examples/bandwidths/global_lock_atomic_array_get_bw.rs +++ b/examples/bandwidths/global_lock_atomic_array_get_bw.rs @@ -26,8 +26,8 @@ fn 
main() { } array .dist_iter_mut() - .blocking_for_each(move |elem| *elem = num_pes as u8); - array.wait_all(); + .for_each(move |elem| *elem = num_pes as u8) + .block(); array.barrier(); world.barrier(); diff --git a/examples/bandwidths/global_lock_atomic_array_put_bw.rs b/examples/bandwidths/global_lock_atomic_array_put_bw.rs index e7f33500..f3ecf0d4 100644 --- a/examples/bandwidths/global_lock_atomic_array_put_bw.rs +++ b/examples/bandwidths/global_lock_atomic_array_put_bw.rs @@ -23,8 +23,8 @@ fn main() { } array .dist_iter_mut() - .blocking_for_each(move |elem| *elem = 255 as u8); - array.wait_all(); + .for_each(move |elem| *elem = 255 as u8) + .block(); array.barrier(); world.barrier(); @@ -98,8 +98,8 @@ fn main() { bws.push((sum as f64 / 1048576.0) / cur_t); array .dist_iter_mut() - .blocking_for_each(move |elem| *elem = 255 as u8); - array.wait_all(); + .for_each(move |elem| *elem = 255 as u8) + .block(); array.barrier(); } if my_pe == num_pes - 1 { diff --git a/examples/bandwidths/local_lock_atomic_array_get_bw.rs b/examples/bandwidths/local_lock_atomic_array_get_bw.rs index c160c541..fae3fff5 100644 --- a/examples/bandwidths/local_lock_atomic_array_get_bw.rs +++ b/examples/bandwidths/local_lock_atomic_array_get_bw.rs @@ -26,8 +26,8 @@ fn main() { } array .dist_iter_mut() - .blocking_for_each(move |elem| *elem = num_pes as u8); - array.wait_all(); + .for_each(move |elem| *elem = num_pes as u8) + .block(); array.barrier(); world.barrier(); diff --git a/examples/bandwidths/local_lock_atomic_array_put_bw.rs b/examples/bandwidths/local_lock_atomic_array_put_bw.rs index ce376976..69994ec9 100644 --- a/examples/bandwidths/local_lock_atomic_array_put_bw.rs +++ b/examples/bandwidths/local_lock_atomic_array_put_bw.rs @@ -23,8 +23,8 @@ fn main() { } array .dist_iter_mut() - .blocking_for_each(move |elem| *elem = 255 as u8); - array.wait_all(); + .for_each(move |elem| *elem = 255 as u8) + .block(); array.barrier(); world.barrier(); @@ -98,8 +98,8 @@ fn main() { bws.push((sum as f64 / 1048576.0) / cur_t); array .dist_iter_mut() - .blocking_for_each(move |elem| *elem = 255 as u8); - array.wait_all(); + .for_each(move |elem| *elem = 255 as u8) + .block(); array.barrier(); } if my_pe == 0 { diff --git a/examples/bandwidths/readonly_array_get_bw.rs b/examples/bandwidths/readonly_array_get_bw.rs index adeaecbc..bc12bf1c 100644 --- a/examples/bandwidths/readonly_array_get_bw.rs +++ b/examples/bandwidths/readonly_array_get_bw.rs @@ -21,10 +21,10 @@ fn main() { } array .dist_iter_mut() - .blocking_for_each(move |elem| *elem = num_pes as u8); + .for_each(move |elem| *elem = num_pes as u8) + .block(); } - array.wait_all(); array.barrier(); let array = array.into_read_only(); diff --git a/examples/bandwidths/readonly_array_get_unchecked_bw.rs b/examples/bandwidths/readonly_array_get_unchecked_bw.rs index 3163808a..c63ad12c 100644 --- a/examples/bandwidths/readonly_array_get_unchecked_bw.rs +++ b/examples/bandwidths/readonly_array_get_unchecked_bw.rs @@ -25,9 +25,9 @@ fn main() { unsafe { array .local_iter_mut() - .blocking_for_each(move |elem| *elem = num_pes as u8); + .for_each(move |elem| *elem = num_pes as u8) + .block(); } - array.wait_all(); array.barrier(); let array = array.into_read_only(); diff --git a/examples/bandwidths/unsafe_array_get_bw.rs b/examples/bandwidths/unsafe_array_get_bw.rs index 0cf35d0d..a903191b 100644 --- a/examples/bandwidths/unsafe_array_get_bw.rs +++ b/examples/bandwidths/unsafe_array_get_bw.rs @@ -22,9 +22,9 @@ fn main() { array .local_iter_mut() - .blocking_for_each(move 
|elem| *elem = num_pes as u8); + .for_each(move |elem| *elem = num_pes as u8) + .block(); } - array.wait_all(); array.barrier(); world.barrier(); diff --git a/examples/bandwidths/unsafe_array_get_unchecked_bw.rs b/examples/bandwidths/unsafe_array_get_unchecked_bw.rs index d64c7999..624ee25f 100644 --- a/examples/bandwidths/unsafe_array_get_unchecked_bw.rs +++ b/examples/bandwidths/unsafe_array_get_unchecked_bw.rs @@ -20,10 +20,9 @@ fn main() { } array .dist_iter_mut() - .blocking_for_each(move |elem| *elem = num_pes as u8); + .for_each(move |elem| *elem = num_pes as u8) + .block(); } - - array.wait_all(); array.barrier(); world.barrier(); diff --git a/examples/bandwidths/unsafe_array_store_bw.rs b/examples/bandwidths/unsafe_array_store_bw.rs index 65aec25c..9a08182f 100644 --- a/examples/bandwidths/unsafe_array_store_bw.rs +++ b/examples/bandwidths/unsafe_array_store_bw.rs @@ -21,7 +21,8 @@ fn main() { } array .dist_iter_mut() - .blocking_for_each(move |elem| *elem = num_pes as u8); + .for_each(move |elem| *elem = num_pes as u8) + .block(); } array.wait_all(); array.barrier(); diff --git a/examples/hello_world/hello_world_array_iteration.rs b/examples/hello_world/hello_world_array_iteration.rs index 80eff712..3857ecd2 100644 --- a/examples/hello_world/hello_world_array_iteration.rs +++ b/examples/hello_world/hello_world_array_iteration.rs @@ -28,7 +28,7 @@ fn main() { array .dist_iter_mut() //create a mutable distributed iterator (i.e. data parallel iteration, similar to Rayon par_iter()) .enumerate() // enumeration with respect to the global array - .blocking_for_each(move |(i, elem)| { + .for_each(move |(i, elem)| { println!( "PE {:?} setting array[{:?}]={:?} using thread {:?}", my_pe, @@ -37,7 +37,8 @@ fn main() { std::thread::current().id() ); elem.store(my_pe); //"store" because this is an AtomicArray - }); + }) + .block(); //wait for all pes to finish world.barrier(); diff --git a/examples/kernels/dft_proxy.rs b/examples/kernels/dft_proxy.rs index 02711900..d766dc4c 100644 --- a/examples/kernels/dft_proxy.rs +++ b/examples/kernels/dft_proxy.rs @@ -338,7 +338,7 @@ fn dft_lamellar_array(signal: UnsafeArray, spectrum: UnsafeArray) -> f spectrum .dist_iter_mut() .enumerate() - .blocking_for_each(move |(k, spec_bin)| { + .for_each(move |(k, spec_bin)| { let mut sum = 0f64; for (i, &x) in signal_clone .buffered_onesided_iter(1000) @@ -351,7 +351,8 @@ fn dft_lamellar_array(signal: UnsafeArray, spectrum: UnsafeArray) -> f sum = sum + twiddle * x; } *spec_bin = sum - }); + }) + .block(); } spectrum.barrier(); timer.elapsed().as_secs_f64() @@ -368,7 +369,7 @@ fn dft_lamellar_array_2(signal: ReadOnlyArray, spectrum: AtomicArray) let _ = spectrum .dist_iter_mut() .enumerate() - .blocking_for_each(move |(k, spec_bin)| { + .for_each(move |(k, spec_bin)| { let mut sum = 0f64; for (i, &x) in signal_clone .buffered_onesided_iter(1000) @@ -381,7 +382,8 @@ fn dft_lamellar_array_2(signal: ReadOnlyArray, spectrum: AtomicArray) sum = sum + twiddle * x; } spec_bin.store(sum); - }); + }) + .block(); spectrum.barrier(); timer.elapsed().as_secs_f64() } @@ -391,25 +393,23 @@ fn dft_lamellar_array_swapped(signal: UnsafeArray, spectrum: UnsafeArray f64 { let timer = Instant::now(); let sig_len = signal.len(); - let mut reqs = vec![]; unsafe { signal .onesided_iter() @@ -434,30 +433,30 @@ fn dft_lamellar_array_opt( .enumerate() .for_each(|(i, chunk)| { let signal = chunk.clone(); - reqs.push( - spectrum - .dist_iter_mut() - .enumerate() - .for_each(move |(k, spec_bin)| { - let mut sum = 0f64; - for (j, &x) in 
signal - .iter() - .enumerate() - .map(|(j, x)| (j + i * buf_size, x)) - { - let angle = -1f64 * (j * k) as f64 * 2f64 * std::f64::consts::PI - / sig_len as f64; - let twiddle = angle * (angle.cos() + angle * angle.sin()); - sum = sum + twiddle * x; - } - - // let _lock = LOCK.lock(); - *spec_bin += sum; - }), - ); + + spectrum + .dist_iter_mut() + .enumerate() + .for_each(move |(k, spec_bin)| { + let mut sum = 0f64; + for (j, &x) in signal + .iter() + .enumerate() + .map(|(j, x)| (j + i * buf_size, x)) + { + let angle = -1f64 * (j * k) as f64 * 2f64 * std::f64::consts::PI + / sig_len as f64; + let twiddle = angle * (angle.cos() + angle * angle.sin()); + sum = sum + twiddle * x; + } + + // let _lock = LOCK.lock(); + *spec_bin += sum; + }) + .spawn(); }); } - spectrum.block_on_all(reqs); + spectrum.wait_all(); spectrum.barrier(); timer.elapsed().as_secs_f64() } @@ -469,7 +468,6 @@ fn dft_lamellar_array_opt_test( ) -> f64 { let timer = Instant::now(); let sig_len = signal.len(); - let mut reqs = vec![]; unsafe { signal .onesided_iter() @@ -479,9 +477,10 @@ fn dft_lamellar_array_opt_test( .enumerate() .for_each(|(i, chunk)| { let signal = chunk.clone(); - reqs.push(spectrum.dist_iter_mut().enumerate().for_each_with_schedule( - Schedule::Dynamic, - move |(k, spec_bin)| { + spectrum + .dist_iter_mut() + .enumerate() + .for_each_with_schedule(Schedule::Dynamic, move |(k, spec_bin)| { let mut sum = 0f64; for (j, &x) in signal .iter() @@ -496,11 +495,11 @@ fn dft_lamellar_array_opt_test( // let _lock = LOCK.lock(); *spec_bin += sum; - }, - )); + }) + .spawn(); }); } - spectrum.block_on_all(reqs); + spectrum.wait_all(); spectrum.barrier(); timer.elapsed().as_secs_f64() } @@ -513,7 +512,6 @@ fn dft_lamellar_array_opt_2( ) -> f64 { let timer = Instant::now(); let sig_len = signal.len(); - let mut reqs = vec![]; signal .onesided_iter() .chunks(buf_size) @@ -522,29 +520,29 @@ fn dft_lamellar_array_opt_2( .enumerate() .for_each(|(i, chunk)| { let signal = chunk.clone(); - reqs.push( - spectrum - .dist_iter_mut() - .enumerate() - .for_each(move |(k, mut spec_bin)| { - let mut sum = 0f64; - unsafe { - for (j, &x) in signal - .iter() - .enumerate() - .map(|(j, x)| (j + i * buf_size, x)) - { - let angle = -1f64 * (j * k) as f64 * 2f64 * std::f64::consts::PI - / sig_len as f64; - let twiddle = angle * (angle.cos() + angle * angle.sin()); - sum = sum + twiddle * x; - } + + spectrum + .dist_iter_mut() + .enumerate() + .for_each(move |(k, mut spec_bin)| { + let mut sum = 0f64; + unsafe { + for (j, &x) in signal + .iter() + .enumerate() + .map(|(j, x)| (j + i * buf_size, x)) + { + let angle = -1f64 * (j * k) as f64 * 2f64 * std::f64::consts::PI + / sig_len as f64; + let twiddle = angle * (angle.cos() + angle * angle.sin()); + sum = sum + twiddle * x; } - spec_bin += sum; - }), - ); + } + spec_bin += sum; + }) + .spawn(); }); - spectrum.block_on_all(reqs); + spectrum.wait_all(); spectrum.barrier(); timer.elapsed().as_secs_f64() } @@ -557,7 +555,6 @@ fn dft_lamellar_array_opt_3( ) -> f64 { let timer = Instant::now(); let sig_len = signal.len(); - let mut reqs = vec![]; signal .onesided_iter() .chunks(buf_size) @@ -566,30 +563,30 @@ fn dft_lamellar_array_opt_3( .enumerate() .for_each(|(i, chunk)| { let signal = chunk.clone(); - reqs.push( - spectrum - .dist_iter_mut() //this locks the LocalLockArray - .enumerate() - .for_each(move |(k, spec_bin)| { - //we are accessing each element independently so free to mutate - let mut sum = 0f64; - unsafe { - for (j, &x) in signal - .iter() - .enumerate() - .map(|(j, x)| (j + 
i * buf_size, x)) - { - let angle = -1f64 * (j * k) as f64 * 2f64 * std::f64::consts::PI - / sig_len as f64; - let twiddle = angle * (angle.cos() + angle * angle.sin()); - sum = sum + twiddle * x; - } + + spectrum + .dist_iter_mut() //this locks the LocalLockArray + .enumerate() + .for_each(move |(k, spec_bin)| { + //we are accessing each element independently so free to mutate + let mut sum = 0f64; + unsafe { + for (j, &x) in signal + .iter() + .enumerate() + .map(|(j, x)| (j + i * buf_size, x)) + { + let angle = -1f64 * (j * k) as f64 * 2f64 * std::f64::consts::PI + / sig_len as f64; + let twiddle = angle * (angle.cos() + angle * angle.sin()); + sum = sum + twiddle * x; } - *spec_bin += sum; - }), - ); + } + *spec_bin += sum; + }) + .spawn(); }); - spectrum.block_on_all(reqs); + spectrum.wait_all(); spectrum.barrier(); timer.elapsed().as_secs_f64() } @@ -653,7 +650,8 @@ fn main() { full_signal_array .dist_iter_mut() .enumerate() - .blocking_for_each(move |(i, x)| *x = full_signal_clone.as_mut_slice().unwrap()[i]); + .for_each(move |(i, x)| *x = full_signal_clone.as_mut_slice().unwrap()[i]) + .block(); full_signal_array.barrier(); partial_spectrum.put(my_pe, 0, full_spectrum.sub_region(0..array_len)); @@ -768,7 +766,8 @@ fn main() { unsafe { full_spectrum_array .dist_iter_mut() - .blocking_for_each(|elem| *elem = 0.0); + .for_each(|elem| *elem = 0.0) + .block(); } full_spectrum_array.wait_all(); full_spectrum_array.barrier(); @@ -816,7 +815,8 @@ fn main() { unsafe { full_spectrum_array .dist_iter_mut() - .blocking_for_each(|elem| *elem = 0.0); + .for_each(|elem| *elem = 0.0) + .block(); } full_spectrum_array.wait_all(); full_spectrum_array.barrier(); @@ -835,7 +835,8 @@ fn main() { unsafe { full_spectrum_array .dist_iter_mut() - .blocking_for_each(|elem| *elem = 0.0); + .for_each(|elem| *elem = 0.0) + .block(); } full_spectrum_array.wait_all(); full_spectrum_array.barrier(); @@ -883,7 +884,8 @@ fn main() { world.barrier(); full_spectrum_array .dist_iter_mut() - .blocking_for_each(|elem| elem.store(0.0)); + .for_each(|elem| elem.store(0.0)) + .block(); full_spectrum_array.wait_all(); full_spectrum_array.barrier(); // let timer = Instant::now(); @@ -910,7 +912,8 @@ fn main() { world.barrier(); full_spectrum_array .dist_iter_mut() - .blocking_for_each(|elem| *elem = 0.0); + .for_each(|elem| *elem = 0.0) + .block(); full_spectrum_array.wait_all(); full_spectrum_array.barrier(); if my_pe == 0 { diff --git a/examples/kernels/parallel_array_gemm.rs b/examples/kernels/parallel_array_gemm.rs index 6f027b5f..25f8c374 100644 --- a/examples/kernels/parallel_array_gemm.rs +++ b/examples/kernels/parallel_array_gemm.rs @@ -35,10 +35,11 @@ fn main() { //initialize matrices a.dist_iter_mut() .enumerate() - .blocking_for_each(|(i, x)| *x = i as f32); + .for_each(|(i, x)| *x = i as f32) + .block(); b.dist_iter_mut() .enumerate() - .blocking_for_each(move |(i, x)| { + .for_each(move |(i, x)| { //need global index so use dist_iter //identity matrix let row = i / dim; @@ -48,8 +49,9 @@ fn main() { } else { *x = 0 as f32; } - }); - c.dist_iter_mut().blocking_for_each(|x| x.store(0.0)); + }) + .block(); + c.dist_iter_mut().for_each(|x| x.store(0.0)).block(); world.wait_all(); world.barrier(); @@ -69,7 +71,7 @@ fn main() { .for_each(|(j, col)| { let col = col.clone(); let c = c.clone(); - let _ = a + a // .local_iter() //LocalIterator (each pe will iterate through only its local data -- in parallel) // .chunks(n) // chunk by the row size .local_chunks(n) @@ -81,7 +83,8 @@ fn main() { //we know all updates to c 
are local so directly update the raw data //we could also use: //c.add(j+i*m,sum) -- but some overheads are introduce from PGAS calculations performed by the runtime, and since its all local updates we can avoid them - }); + }) + .spawn(); }); world.wait_all(); diff --git a/examples/kernels/parallel_blocked_array_gemm.rs b/examples/kernels/parallel_blocked_array_gemm.rs index 3f948b36..cd345e23 100644 --- a/examples/kernels/parallel_blocked_array_gemm.rs +++ b/examples/kernels/parallel_blocked_array_gemm.rs @@ -42,10 +42,11 @@ fn main() { //initialize a.dist_iter_mut() .enumerate() - .blocking_for_each(|(i, x)| *x = i as f32); + .for_each(|(i, x)| *x = i as f32) + .block(); b.dist_iter_mut() .enumerate() - .blocking_for_each(move |(i, x)| { + .for_each(move |(i, x)| { //identity matrix let row = i / dim; let col = i % dim; @@ -54,8 +55,9 @@ fn main() { } else { *x = 0 as f32; } - }); - c.dist_iter_mut().blocking_for_each(|x| x.store(0.0)); + }) + .block(); + c.dist_iter_mut().for_each(|x| x.store(0.0)).block(); world.barrier(); let a = a.into_read_only(); let b = b.into_read_only(); @@ -78,11 +80,12 @@ fn main() { nblks_array .dist_iter_mut() .enumerate() - .blocking_for_each(move |(g_i, x)| { + .for_each(move |(g_i, x)| { let i = g_i % (n_blks * n_blks); x.j = i / n_blks; x.k = i % n_blks - }); + }) + .block(); let nblks_array = nblks_array.into_read_only(); let start = std::time::Instant::now(); @@ -91,7 +94,7 @@ fn main() { let c_clone = c.clone(); nblks_array .dist_iter() - .blocking_for_each_async(move |block| { + .for_each_async(move |block| { let b = b_clone.clone(); let a: ReadOnlyArray = a_clone.clone(); let c = c_clone.clone(); @@ -172,7 +175,8 @@ fn main() { } } // } - }); + }) + .block(); world.wait_all(); world.barrier(); let elapsed = start.elapsed().as_secs_f64(); diff --git a/examples/kernels/safe_parallel_blocked_array_gemm.rs b/examples/kernels/safe_parallel_blocked_array_gemm.rs index a85f5ed7..29b5bdf1 100644 --- a/examples/kernels/safe_parallel_blocked_array_gemm.rs +++ b/examples/kernels/safe_parallel_blocked_array_gemm.rs @@ -48,10 +48,11 @@ fn main() { } }); let c_init = c.dist_iter_mut().for_each(|x| *x = 0.0); + world.block_on_all([a_init, b_init, c_init]); let a = a.into_read_only(); let b = b.into_read_only(); - world.block_on_all((a, b, c)); - worldc.barrier(); + + world.barrier(); let num_gops = ((2 * dim * dim * dim) - dim * dim) as f64 / 1_000_000_000.0; // accurate for square matrices let blocksize = std::cmp::min(1000000, dim / num_pes); // / 32; @@ -72,14 +73,16 @@ fn main() { nblks_array .dist_iter_mut() .enumerate() - .blocking_for_each(move |(i, x)| *x = i % n_blks); + .for_each(move |(i, x)| *x = i % n_blks) + .block(); let m_blks_pe_array = LocalLockArray::new(&world, m_blks_pe * num_pes, Distribution::Block); m_blks_pe_array .dist_iter_mut() .enumerate() - .blocking_for_each(move |(i, x)| *x = i % m_blks_pe); + .for_each(move |(i, x)| *x = i % m_blks_pe) + .block(); world.barrier(); let nblks_array = nblks_array.into_read_only(); let m_blks_pe_array = m_blks_pe_array.into_read_only(); diff --git a/examples/kernels/serial_array_gemm.rs b/examples/kernels/serial_array_gemm.rs index b98f68da..1ecbfd4f 100644 --- a/examples/kernels/serial_array_gemm.rs +++ b/examples/kernels/serial_array_gemm.rs @@ -30,10 +30,11 @@ fn main() { a.dist_iter_mut() .enumerate() - .blocking_for_each(|(i, x)| *x = i as f32); + .for_each(|(i, x)| *x = i as f32) + .block(); b.dist_iter_mut() .enumerate() - .blocking_for_each(move |(i, x)| { + .for_each(move |(i, x)| { 
//identity matrix let row = i / dim; let col = i % dim; @@ -42,8 +43,9 @@ fn main() { } else { *x = 0 as f32; } - }); - c.dist_iter_mut().blocking_for_each(|x| x.store(0.0)); + }) + .block(); + c.dist_iter_mut().for_each(|x| x.store(0.0)).block(); world.barrier(); diff --git a/src/array.rs b/src/array.rs index 143b45a6..b8af8f52 100644 --- a/src/array.rs +++ b/src/array.rs @@ -187,28 +187,28 @@ pub struct ReduceKey { } crate::inventory::collect!(ReduceKey); -impl Dist for bool {} -lamellar_impl::generate_reductions_for_type_rt!(true, u8, usize); -lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, usize); -lamellar_impl::generate_reductions_for_type_rt!(false, f32); -lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32); +// impl Dist for bool {} +// lamellar_impl::generate_reductions_for_type_rt!(true, u8, usize); +// lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, usize); +// lamellar_impl::generate_reductions_for_type_rt!(false, f32); +// lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32); // lamellar_impl::generate_reductions_for_type_rt!(false, u128); // lamellar_impl::generate_ops_for_type_rt!(true, false, true, u128); -// lamellar_impl::generate_reductions_for_type_rt!(true, u8, u16, u32, u64, usize); -// lamellar_impl::generate_reductions_for_type_rt!(false, u128); -// lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, u16, u32, u64, usize); -// lamellar_impl::generate_ops_for_type_rt!(true, false, true, u128); +lamellar_impl::generate_reductions_for_type_rt!(true, u8, u16, u32, u64, usize); +lamellar_impl::generate_reductions_for_type_rt!(false, u128); +lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, u16, u32, u64, usize); +lamellar_impl::generate_ops_for_type_rt!(true, false, true, u128); -// lamellar_impl::generate_reductions_for_type_rt!(true, i8, i16, i32, i64, isize); -// lamellar_impl::generate_reductions_for_type_rt!(false, i128); -// lamellar_impl::generate_ops_for_type_rt!(true, true, true, i8, i16, i32, i64, isize); -// lamellar_impl::generate_ops_for_type_rt!(true, false, true, i128); +lamellar_impl::generate_reductions_for_type_rt!(true, i8, i16, i32, i64, isize); +lamellar_impl::generate_reductions_for_type_rt!(false, i128); +lamellar_impl::generate_ops_for_type_rt!(true, true, true, i8, i16, i32, i64, isize); +lamellar_impl::generate_ops_for_type_rt!(true, false, true, i128); -// lamellar_impl::generate_reductions_for_type_rt!(false, f32, f64); -// lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32, f64); +lamellar_impl::generate_reductions_for_type_rt!(false, f32, f64); +lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32, f64); -// lamellar_impl::generate_ops_for_bool_rt!(); +lamellar_impl::generate_ops_for_bool_rt!(); impl Dist for Option {} impl ArrayOps for Option {} diff --git a/src/array/iterator/distributed_iterator.rs b/src/array/iterator/distributed_iterator.rs index 8544d140..a0f86480 100644 --- a/src/array/iterator/distributed_iterator.rs +++ b/src/array/iterator/distributed_iterator.rs @@ -43,7 +43,6 @@ use crate::array::{ }; use crate::memregion::Dist; use crate::LamellarTeamRT; - use crate::active_messaging::SyncSend; use futures_util::Future; @@ -53,7 +52,7 @@ use std::pin::Pin; use std::sync::Arc; macro_rules! consumer_impl { - ($name:ident<$($generics:ident),*>($($arg:ident : $arg_ty:ty),*); [$($return_type: tt)*]; [$($bounds:tt)+] ; [$(-> $($blocking_ret:tt)*)? 
]) => { + ($name:ident<$($generics:ident),*>($($arg:ident : $arg_ty:ty),*); [$($return_type: tt)*]; [$($bounds:tt)+] ; [$($blocking_ret:tt)*]) => { fn $name<$($generics),*>(&self, $($arg : $arg_ty),*) -> $($return_type)* where $($bounds)+ @@ -73,26 +72,47 @@ macro_rules! consumer_impl { self.as_inner().[<$name _with_schedule>](sched, $($arg),*) } - fn []<$($generics),*>( - &self, - $($arg : $arg_ty),* - ) $(-> $($blocking_ret)*)? - where - $($bounds)+ - { - self.as_inner().[]($($arg),*) - } - - fn []<$($generics),*>( - &self, - sched: Schedule, - $($arg : $arg_ty),* - ) $(-> $($blocking_ret)*)? - where - $($bounds)+ - { - self.as_inner().[](sched, $($arg),*) - } + // fn []<$($generics),*>( + // &self, + // $($arg : $arg_ty),* + // ) -> LamellarTask<$($blocking_ret)*> + // where + // $($bounds)+ + // { + // self.as_inner().[]($($arg),*) + // } + + // fn []<$($generics),*>( + // &self, + // sched: Schedule, + // $($arg : $arg_ty),* + // ) -> LamellarTask<$($blocking_ret)*> + // where + // $($bounds)+ + // { + // self.as_inner().[](sched, $($arg),*) + // } + + // fn []<$($generics),*>( + // &self, + // $($arg : $arg_ty),* + // ) -> $($blocking_ret)* + // where + // $($bounds)+ + // { + // self.as_inner().[]($($arg),*) + // } + + // fn []<$($generics),*>( + // &self, + // sched: Schedule, + // $($arg : $arg_ty),* + // ) -> $($blocking_ret)* + // where + // $($bounds)+ + // { + // self.as_inner().[](sched, $($arg),*) + // } } }; } @@ -103,43 +123,43 @@ pub trait DistIteratorLauncher: InnerArray { for_each(iter: &I, op: F); [DistIterForEachHandle]; [I: DistributedIterator + 'static, F: Fn(I::Item) + SyncSend + Clone + 'static]; - [] + [()] ); consumer_impl!( for_each_async(iter: &I, op: F); [DistIterForEachHandle]; [I: DistributedIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, Fut: Future + Send + 'static]; - []); + [()]); consumer_impl!( reduce(iter: &I, op: F); [DistIterReduceHandle]; [I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static]; - [-> Option]); + [Option]); consumer_impl!( collect(iter: &I, d: Distribution); [DistIterCollectHandle]; [I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static]; - [-> A]); + [A]); consumer_impl!( collect_async(iter: &I, d: Distribution); [DistIterCollectHandle]; [I: DistributedIterator + 'static, I::Item: Future + Send + 'static,B: Dist + ArrayOps,A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static,]; - [-> A]); + [A]); consumer_impl!( count(iter: &I); [DistIterCountHandle]; [I: DistributedIterator + 'static ]; - [-> usize]); + [usize]); consumer_impl!( sum(iter: &I); [DistIterSumHandle]; [I: DistributedIterator + 'static, I::Item: Dist + ArrayOps + std::iter::Sum, ]; - [-> I::Item]); + [I::Item]); //#[doc(hidden)] fn global_index_from_local(&self, index: usize, chunk_size: usize) -> Option { @@ -171,6 +191,7 @@ pub trait DistIteratorLauncher: InnerArray { fn team(&self) -> Pin> { self.as_inner().team() } + } /// An interface for dealing with distributed iterators (intended as a parallel and distributed version of the standard iterator trait) @@ -342,6 +363,8 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { Monotonic::new(self, 0) } + + /// Calls a closure on each element of a Distributed Iterator in parallel and distributed on each PE (which owns data of the iterated array). 
/// /// Calling this function invokes an implicit barrier across all PEs in the Array @@ -349,8 +372,8 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// This call utilizes the [Schedule::Static][crate::array::iterator::Schedule] policy. /// /// This function returns a future which can be used to poll for completion of the iteration. - /// Note calling this function launches the iteration regardless of if the returned future is used or not. - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][DistIterForEachHandle::spawn] or [blocked on][DistIterForEachHandle::block] /// # Examples ///``` /// use lamellar::array::prelude::*; @@ -364,7 +387,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// .for_each(move |elem| println!("{:?} {elem}",std::thread::current().id())) /// ); ///``` - #[must_use] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] fn for_each(&self, op: F) -> DistIterForEachHandle where F: Fn(Self::Item) + SyncSend + Clone + 'static, @@ -372,32 +395,65 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { self.array().for_each(self, op) } - /// Calls a closure on each element of a Distributed Iterator in parallel and distributed on each PE (which owns data of the iterated array). - /// - /// Calling this function invokes an implicit barrier across all PEs in the Array - /// - /// This call utilizes the [Schedule::Static][crate::array::iterator::Schedule] policy. - /// - /// The iteration will have been completed by the time this function returns - /// - /// # Examples - ///``` - /// use lamellar::array::prelude::*; - /// - /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - /// - /// array - /// .dist_iter() - /// .blocking_for_each(move |elem| println!("{:?} {elem}",std::thread::current().id())) - /// ); - ///``` - fn blocking_for_each(&self, op: F) - where - F: Fn(Self::Item) + SyncSend + Clone + 'static, - { - self.array().blocking_for_each(self, op) - } + // /// Calls a closure on each element of a Distributed Iterator in parallel and distributed on each PE (which owns data of the iterated array). + // /// + // /// Calling this function invokes an implicit barrier across all PEs in the Array + // /// + // /// This call utilizes the [Schedule::Static][crate::array::iterator::Schedule] policy. + // /// + // /// This function returns a future which can be used to poll for completion of the iteration. + // /// # Note + // /// Calling this function launches the iteration regardless of if the returned future is used or not. + // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// + // /// let world = LamellarWorldBuilder::new().build(); + // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + // /// + // /// let _ = array + // /// .dist_iter() + // /// .for_each(move |elem| println!("{:?} {elem}",std::thread::current().id())); + // /// array.wait_all(); //wait for the iteration to complete + // /// + // ///``` + // #[must_use = "The iteration has already been launched. Await this future to wait for completion and retrieve the result. 
+ // You can use 'let _ = spawn_[iterator]` to supress the warning, but likely will want to also call '.wait_all()' at + // somepoint to ensure the iteration has completed"] + // fn spawn_for_each(&self, op: F) -> LamellarTask<()> + // where + // F: Fn(Self::Item) + SyncSend + Clone + 'static, + // { + // self.array().spawn_for_each(self, op) + // } + + // /// Calls a closure on each element of a Distributed Iterator in parallel and distributed on each PE (which owns data of the iterated array). + // /// + // /// Calling this function invokes an implicit barrier across all PEs in the Array + // /// + // /// This call utilizes the [Schedule::Static][crate::array::iterator::Schedule] policy. + // /// + // /// The iteration will have been completed by the time this function returns + // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// + // /// let world = LamellarWorldBuilder::new().build(); + // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + // /// + // /// array + // /// .dist_iter() + // /// .blocking_for_each(move |elem| println!("{:?} {elem}",std::thread::current().id())) + // /// ); + // ///``` + // fn blocking_for_each(&self, op: F) + // where + // F: Fn(Self::Item) + SyncSend + Clone + 'static, + // { + // self.array().blocking_for_each(self, op) + // } /// Calls a closure and immediately awaits the result on each element of a Distributed Iterator in parallel and distributed on each PE (which owns data of the iterated array). /// @@ -408,7 +464,8 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// Each thread will only drive a single future at a time. /// /// This function returns a future which can be used to poll for completion of the iteration. - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][DistIterForEachHandle::spawn] or [blocked on][DistIterForEachHandle::block] /// # Examples ///``` /// use lamellar::array::prelude::*; @@ -428,7 +485,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// fut.await; /// } ///``` - #[must_use] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] fn for_each_async(&self, op: F) -> DistIterForEachHandle where F: Fn(Self::Item) -> Fut + SyncSend + Clone + 'static, @@ -437,48 +494,91 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { self.array().for_each_async(self, op) } - /// Calls a closure and immediately awaits the result on each element of a Distributed Iterator in parallel and distributed on each PE (which owns data of the iterated array). - /// - /// Calling this function invokes an implicit barrier across all PEs in the Array - /// - /// The supplied closure must return a future. - /// - /// Each thread will only drive a single future at a time. 
- /// - /// Iteration is completed by the time this function returns - /// - /// # Examples - ///``` - /// use lamellar::array::prelude::*; - /// - /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - /// - /// array.dist_iter().blocking_for_each_async(|elem| async move { - /// async_std::task::yield_now().await; - /// println!("{:?} {elem}",std::thread::current().id()) - /// }); - /// ``` - /// essentially the for_each_async call gets converted into (on each thread) - ///```ignore - /// for fut in array.iter(){ - /// fut.await; - /// } - ///``` - fn blocking_for_each_async(&self, op: F) - where - F: Fn(Self::Item) -> Fut + SyncSend + Clone + 'static, - Fut: Future + Send + 'static, - { - self.array().blocking_for_each_async(self, op) - } + // /// Calls a closure and immediately awaits the result on each element of a Distributed Iterator in parallel and distributed on each PE (which owns data of the iterated array). + // /// + // /// Calling this function invokes an implicit barrier across all PEs in the Array + // /// + // /// The supplied closure must return a future. + // /// + // /// Each thread will only drive a single future at a time. + // /// + // /// This function returns a future which can be used to poll for completion of the iteration. + // /// # Note + // /// Calling this function launches the iteration regardless of if the returned future is used or not. + // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// + // /// let world = LamellarWorldBuilder::new().build(); + // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + // /// + // /// let iter = array.dist_iter().spawn_for_each_async(|elem| async move { + // /// async_std::task::yield_now().await; + // /// println!("{:?} {elem}",std::thread::current().id()) + // /// }); + // /// world.block_on(iter); + // /// ``` + // /// essentially the for_each_async call gets converted into (on each thread) + // ///```ignore + // /// for fut in array.iter(){ + // /// fut.await; + // /// } + // ///``` + // #[must_use = "The iteration has already been launched. Await this future to wait for completion and retrieve the result. + // You can use 'let _ = spawn_[iterator]` to supress the warning, but likely will want to also call '.wait_all()' at + // somepoint to ensure the iteration has completed"] + // fn spawn_for_each_async(&self, op: F) -> LamellarTask<()> + // where + // F: Fn(Self::Item) -> Fut + SyncSend + Clone + 'static, + // Fut: Future + Send + 'static, + // { + // self.array().spawn_for_each_async(self, op) + // } + + // /// Calls a closure and immediately awaits the result on each element of a Distributed Iterator in parallel and distributed on each PE (which owns data of the iterated array). + // /// + // /// Calling this function invokes an implicit barrier across all PEs in the Array + // /// + // /// The supplied closure must return a future. + // /// + // /// Each thread will only drive a single future at a time. 
+ // /// + // /// Iteration is completed by the time this function returns + // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// + // /// let world = LamellarWorldBuilder::new().build(); + // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + // /// + // /// array.dist_iter().blocking_for_each_async(|elem| async move { + // /// async_std::task::yield_now().await; + // /// println!("{:?} {elem}",std::thread::current().id()) + // /// }); + // /// ``` + // /// essentially the for_each_async call gets converted into (on each thread) + // ///```ignore + // /// for fut in array.iter(){ + // /// fut.await; + // /// } + // ///``` + // fn blocking_for_each_async(&self, op: F) + // where + // F: Fn(Self::Item) -> Fut + SyncSend + Clone + 'static, + // Fut: Future + Send + 'static, + // { + // self.array().blocking_for_each_async(self, op) + // } /// Calls a closure on each element of a Distributed Iterator in parallel and distributed on each PE (which owns data of the iterated array) using the specififed [Schedule][crate::array::iterator::Schedule] policy. /// /// Calling this function invokes an implicit barrier across all PEs in the Array /// /// This function returns a future which can be used to poll for completion of the iteration. - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][DistIterForEachHandle::spawn] or [blocked on][DistIterForEachHandle::block] /// # Examples ///``` /// use lamellar::array::prelude::*; @@ -486,10 +586,9 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// let world = LamellarWorldBuilder::new().build(); /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); /// - /// array.dist_iter().for_each_with_schedule(Schedule::WorkStealing, |elem| println!("{:?} {elem}",std::thread::current().id())); - /// array.wait_all(); + /// array.block_on(array.dist_iter().for_each_with_schedule(Schedule::WorkStealing, |elem| println!("{:?} {elem}",std::thread::current().id()))); ///``` - #[must_use] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] fn for_each_with_schedule(&self, sched: Schedule, op: F) -> DistIterForEachHandle where F: Fn(Self::Item) + SyncSend + Clone + 'static, @@ -497,27 +596,55 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { self.array().for_each_with_schedule(sched, self, op) } - /// Calls a closure on each element of a Distributed Iterator in parallel and distributed on each PE (which owns data of the iterated array) using the specififed [Schedule][crate::array::iterator::Schedule] policy. 
- /// - /// Calling this function invokes an implicit barrier across all PEs in the Array - /// - /// Iteration is completed by the time this function returns - /// - /// # Examples - ///``` - /// use lamellar::array::prelude::*; - /// - /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - /// - /// array.dist_iter().blocking_for_each_with_schedule(Schedule::WorkStealing, |elem| println!("{:?} {elem}",std::thread::current().id())); - ///``` - fn blocking_for_each_with_schedule(&self, sched: Schedule, op: F) - where - F: Fn(Self::Item) + SyncSend + Clone + 'static, - { - self.array().blocking_for_each_with_schedule(sched, self, op) - } + // /// Calls a closure on each element of a Distributed Iterator in parallel and distributed on each PE (which owns data of the iterated array) using the specififed [Schedule][crate::array::iterator::Schedule] policy. + // /// + // /// Calling this function invokes an implicit barrier across all PEs in the Array + // /// + // /// This function returns a future which can be used to poll for completion of the iteration. + // /// # Note + // /// Calling this function launches the iteration regardless of if the returned future is used or not. + // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// + // /// let world = LamellarWorldBuilder::new().build(); + // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + // /// + // /// array.dist_iter().for_each_with_schedule(Schedule::WorkStealing, |elem| println!("{:?} {elem}",std::thread::current().id())); + // /// array.wait_all(); + // ///``` + // #[must_use = "The iteration has already been launched. Await this future to wait for completion and retrieve the result. + // You can use 'let _ = spawn_[iterator]` to supress the warning, but likely will want to also call '.wait_all()' at + // somepoint to ensure the iteration has completed"] + // fn spawn_for_each_with_schedule(&self, sched: Schedule, op: F) -> LamellarTask<()> + // where + // F: Fn(Self::Item) + SyncSend + Clone + 'static, + // { + // self.array().spawn_for_each_with_schedule(sched, self, op) + // } + + // /// Calls a closure on each element of a Distributed Iterator in parallel and distributed on each PE (which owns data of the iterated array) using the specififed [Schedule][crate::array::iterator::Schedule] policy. + // /// + // /// Calling this function invokes an implicit barrier across all PEs in the Array + // /// + // /// Iteration is completed by the time this function returns + // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// + // /// let world = LamellarWorldBuilder::new().build(); + // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + // /// + // /// array.dist_iter().blocking_for_each_with_schedule(Schedule::WorkStealing, |elem| println!("{:?} {elem}",std::thread::current().id())); + // ///``` + // fn blocking_for_each_with_schedule(&self, sched: Schedule, op: F) + // where + // F: Fn(Self::Item) + SyncSend + Clone + 'static, + // { + // self.array().blocking_for_each_with_schedule(sched, self, op) + // } /// Calls a closure and immediately awaits the result on each element of a Distributed Iterator in parallel and distributed on each PE (which owns data of the iterated array) using the specififed [Schedule][crate::array::iterator::Schedule] policy. 
/// @@ -529,7 +656,8 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// Each thread will only drive a single future at a time. /// /// This function returns a future which can be used to poll for completion of the iteration. - /// Note calling this function launches the iteration regardless of if the returned future is used or not. + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][DistIterForEachHandle::spawn] or [blocked on][DistIterForEachHandle::block] /// /// # Examples ///``` @@ -538,13 +666,13 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// let world = LamellarWorldBuilder::new().build(); /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); /// - /// array.dist_iter().for_each_async_with_schedule(Schedule::Chunk(10),|elem| async move { + /// let iter = array.dist_iter().spawn_for_each_async_with_schedule(Schedule::Chunk(10),|elem| async move { /// async_std::task::yield_now().await; /// println!("{:?} {elem}",std::thread::current().id()) /// }); - /// array.wait_all(); + /// array.block_on(iter); ///``` - #[must_use] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] fn for_each_async_with_schedule(&self, sched: Schedule, op: F) -> DistIterForEachHandle where F: Fn(Self::Item) -> Fut + SyncSend + Clone + 'static, @@ -553,43 +681,81 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { self.array().for_each_async_with_schedule(sched, self, op) } - /// Calls a closure and immediately awaits the result on each element of a Distributed Iterator in parallel and distributed on each PE (which owns data of the iterated array) using the specififed [Schedule][crate::array::iterator::Schedule] policy. - /// - /// Calling this function invokes an implicit barrier across all PEs in the Array, after this barrier no further communication is performed - /// as each PE will only process elements local to itself - /// - /// The supplied closure must return a future. - /// - /// Each thread will only drive a single future at a time. - /// - /// Iteration is completed by the time this function returns - /// - /// # Examples - ///``` - /// use lamellar::array::prelude::*; - /// - /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - /// - /// array.dist_iter().blocking_for_each_async_with_schedule(Schedule::Chunk(10),|elem| async move { - /// async_std::task::yield_now().await; - /// println!("{:?} {elem}",std::thread::current().id()) - /// }); - ///``` - fn blocking_for_each_async_with_schedule(&self, sched: Schedule, op: F) - where - F: Fn(Self::Item) -> Fut + SyncSend + Clone + 'static, - Fut: Future + Send + 'static, - { - self.array().blocking_for_each_async_with_schedule(sched, self, op) - } + // /// Calls a closure and immediately awaits the result on each element of a Distributed Iterator in parallel and distributed on each PE (which owns data of the iterated array) using the specififed [Schedule][crate::array::iterator::Schedule] policy. + // /// + // /// Calling this function invokes an implicit barrier across all PEs in the Array, after this barrier no further communication is performed + // /// as each PE will only process elements local to itself + // /// + // /// The supplied closure must return a future. 
+ // /// + // /// Each thread will only drive a single future at a time. + // /// + // /// This function returns a future which can be used to poll for completion of the iteration. + // /// # Note + // /// Calling this function launches the iteration regardless of if the returned future is used or not. + // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// + // /// let world = LamellarWorldBuilder::new().build(); + // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + // /// + // /// array.dist_iter().spawn_for_each_async_with_schedule(Schedule::Chunk(10),|elem| async move { + // /// async_std::task::yield_now().await; + // /// println!("{:?} {elem}",std::thread::current().id()) + // /// }); + // /// array.wait_all(); + // ///``` + // #[must_use = "The iteration has already been launched. Await this future to wait for completion and retrieve the result. + // You can use 'let _ = spawn_[iterator]` to supress the warning, but likely will want to also call '.wait_all()' at + // somepoint to ensure the iteration has completed"] + // fn spawn_for_each_async_with_schedule(&self, sched: Schedule, op: F) -> LamellarTask<()> + // where + // F: Fn(Self::Item) -> Fut + SyncSend + Clone + 'static, + // Fut: Future + Send + 'static, + // { + // self.array().spawn_for_each_async_with_schedule(sched, self, op) + // } + + // /// Calls a closure and immediately awaits the result on each element of a Distributed Iterator in parallel and distributed on each PE (which owns data of the iterated array) using the specififed [Schedule][crate::array::iterator::Schedule] policy. + // /// + // /// Calling this function invokes an implicit barrier across all PEs in the Array, after this barrier no further communication is performed + // /// as each PE will only process elements local to itself + // /// + // /// The supplied closure must return a future. + // /// + // /// Each thread will only drive a single future at a time. + // /// + // /// Iteration is completed by the time this function returns + // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// + // /// let world = LamellarWorldBuilder::new().build(); + // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + // /// + // /// array.dist_iter().blocking_for_each_async_with_schedule(Schedule::Chunk(10),|elem| async move { + // /// async_std::task::yield_now().await; + // /// println!("{:?} {elem}",std::thread::current().id()) + // /// }); + // ///``` + // fn blocking_for_each_async_with_schedule(&self, sched: Schedule, op: F) + // where + // F: Fn(Self::Item) -> Fut + SyncSend + Clone + 'static, + // Fut: Future + Send + 'static, + // { + // self.array().blocking_for_each_async_with_schedule(sched, self, op) + // } /// Reduces the elements of the dist iterator using the provided closure /// /// This function returns a future which needs to be driven to completion to retrieve the reduced value. /// /// This call utilizes the [Schedule::Static][crate::array::iterator::Schedule] policy. 
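// Editor's note: a hedged sketch, not part of the patch, of driving the reduce handle described
// above with the new handle API; `array` is assumed to be the ReadOnlyArray<usize> from the doc
// example, and `block()` is assumed to return the same Option<item> the old blocking call did.
let sum = array.dist_iter().reduce(|acc, elem| acc + elem).block();
// `sum` is None only when the iterator visited no elements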
- /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][DistIterReduceHandle::spawn] or [blocked on][DistIterReduceHandle::block] /// # Examples ///``` /// use lamellar::array::prelude::*; @@ -600,7 +766,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// let req = array.dist_iter().reduce(|acc,elem| acc+elem); /// let sum = array.block_on(req); //wait on the collect request to get the new array ///``` - #[must_use] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] fn reduce(&self, op: F) -> DistIterReduceHandle where // &'static Self: LocalIterator + 'static, @@ -610,33 +776,64 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { self.array().reduce(self, op) } - /// Reduces the elements of the dist iterator using the provided closure - /// - /// The function returns the reduced value - /// - /// This call utilizes the [Schedule::Static][crate::array::iterator::Schedule] policy. - /// - /// # Examples - ///``` - /// use lamellar::array::prelude::*; - /// - /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - /// - /// let req = array.dist_iter().blocking_reduce(|acc,elem| acc+elem); - ///``` - fn blocking_reduce(&self, op: F) -> Option - where - // &'static Self: LocalIterator + 'static, - Self::Item: Dist + ArrayOps, - F: Fn(Self::Item, Self::Item) -> Self::Item + SyncSend + Clone + 'static, - { - self.array().blocking_reduce(self, op) - } + // /// Reduces the elements of the dist iterator using the provided closure + // /// + // /// This function returns a future which needs to be driven to completion to retrieve the reduced value. + // /// + // /// This call utilizes the [Schedule::Static][crate::array::iterator::Schedule] policy. + // /// # Note + // /// Calling this function launches the iteration regardless of if the returned future is used or not. + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// + // /// let world = LamellarWorldBuilder::new().build(); + // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + // /// + // /// let req = array.dist_iter().spawn_reduce(|acc,elem| acc+elem); + // /// let sum = array.block_on(req); //wait on the collect request to get the new array + // ///``` + // #[must_use = "The iteration has already been launched. Await this future to wait for completion and retrieve the result. + // You can use 'let _ = spawn_[iterator]` to supress the warning, but likely will want to also call '.wait_all()' at + // somepoint to ensure the iteration has completed"] + // fn spawn_reduce(&self, op: F) -> LamellarTask> + // where + // // &'static Self: LocalIterator + 'static, + // Self::Item: Dist + ArrayOps, + // F: Fn(Self::Item, Self::Item) -> Self::Item + SyncSend + Clone + 'static, + // { + // self.array().spawn_reduce(self, op) + // } + + // /// Reduces the elements of the dist iterator using the provided closure + // /// + // /// The function returns the reduced value + // /// + // /// This call utilizes the [Schedule::Static][crate::array::iterator::Schedule] policy. 
+ // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// + // /// let world = LamellarWorldBuilder::new().build(); + // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + // /// + // /// let req = array.dist_iter().blocking_reduce(|acc,elem| acc+elem); + // ///``` + // fn blocking_reduce(&self, op: F) -> Option + // where + // // &'static Self: LocalIterator + 'static, + // Self::Item: Dist + ArrayOps, + // F: Fn(Self::Item, Self::Item) -> Self::Item + SyncSend + Clone + 'static, + // { + // self.array().blocking_reduce(self, op) + // } /// Reduces the elements of the dist iterator using the provided closure and [Schedule][crate::array::iterator::Schedule] policy /// /// This function returns a future which needs to be driven to completion to retrieve the reduced value. + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][DistIterReduceHandle::spawn] or [blocked on][DistIterReduceHandle::block] /// /// # Examples ///``` @@ -648,7 +845,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// let req = array.dist_iter().reduce_with_schedule(Schedule::Static,|acc,elem| acc+elem); /// let sum = array.block_on(req); //wait on the collect request to get the new array ///``` - #[must_use] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] fn reduce_with_schedule(&self, sched: Schedule, op: F) -> DistIterReduceHandle where // &'static Self: LocalIterator + 'static, @@ -658,27 +855,51 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { self.array().reduce_with_schedule(sched, self, op) } - /// Reduces the elements of the dist iterator using the provided closure and [Schedule][crate::array::iterator::Schedule] policy - /// - /// This function returns the reduced value. - /// - /// # Examples - ///``` - /// use lamellar::array::prelude::*; - /// - /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - /// - /// let req = array.dist_iter().blocking_reduce_with_schedule(Schedule::Static,|acc,elem| acc+elem);//wait on the collect request to get the new array - ///``` - fn blocking_reduce_with_schedule(&self, sched: Schedule, op: F) -> Option - where - // &'static Self: LocalIterator + 'static, - Self::Item: Dist + ArrayOps, - F: Fn(Self::Item, Self::Item) -> Self::Item + SyncSend + Clone + 'static, - { - self.array().blocking_reduce_with_schedule(sched, self, op) - } + // /// Reduces the elements of the dist iterator using the provided closure and [Schedule][crate::array::iterator::Schedule] policy + // /// + // /// This function returns a future which needs to be driven to completion to retrieve the reduced value. + // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// + // /// let world = LamellarWorldBuilder::new().build(); + // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + // /// + // /// let req = array.dist_iter().reduce_with_schedule(Schedule::Static,|acc,elem| acc+elem); + // /// let sum = array.block_on(req); //wait on the collect request to get the new array + // ///``` + // #[must_use = "this iteration adapter is lazy and does nothing unless awaited. 
Either await the returned future, or call 'spawn()' or 'block()' on it "] + // fn spawn_reduce_with_schedule(&self, sched: Schedule, op: F) -> DistIterReduceHandle + // where + // // &'static Self: LocalIterator + 'static, + // Self::Item: Dist + ArrayOps, + // F: Fn(Self::Item, Self::Item) -> Self::Item + SyncSend + Clone + 'static, + // { + // self.array().reduce_with_schedule(sched, self, op) + // } + + // /// Reduces the elements of the dist iterator using the provided closure and [Schedule][crate::array::iterator::Schedule] policy + // /// + // /// This function returns the reduced value. + // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// + // /// let world = LamellarWorldBuilder::new().build(); + // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + // /// + // /// let req = array.dist_iter().blocking_reduce_with_schedule(Schedule::Static,|acc,elem| acc+elem);//wait on the collect request to get the new array + // ///``` + // fn blocking_reduce_with_schedule(&self, sched: Schedule, op: F) -> Option + // where + // // &'static Self: LocalIterator + 'static, + // Self::Item: Dist + ArrayOps, + // F: Fn(Self::Item, Self::Item) -> Self::Item + SyncSend + Clone + 'static, + // { + // self.array().blocking_reduce_with_schedule(sched, self, op) + // } /// Collects the elements of the distributed iterator into a new LamellarArray /// @@ -692,7 +913,8 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// distribute data across the PEs as evenly as possible. /// /// This call utilizes the [Schedule::Static][crate::array::iterator::Schedule] policy. - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][DistIterCollectHandle::spawn] or [blocked on][DistIterCollectHandle::block] /// # Examples ///``` /// use lamellar::array::prelude::*; @@ -706,7 +928,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// .collect::>(Distribution::Block); /// let new_array = array.block_on(req); //wait on the collect request to get the new array ///``` - #[must_use] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] fn collect(&self, d: Distribution) -> DistIterCollectHandle where // &'static Self: DistributedIterator + 'static, @@ -716,38 +938,38 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { self.array().collect(self, d) } - /// Collects the elements of the distributed iterator into a new LamellarArray - /// - /// Calling this function invokes an implicit barrier across all PEs in the Array. - /// - /// This function returns the new LamellarArray upon completion. - /// - /// Creating the new array potentially results in data transfers depending on the distribution mode and the fact there is no gaurantee - /// that each PE will contribute an equal number of elements to the new array, and currently LamellarArrays - /// distribute data across the PEs as evenly as possible. - /// - /// This call utilizes the [Schedule::Static][crate::array::iterator::Schedule] policy. 
- /// - /// # Examples - ///``` - /// use lamellar::array::prelude::*; - /// - /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - /// - /// let new_array = array.dist_iter() - /// .map(|elem| *elem) //because of constraints of collect we need to convert from &usize to usize - /// .filter(|elem| *elem < 10) // (if we didnt do the previous map we would have needed to do **elem) - /// .blocking_collect::>(Distribution::Block); - ///``` - fn blocking_collect(&self, d: Distribution) -> A - where - // &'static Self: DistributedIterator + 'static, - Self::Item: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - self.array().blocking_collect(self, d) - } + // /// Collects the elements of the distributed iterator into a new LamellarArray + // /// + // /// Calling this function invokes an implicit barrier across all PEs in the Array. + // /// + // /// This function returns the new LamellarArray upon completion. + // /// + // /// Creating the new array potentially results in data transfers depending on the distribution mode and the fact there is no gaurantee + // /// that each PE will contribute an equal number of elements to the new array, and currently LamellarArrays + // /// distribute data across the PEs as evenly as possible. + // /// + // /// This call utilizes the [Schedule::Static][crate::array::iterator::Schedule] policy. + // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// + // /// let world = LamellarWorldBuilder::new().build(); + // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + // /// + // /// let new_array = array.dist_iter() + // /// .map(|elem| *elem) //because of constraints of collect we need to convert from &usize to usize + // /// .filter(|elem| *elem < 10) // (if we didnt do the previous map we would have needed to do **elem) + // /// .blocking_collect::>(Distribution::Block); + // ///``` + // fn blocking_collect(&self, d: Distribution) -> A + // where + // // &'static Self: DistributedIterator + 'static, + // Self::Item: Dist + ArrayOps, + // A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, + // { + // self.array().blocking_collect(self, d) + // } /// Collects the elements of the distributed iterator into a new LamellarArray, using the provided [Schedule][crate::array::iterator::Schedule] policy /// @@ -759,7 +981,8 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// Creating the new array potentially results in data transfers depending on the distribution mode and the fact there is no gaurantee /// that each PE will contribute an equal number of elements to the new array, and currently LamellarArrays /// distribute data across the PEs as evenly as possible. - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][DistIterCollectHandle::spawn] or [blocked on][DistIterCollectHandle::block] /// # Examples ///``` /// use lamellar::array::prelude::*; @@ -773,7 +996,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// .collect::>(Distribution::Block); /// let new_array = array.block_on(req); //wait on the collect request to get the new array ///``` - #[must_use] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. 
Either await the returned future, or call 'spawn()' or 'block()' on it "] fn collect_with_schedule(&self,sched: Schedule, d: Distribution) -> DistIterCollectHandle where // &'static Self: DistributedIterator + 'static, @@ -783,37 +1006,37 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { self.array().collect_with_schedule(sched,self, d) } - /// Collects the elements of the distributed iterator into a new LamellarArray, using the provided [Schedule][crate::array::iterator::Schedule] policy - /// - /// Calling this function invokes an implicit barrier across all PEs in the Array. - /// - /// This function returns the new LamellarArray upon completion. - /// - /// Creating the new array potentially results in data transfers depending on the distribution mode and the fact there is no gaurantee - /// that each PE will contribute an equal number of elements to the new array, and currently LamellarArrays - /// distribute data across the PEs as evenly as possible. - /// - /// - /// # Examples - ///``` - /// use lamellar::array::prelude::*; - /// - /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - /// - /// let new_array = array.dist_iter() - /// .map(|elem| *elem) //because of constraints of collect we need to convert from &usize to usize - /// .filter(|elem| *elem < 10) // (if we didnt do the previous map we would have needed to do **elem) - /// .blocking_collect_with_scheduler::>(Schedule::Dynamic, Distribution::Block); - ///``` - fn blocking_collect_with_schedule(&self,sched: Schedule, d: Distribution) -> A - where - // &'static Self: DistributedIterator + 'static, - Self::Item: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - self.array().blocking_collect_with_schedule(sched,self, d) - } + // /// Collects the elements of the distributed iterator into a new LamellarArray, using the provided [Schedule][crate::array::iterator::Schedule] policy + // /// + // /// Calling this function invokes an implicit barrier across all PEs in the Array. + // /// + // /// This function returns the new LamellarArray upon completion. + // /// + // /// Creating the new array potentially results in data transfers depending on the distribution mode and the fact there is no gaurantee + // /// that each PE will contribute an equal number of elements to the new array, and currently LamellarArrays + // /// distribute data across the PEs as evenly as possible. 
+ // /// + // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// + // /// let world = LamellarWorldBuilder::new().build(); + // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + // /// + // /// let new_array = array.dist_iter() + // /// .map(|elem| *elem) //because of constraints of collect we need to convert from &usize to usize + // /// .filter(|elem| *elem < 10) // (if we didnt do the previous map we would have needed to do **elem) + // /// .blocking_collect_with_scheduler::>(Schedule::Dynamic, Distribution::Block); + // ///``` + // fn blocking_collect_with_schedule(&self,sched: Schedule, d: Distribution) -> A + // where + // // &'static Self: DistributedIterator + 'static, + // Self::Item: Dist + ArrayOps, + // A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, + // { + // self.array().blocking_collect_with_schedule(sched,self, d) + // } /// Collects the awaited elements of the distributed iterator into a new LamellarArray /// @@ -829,7 +1052,8 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// Creating the new array potentially results in data transfers depending on the distribution mode and the fact there is no gaurantee /// that each PE will contribute an equal number of elements to the new array, and currently LamellarArrays /// distribute data across the PEs as evenly as possible. - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][DistIterCollectHandle::spawn] or [blocked on][DistIterCollectHandle::block] /// # Examples ///``` /// use lamellar::array::prelude::*; @@ -851,7 +1075,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// .collect_async::,_>(Distribution::Cyclic); /// let _new_array = array.block_on(req); ///``` - #[must_use] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] fn collect_async(&self, d: Distribution) -> DistIterCollectHandle where // &'static Self: DistributedIterator + 'static, @@ -862,49 +1086,49 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { self.array().collect_async(self, d) } - /// Collects the awaited elements of the distributed iterator into a new LamellarArray - /// - /// Calling this function invokes an implicit barrier across all PEs in the Array. - /// - /// Each element from the iterator must return a Future - /// - /// Each thread will only drive a single future at a time. - /// - /// The function returns the new LamellarArray upon completion. - /// - /// Creating the new array potentially results in data transfers depending on the distribution mode and the fact there is no gaurantee - /// that each PE will contribute an equal number of elements to the new array, and currently LamellarArrays - /// distribute data across the PEs as evenly as possible. 
- /// - /// # Examples - ///``` - /// use lamellar::array::prelude::*; - /// // initialize a world and an atomic array - /// let world = LamellarWorldBuilder::new().build(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); - /// - /// // clone the array; this doesn't duplicate the underlying - /// // data but it does create a second pointer that we can - /// // discard when necessary - /// let array_clone = array.clone(); - /// - /// // run collect - /// let _new_array - /// = array_clone.dist_iter().map( - /// move |elem| - /// array_clone - /// .fetch_add(elem.load(),1000)) - /// .blocking_collect_async::,_>(Distribution::Cyclic); - ///``` - fn blocking_collect_async(&self, d: Distribution) -> A - where - // &'static Self: DistributedIterator + 'static, - T: Dist + ArrayOps, - Self::Item: Future + Send + 'static, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - self.array().blocking_collect_async(self, d) - } + // /// Collects the awaited elements of the distributed iterator into a new LamellarArray + // /// + // /// Calling this function invokes an implicit barrier across all PEs in the Array. + // /// + // /// Each element from the iterator must return a Future + // /// + // /// Each thread will only drive a single future at a time. + // /// + // /// The function returns the new LamellarArray upon completion. + // /// + // /// Creating the new array potentially results in data transfers depending on the distribution mode and the fact there is no gaurantee + // /// that each PE will contribute an equal number of elements to the new array, and currently LamellarArrays + // /// distribute data across the PEs as evenly as possible. + // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// // initialize a world and an atomic array + // /// let world = LamellarWorldBuilder::new().build(); + // /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); + // /// + // /// // clone the array; this doesn't duplicate the underlying + // /// // data but it does create a second pointer that we can + // /// // discard when necessary + // /// let array_clone = array.clone(); + // /// + // /// // run collect + // /// let _new_array + // /// = array_clone.dist_iter().map( + // /// move |elem| + // /// array_clone + // /// .fetch_add(elem.load(),1000)) + // /// .blocking_collect_async::,_>(Distribution::Cyclic); + // ///``` + // fn blocking_collect_async(&self, d: Distribution) -> A + // where + // // &'static Self: DistributedIterator + 'static, + // T: Dist + ArrayOps, + // Self::Item: Future + Send + 'static, + // A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, + // { + // self.array().blocking_collect_async(self, d) + // } /// Collects the awaited elements of the distributed iterator into a new LamellarArray, using the provided [Schedule][crate::array::iterator::Schedule] policy /// @@ -920,7 +1144,8 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// Creating the new array potentially results in data transfers depending on the distribution mode and the fact there is no gaurantee /// that each PE will contribute an equal number of elements to the new array, and currently LamellarArrays /// distribute data across the PEs as evenly as possible. 
- /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][DistIterCollectHandle::spawn] or [blocked on][DistIterCollectHandle::block] /// # Examples ///``` /// use lamellar::array::prelude::*; @@ -942,7 +1167,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// .collect_async_with_schedule::,_>(Scheduler::Dynamic, Distribution::Cyclic); /// let _new_array = array.block_on(req); ///``` - #[must_use] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] fn collect_async_with_schedule(&self, sched: Schedule, d: Distribution) -> DistIterCollectHandle where // &'static Self: DistributedIterator + 'static, @@ -953,56 +1178,57 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { self.array().collect_async_with_schedule(sched, self, d) } - /// Collects the awaited elements of the distributed iterator into a new LamellarArray,using the provided [Schedule][crate::array::iterator::Schedule] policy - /// - /// Calling this function invokes an implicit barrier across all PEs in the Array. - /// - /// Each element from the iterator must return a Future - /// - /// Each thread will only drive a single future at a time. - /// - /// The function returns the new LamellarArray upon completion. - /// - /// Creating the new array potentially results in data transfers depending on the distribution mode and the fact there is no gaurantee - /// that each PE will contribute an equal number of elements to the new array, and currently LamellarArrays - /// distribute data across the PEs as evenly as possible. - /// - /// # Examples - ///``` - /// use lamellar::array::prelude::*; - /// // initialize a world and an atomic array - /// let world = LamellarWorldBuilder::new().build(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); - /// - /// // clone the array; this doesn't duplicate the underlying - /// // data but it does create a second pointer that we can - /// // discard when necessary - /// let array_clone = array.clone(); - /// - /// // run collect - /// let _new_array - /// = array_clone.dist_iter().map( - /// move |elem| - /// array_clone - /// .fetch_add(elem.load(),1000)) - /// .blocking_collect_async::,_>(Distribution::Cyclic); - ///``` - fn blocking_collect_async_with_schedule(&self, sched: Schedule, d: Distribution) -> A - where - // &'static Self: DistributedIterator + 'static, - T: Dist + ArrayOps, - Self::Item: Future + Send + 'static, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - self.array().blocking_collect_async_with_schedule(sched,self, d) - } + // /// Collects the awaited elements of the distributed iterator into a new LamellarArray,using the provided [Schedule][crate::array::iterator::Schedule] policy + // /// + // /// Calling this function invokes an implicit barrier across all PEs in the Array. + // /// + // /// Each element from the iterator must return a Future + // /// + // /// Each thread will only drive a single future at a time. + // /// + // /// The function returns the new LamellarArray upon completion. + // /// + // /// Creating the new array potentially results in data transfers depending on the distribution mode and the fact there is no gaurantee + // /// that each PE will contribute an equal number of elements to the new array, and currently LamellarArrays + // /// distribute data across the PEs as evenly as possible. 
+ // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// // initialize a world and an atomic array + // /// let world = LamellarWorldBuilder::new().build(); + // /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); + // /// + // /// // clone the array; this doesn't duplicate the underlying + // /// // data but it does create a second pointer that we can + // /// // discard when necessary + // /// let array_clone = array.clone(); + // /// + // /// // run collect + // /// let _new_array + // /// = array_clone.dist_iter().map( + // /// move |elem| + // /// array_clone + // /// .fetch_add(elem.load(),1000)) + // /// .blocking_collect_async::,_>(Distribution::Cyclic); + // ///``` + // fn blocking_collect_async_with_schedule(&self, sched: Schedule, d: Distribution) -> A + // where + // // &'static Self: DistributedIterator + 'static, + // T: Dist + ArrayOps, + // Self::Item: Future + Send + 'static, + // A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, + // { + // self.array().blocking_collect_async_with_schedule(sched,self, d) + // } /// Counts the number of the elements of the distriubted iterator /// /// Calling this function invokes an implicit barrier and distributed reduction across all PEs in the Array. /// /// This function returns a future which needs to be driven to completion to retrieve count. - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][DistIterCountHandle::spawn] or [blocked on][DistIterCountHandle::block] /// # Examples ///``` /// use lamellar::array::prelude::*; @@ -1013,36 +1239,37 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// let req = array.dist_iter().filter(|elem| elem < 10).count(); /// let cnt = array.block_on(req); //wait on the collect request to get the new array ///``` - #[must_use] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] fn count(&self) -> DistIterCountHandle { self.array().count(self) } - /// Counts the number of the elements of the distributed iterator - /// - /// Calling this function invokes an implicit barrier and distributed reduction across all PEs in the Array. - /// - /// This function returns the count upon completion. - /// - /// # Examples - ///``` - /// use lamellar::array::prelude::*; - /// - /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - /// - /// let cnt = array.dist_iter().filter(|elem| elem < 10).blocking_count(); - ///``` - fn blocking_count(&self) -> usize { - self.array().blocking_count(self) - } + // /// Counts the number of the elements of the distributed iterator + // /// + // /// Calling this function invokes an implicit barrier and distributed reduction across all PEs in the Array. + // /// + // /// This function returns the count upon completion. 
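
A usage sketch (not part of the patch) of the replacement pattern for the commented-out `blocking_count`: build the lazy handle and drive it explicitly. `DistIterCountHandle::block` is the method added in the consumer changes later in this patch; the surrounding setup is assumed and mirrors the doc examples:

```rust
use lamellar::array::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();
    let array: ReadOnlyArray<usize> = ReadOnlyArray::new(&world, 100, Distribution::Block);

    // previously: let cnt = array.dist_iter().blocking_count();
    // now: `count()` is lazy, so block on the returned handle
    let cnt = array.dist_iter().count().block();
    println!("count: {cnt}");
}
```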
+ // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// + // /// let world = LamellarWorldBuilder::new().build(); + // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + // /// + // /// let cnt = array.dist_iter().filter(|elem| elem < 10).blocking_count(); + // ///``` + // fn blocking_count(&self) -> usize { + // self.array().blocking_count(self) + // } /// Counts the number of the elements of the distriubted iterator, using the provided [Schedule][crate::array::iterator::Schedule] policy /// /// Calling this function invokes an implicit barrier and distributed reduction across all PEs in the Array. /// /// This function returns a future which needs to be driven to completion to retrieve count. - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][DistIterCountHandle::spawn] or [blocked on][DistIterCountHandle::block] /// # Examples ///``` /// use lamellar::array::prelude::*; @@ -1058,24 +1285,24 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { } - /// Counts the number of the elements of the distributed iterator, using the provided [Schedule][crate::array::iterator::Schedule] policy - /// - /// Calling this function invokes an implicit barrier and distributed reduction across all PEs in the Array. - /// - /// This function returns the count upon completion. - /// - /// # Examples - ///``` - /// use lamellar::array::prelude::*; - /// - /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - /// - /// let cnt = array.dist_iter().filter(|elem| elem < 10).blocking_count_with_schedule(Schedule::Dynamic); - ///``` - fn blocking_count_with_schedule(&self, sched: Schedule) -> usize { - self.array().blocking_count_with_schedule(sched, self) - } + // /// Counts the number of the elements of the distributed iterator, using the provided [Schedule][crate::array::iterator::Schedule] policy + // /// + // /// Calling this function invokes an implicit barrier and distributed reduction across all PEs in the Array. + // /// + // /// This function returns the count upon completion. + // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// + // /// let world = LamellarWorldBuilder::new().build(); + // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + // /// + // /// let cnt = array.dist_iter().filter(|elem| elem < 10).blocking_count_with_schedule(Schedule::Dynamic); + // ///``` + // fn blocking_count_with_schedule(&self, sched: Schedule) -> usize { + // self.array().blocking_count_with_schedule(sched, self) + // } /// Sums the elements of the distributed iterator. /// @@ -1086,7 +1313,8 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// An empty iterator returns the zero value of the type. 
/// /// This function returns a future which needs to be driven to completion to retrieve the sum - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][DistIterSumHandle::spawn] or [blocked on][DistIterSumHandle::block] /// # Examples ///``` /// use lamellar::array::prelude::*; @@ -1097,7 +1325,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// let req = array.dist_iter().sum(); /// let sum = array.block_on(req); //wait on the collect request to get the new array ///``` - #[must_use] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] fn sum(&self) -> DistIterSumHandle where Self::Item: Dist + ArrayOps + std::iter::Sum, @@ -1105,31 +1333,31 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { self.array().sum(self) } - /// Sums the elements of the distributed iterator. - /// - /// Takes each element, adds them together, and returns the result. - /// - /// Calling this function invokes an implicit barrier and distributed reduction across all PEs in the Array. - /// - /// An empty iterator returns the zero value of the type. - /// - /// This function returns the sum upon completion. - /// - /// # Examples - ///``` - /// use lamellar::array::prelude::*; - /// - /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - /// - /// let sum = array.dist_iter().blocking_sum(); - ///``` - fn blocking_sum(&self) -> Self::Item - where - Self::Item: Dist + ArrayOps + std::iter::Sum, - { - self.array().blocking_sum(self) - } + // /// Sums the elements of the distributed iterator. + // /// + // /// Takes each element, adds them together, and returns the result. + // /// + // /// Calling this function invokes an implicit barrier and distributed reduction across all PEs in the Array. + // /// + // /// An empty iterator returns the zero value of the type. + // /// + // /// This function returns the sum upon completion. + // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// + // /// let world = LamellarWorldBuilder::new().build(); + // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + // /// + // /// let sum = array.dist_iter().blocking_sum(); + // ///``` + // fn blocking_sum(&self) -> Self::Item + // where + // Self::Item: Dist + ArrayOps + std::iter::Sum, + // { + // self.array().blocking_sum(self) + // } /// Sums the elements of the distributed iterator, using the specified [Schedule][crate::array::iterator::Schedule] policy /// @@ -1140,7 +1368,8 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// An empty iterator returns the zero value of the type. /// /// This function returns a future which needs to be driven to completion to retrieve the sum - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][DistIterSumHandle::spawn] or [blocked on][DistIterSumHandle::block] /// # Examples ///``` /// use lamellar::array::prelude::*; @@ -1151,7 +1380,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// let req = array.dist_iter().sum_with_schedule(Schedule::Guided); /// let sum = array.block_on(req); //wait on the collect request to get the new array ///``` - #[must_use] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. 
Either await the returned future, or call 'spawn()' or 'block()' on it "] fn sum_with_schedule(&self, sched: Schedule) -> DistIterSumHandle where Self::Item: Dist + ArrayOps + std::iter::Sum, @@ -1159,31 +1388,31 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { self.array().sum_with_schedule(sched, self) } - /// Sums the elements of the distributed iterator, using the specified [Schedule][crate::array::iterator::Schedule] policy - /// - /// Takes each element, adds them together, and returns the result. - /// - /// Calling this function invokes an implicit barrier and distributed reduction across all PEs in the Array. - /// - /// An empty iterator returns the zero value of the type. - /// - /// This function returns the sum upon completion. - /// - /// # Examples - ///``` - /// use lamellar::array::prelude::*; - /// - /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - /// - /// let sum = array.dist_iter().blocking_sum_with_schedule(Schedule::Guided); - ///``` - fn blocking_sum_with_schedule(&self, sched: Schedule) -> Self::Item - where - Self::Item: Dist + ArrayOps + std::iter::Sum, - { - self.array().blocking_sum_with_schedule(sched, self) - } + // /// Sums the elements of the distributed iterator, using the specified [Schedule][crate::array::iterator::Schedule] policy + // /// + // /// Takes each element, adds them together, and returns the result. + // /// + // /// Calling this function invokes an implicit barrier and distributed reduction across all PEs in the Array. + // /// + // /// An empty iterator returns the zero value of the type. + // /// + // /// This function returns the sum upon completion. + // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// + // /// let world = LamellarWorldBuilder::new().build(); + // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + // /// + // /// let sum = array.dist_iter().blocking_sum_with_schedule(Schedule::Guided); + // ///``` + // fn blocking_sum_with_schedule(&self, sched: Schedule) -> Self::Item + // where + // Self::Item: Dist + ArrayOps + std::iter::Sum, + // { + // self.array().blocking_sum_with_schedule(sched, self) + // } } /// An interface for dealing with distributed iterators which are indexable, meaning it returns an iterator of known length diff --git a/src/array/iterator/distributed_iterator/consumer/collect.rs b/src/array/iterator/distributed_iterator/consumer/collect.rs index 4eecdc56..549b7c23 100644 --- a/src/array/iterator/distributed_iterator/consumer/collect.rs +++ b/src/array/iterator/distributed_iterator/consumer/collect.rs @@ -10,6 +10,7 @@ use crate::lamellar_request::LamellarRequest; use crate::lamellar_task_group::TaskGroupLocalAmHandle; use crate::lamellar_team::LamellarTeamRT; use crate::memregion::Dist; +use crate::scheduler::LamellarTask; use core::marker::PhantomData; use futures_util::{ready, Future}; @@ -290,6 +291,12 @@ where state: State::Barrier(barrier_handle, inner), } } + pub fn block(self) -> A { + self.team.clone().block_on(self) + } + pub fn spawn(self) -> LamellarTask { + self.team.clone().scheduler.spawn_task(self) + } } #[pin_project(project = StateProj)] diff --git a/src/array/iterator/distributed_iterator/consumer/count.rs b/src/array/iterator/distributed_iterator/consumer/count.rs index 7d0a3a78..a383d6fa 100644 --- a/src/array/iterator/distributed_iterator/consumer/count.rs +++ 
b/src/array/iterator/distributed_iterator/consumer/count.rs @@ -9,6 +9,7 @@ use crate::darc::DarcMode; use crate::lamellar_request::LamellarRequest; use crate::lamellar_task_group::TaskGroupLocalAmHandle; use crate::lamellar_team::LamellarTeamRT; +use crate::scheduler::LamellarTask; use crate::Darc; use async_trait::async_trait; @@ -211,6 +212,12 @@ impl DistIterCountHandle { state: State::Barrier(barrier_handle, inner), } } + pub fn block(self) -> usize { + self.team.clone().block_on(self) + } + pub fn spawn(self) -> LamellarTask { + self.team.clone().scheduler.spawn_task(self) + } } #[pin_project(project = StateProj)] diff --git a/src/array/iterator/distributed_iterator/consumer/for_each.rs b/src/array/iterator/distributed_iterator/consumer/for_each.rs index 955c88fc..9c3b9e6e 100644 --- a/src/array/iterator/distributed_iterator/consumer/for_each.rs +++ b/src/array/iterator/distributed_iterator/consumer/for_each.rs @@ -7,6 +7,7 @@ use crate::barrier::BarrierHandle; use crate::lamellar_request::LamellarRequest; use crate::lamellar_task_group::TaskGroupLocalAmHandle; use crate::lamellar_team::LamellarTeamRT; +use crate::scheduler::LamellarTask; use futures_util::{ready, Future}; use pin_project::pin_project; @@ -214,6 +215,12 @@ impl DistIterForEachHandle { state: State::Barrier(barrier, reqs), } } + pub fn block(self) { + self.team.clone().block_on(self); + } + pub fn spawn(self) -> LamellarTask<()> { + self.team.clone().scheduler.spawn_task(self) + } } #[pin_project(project = StateProj)] @@ -224,6 +231,7 @@ enum State { ), Reqs(#[pin] InnerDistIterForEachHandle), } + impl Future for DistIterForEachHandle { type Output = (); fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { diff --git a/src/array/iterator/distributed_iterator/consumer/reduce.rs b/src/array/iterator/distributed_iterator/consumer/reduce.rs index 77dbd99c..7065481f 100644 --- a/src/array/iterator/distributed_iterator/consumer/reduce.rs +++ b/src/array/iterator/distributed_iterator/consumer/reduce.rs @@ -9,6 +9,7 @@ use crate::barrier::BarrierHandle; use crate::lamellar_request::LamellarRequest; use crate::lamellar_task_group::TaskGroupLocalAmHandle; use crate::lamellar_team::LamellarTeamRT; +use crate::scheduler::LamellarTask; use crate::Dist; use futures_util::{ready, Future, StreamExt}; @@ -315,16 +316,15 @@ where #[pin_project] pub struct DistIterReduceHandle { - // pub(crate) reqs: VecDeque>, team: Pin>, #[pin] state: State, } impl DistIterReduceHandle -// where -// T: Dist + Send + ArrayOps, -// F: Fn(T, T) -> T + SyncSend + Clone + 'static, +where + T: Dist + ArrayOps, + F: Fn(T, T) -> T + SyncSend + Clone + 'static, { pub(crate) fn new( barrier: BarrierHandle, @@ -336,6 +336,13 @@ impl DistIterReduceHandle state: State::Barrier(barrier, reqs), } } + + pub fn block(self) -> Option { + self.team.clone().block_on(self) + } + pub fn spawn(self) -> LamellarTask> { + self.team.clone().scheduler.spawn_task(self) + } } #[pin_project(project = StateProj)] diff --git a/src/array/iterator/distributed_iterator/consumer/sum.rs b/src/array/iterator/distributed_iterator/consumer/sum.rs index 0c39ca4b..d235c5f1 100644 --- a/src/array/iterator/distributed_iterator/consumer/sum.rs +++ b/src/array/iterator/distributed_iterator/consumer/sum.rs @@ -8,6 +8,7 @@ use crate::barrier::BarrierHandle; use crate::lamellar_request::LamellarRequest; use crate::lamellar_task_group::TaskGroupLocalAmHandle; use crate::lamellar_team::LamellarTeamRT; +use crate::scheduler::LamellarTask; use crate::Dist; use futures_util::{ready, Future}; use 
pin_project::pin_project; @@ -214,6 +215,13 @@ where state: State::Barrier(barrier_handle, inner), } } + + pub fn block(self) -> T { + self.team.clone().block_on(self) + } + pub fn spawn(self) -> LamellarTask { + self.team.clone().scheduler.spawn_task(self) + } } #[pin_project(project = StateProj)] diff --git a/src/array/iterator/local_iterator.rs b/src/array/iterator/local_iterator.rs index e09ba9c7..126d1e75 100644 --- a/src/array/iterator/local_iterator.rs +++ b/src/array/iterator/local_iterator.rs @@ -71,26 +71,26 @@ macro_rules! consumer_impl { self.as_inner().[<$name _with_schedule>](sched, $($arg),*) } - fn []<$($generics),*>( - &self, - $($arg : $arg_ty),* - ) $(-> $($blocking_ret)*)? - where - $($bounds)+ - { - self.as_inner().[]($($arg),*) - } - - fn []<$($generics),*>( - &self, - sched: Schedule, - $($arg : $arg_ty),* - ) $(-> $($blocking_ret)*)? - where - $($bounds)+ - { - self.as_inner().[](sched, $($arg),*) - } + // fn []<$($generics),*>( + // &self, + // $($arg : $arg_ty),* + // ) $(-> $($blocking_ret)*)? + // where + // $($bounds)+ + // { + // self.as_inner().[]($($arg),*) + // } + + // fn []<$($generics),*>( + // &self, + // sched: Schedule, + // $($arg : $arg_ty),* + // ) $(-> $($blocking_ret)*)? + // where + // $($bounds)+ + // { + // self.as_inner().[](sched, $($arg),*) + // } } }; } @@ -343,7 +343,8 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// /// This function returns a future which can be used to poll for completion of the iteration. /// Note calling this function launches the iteration regardless of if the returned future is used or not. - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][LocalIterForEachHandle::spawn] or [blocked on][LocalIterForEachHandle::block] /// # Examples ///``` /// use lamellar::array::prelude::*; @@ -357,7 +358,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// .for_each(move |elem| println!("{:?} {elem}",std::thread::current().id())) /// ); ///``` - #[must_use] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] fn for_each(&self, op: F) -> LocalIterForEachHandle where F: Fn(Self::Item) + SyncSend + Clone + 'static, @@ -365,37 +366,38 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { self.array().for_each(self, op) } - /// Calls a closure on each element of a Local Iterator in parallel on the calling PE (the PE must have some local data of the array). - /// - /// This call utilizes the [Schedule::Static][crate::array::iterator::Schedule] policy. - /// - /// The iteration will be complete upon return from this function - /// - /// # Examples - ///``` - /// use lamellar::array::prelude::*; - /// - /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - /// - /// - /// array - /// .local_iter() - /// .blocking_for_each(move |elem| println!("{:?} {elem}",std::thread::current().id())); - /// - ///``` - fn blocking_for_each(&self, op: F) - where - F: Fn(Self::Item) + SyncSend + Clone + 'static, - { - self.array().blocking_for_each(self, op) - } + // /// Calls a closure on each element of a Local Iterator in parallel on the calling PE (the PE must have some local data of the array). + // /// + // /// This call utilizes the [Schedule::Static][crate::array::iterator::Schedule] policy. 
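
A sketch of the two ways the new handle-based `for_each` is expected to be driven. The `spawn()`/`block()` calls mirror the methods this patch adds to the distributed-iterator consumer handles just above; their presence on `LocalIterForEachHandle` is assumed from the doc links added in this hunk, and the array setup follows the doc examples:

```rust
use lamellar::array::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();
    let array: ReadOnlyArray<usize> = ReadOnlyArray::new(&world, 100, Distribution::Block);

    // option 1: launch the iteration as a background task and synchronize later
    let _task = array
        .local_iter()
        .for_each(|elem| println!("{:?} {elem}", std::thread::current().id()))
        .spawn();
    array.wait_all();

    // option 2: block the calling thread until the iteration completes
    // (replaces the removed `blocking_for_each`)
    array
        .local_iter()
        .for_each(|elem| println!("{:?} {elem}", std::thread::current().id()))
        .block();
}
```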
+ // /// + // /// The iteration will be complete upon return from this function + // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// + // /// let world = LamellarWorldBuilder::new().build(); + // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + // /// + // /// + // /// array + // /// .local_iter() + // /// .blocking_for_each(move |elem| println!("{:?} {elem}",std::thread::current().id())); + // /// + // ///``` + // fn blocking_for_each(&self, op: F) + // where + // F: Fn(Self::Item) + SyncSend + Clone + 'static, + // { + // self.array().blocking_for_each(self, op) + // } /// Calls a closure on each element of a Local Iterator in parallel on the calling PE (the PE must have some local data of the array) using the specififed [Scehedule][crate::array::iterator::Schedule] policy. /// /// This function returns a future which can be used to poll for completion of the iteration. /// Note calling this function launches the iteration regardless of if the returned future is used or not. - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][LocalIterForEachHandle::spawn] or [blocked on][LocalIterForEachHandle::block] /// # Examples ///``` /// use lamellar::array::prelude::*; @@ -406,7 +408,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// array.local_iter().for_each_with_schedule(Schedule::WorkStealing, |elem| println!("{:?} {elem}",std::thread::current().id())); /// array.wait_all(); ///``` - #[must_use] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] fn for_each_with_schedule(&self, sched: Schedule, op: F) -> LocalIterForEachHandle where F: Fn(Self::Item) + SyncSend + Clone + 'static, @@ -414,25 +416,25 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { self.array().for_each_with_schedule(sched, self, op) } - /// Calls a closure on each element of a Local Iterator in parallel on the calling PE (the PE must have some local data of the array) using the specififed [Scehedule][crate::array::iterator::Schedule] policy. - /// - /// The iteration will be complete upon return from this function - /// - /// # Examples - ///``` - /// use lamellar::array::prelude::*; - /// - /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - /// - /// array.local_iter().blocking_for_each_with_schedule(Schedule::WorkStealing, |elem| println!("{:?} {elem}",std::thread::current().id())); - ///``` - fn blocking_for_each_with_schedule(&self, sched: Schedule, op: F) - where - F: Fn(Self::Item) + SyncSend + Clone + 'static, - { - self.array().blocking_for_each_with_schedule(sched, self, op) - } + // /// Calls a closure on each element of a Local Iterator in parallel on the calling PE (the PE must have some local data of the array) using the specififed [Scehedule][crate::array::iterator::Schedule] policy. 
+ // /// + // /// The iteration will be complete upon return from this function + // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// + // /// let world = LamellarWorldBuilder::new().build(); + // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + // /// + // /// array.local_iter().blocking_for_each_with_schedule(Schedule::WorkStealing, |elem| println!("{:?} {elem}",std::thread::current().id())); + // ///``` + // fn blocking_for_each_with_schedule(&self, sched: Schedule, op: F) + // where + // F: Fn(Self::Item) + SyncSend + Clone + 'static, + // { + // self.array().blocking_for_each_with_schedule(sched, self, op) + // } /// Calls a closure and immediately awaits the result on each element of a Local Iterator in parallel on the calling PE (the PE must have some local data of the array). /// @@ -444,7 +446,8 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// /// This function returns a future which can be used to poll for completion of the iteration. /// Note calling this function launches the iteration regardless of if the returned future is used or not. - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][LocalIterForEachHandle::spawn] or [blocked on][LocalIterForEachHandle::block] /// # Examples ///``` /// use lamellar::array::prelude::*; @@ -464,7 +467,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// fut.await; /// } ///``` - #[must_use] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] fn for_each_async(&self, op: F) -> LocalIterForEachHandle where F: Fn(Self::Item) -> Fut + SyncSend + Clone + 'static, @@ -473,41 +476,41 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { self.array().for_each_async(self, op) } - /// Calls a closure and immediately awaits the result on each element of a Local Iterator in parallel on the calling PE (the PE must have some local data of the array). - /// - /// This call utilizes the [Schedule::Static][crate::array::iterator::Schedule] policy. - /// - /// The supplied closure must return a future. - /// - /// Each thread will only drive a single future at a time. - /// - /// The iteration will have been completed by the time this function returns - /// - /// # Examples - ///``` - /// use lamellar::array::prelude::*; - /// - /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - /// - /// array.local_iter().blocking_for_each_async(|elem| async move { - /// async_std::task::yield_now().await; - /// println!("{:?} {elem}",std::thread::current().id()) - /// }); - /// ``` - /// essentially the for_each_async call gets converted into (on each thread) - ///```ignore - /// for fut in array.iter(){ - /// fut.await; - /// } - ///``` - fn blocking_for_each_async(&self, op: F) - where - F: Fn(Self::Item) -> Fut + SyncSend + Clone + 'static, - Fut: Future + Send + 'static, - { - self.array().blocking_for_each_async(self, op) - } + // /// Calls a closure and immediately awaits the result on each element of a Local Iterator in parallel on the calling PE (the PE must have some local data of the array). + // /// + // /// This call utilizes the [Schedule::Static][crate::array::iterator::Schedule] policy. + // /// + // /// The supplied closure must return a future. 
+ // /// + // /// Each thread will only drive a single future at a time. + // /// + // /// The iteration will have been completed by the time this function returns + // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// + // /// let world = LamellarWorldBuilder::new().build(); + // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + // /// + // /// array.local_iter().blocking_for_each_async(|elem| async move { + // /// async_std::task::yield_now().await; + // /// println!("{:?} {elem}",std::thread::current().id()) + // /// }); + // /// ``` + // /// essentially the for_each_async call gets converted into (on each thread) + // ///```ignore + // /// for fut in array.iter(){ + // /// fut.await; + // /// } + // ///``` + // fn blocking_for_each_async(&self, op: F) + // where + // F: Fn(Self::Item) -> Fut + SyncSend + Clone + 'static, + // Fut: Future + Send + 'static, + // { + // self.array().blocking_for_each_async(self, op) + // } /// Calls a closure on each element of a Local Iterator in parallel on the calling PE (the PE must have some local data of the array) using the specififed [Schedule][crate::array::iterator::Schedule] policy. @@ -518,7 +521,8 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// /// This function returns a future which can be used to poll for completion of the iteration. /// Note calling this function launches the iteration regardless of if the returned future is used or not. - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][LocalIterForEachHandle::spawn] or [blocked on][LocalIterForEachHandle::block] /// # Examples ///``` /// use lamellar::array::prelude::*; @@ -532,7 +536,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// }); /// array.wait_all(); ///``` - #[must_use] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] fn for_each_async_with_schedule(&self, sched: Schedule, op: F) -> LocalIterForEachHandle where F: Fn(Self::Item) -> Fut + SyncSend + Clone + 'static, @@ -541,38 +545,39 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { self.array().for_each_async_with_schedule(sched, self, op) } - /// Calls a closure on each element of a Local Iterator in parallel on the calling PE (the PE must have some local data of the array) using the specififed [Schedule][crate::array::iterator::Schedule] policy. - /// - /// The supplied closure must return a future. - /// - /// Each thread will only drive a single future at a time. 
- /// - /// The iteration will have been completed by the time this function returns - /// - /// # Examples - ///``` - /// use lamellar::array::prelude::*; - /// - /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - /// - /// array.local_iter().blocking_for_each_async_with_schedule(Schedule::Chunk(10),|elem| async move { - /// async_std::task::yield_now().await; - /// println!("{:?} {elem}",std::thread::current().id()) - /// }); - ///``` - fn blocking_for_each_async_with_schedule(&self, sched: Schedule, op: F) - where - F: Fn(Self::Item) -> Fut + SyncSend + Clone + 'static, - Fut: Future + Send + 'static, - { - self.array().blocking_for_each_async_with_schedule(sched, self, op) - } + // /// Calls a closure on each element of a Local Iterator in parallel on the calling PE (the PE must have some local data of the array) using the specififed [Schedule][crate::array::iterator::Schedule] policy. + // /// + // /// The supplied closure must return a future. + // /// + // /// Each thread will only drive a single future at a time. + // /// + // /// The iteration will have been completed by the time this function returns + // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// + // /// let world = LamellarWorldBuilder::new().build(); + // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + // /// + // /// array.local_iter().blocking_for_each_async_with_schedule(Schedule::Chunk(10),|elem| async move { + // /// async_std::task::yield_now().await; + // /// println!("{:?} {elem}",std::thread::current().id()) + // /// }); + // ///``` + // fn blocking_for_each_async_with_schedule(&self, sched: Schedule, op: F) + // where + // F: Fn(Self::Item) -> Fut + SyncSend + Clone + 'static, + // Fut: Future + Send + 'static, + // { + // self.array().blocking_for_each_async_with_schedule(sched, self, op) + // } /// Reduces the elements of the local iterator using the provided closure /// /// This function returns a future which needs to be driven to completion to retrieve the reduced value. - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][LocalIterReduceHandle::spawn] or [blocked on][LocalIterReduceHandle::block] /// # Examples ///``` /// use lamellar::array::prelude::*; @@ -583,7 +588,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// let req = array.local_iter().reduce(|acc,elem| acc+elem); /// let sum = array.block_on(req); //wait on the collect request to get the new array ///``` - #[must_use] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. 
Either await the returned future, or call 'spawn()' or 'block()' on it."] fn reduce(&self, op: F) -> LocalIterReduceHandle where // &'static Self: LocalIterator + 'static, @@ -593,32 +598,33 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { self.array().reduce(self, op) } - /// Reduces the elements of the local iterator using the provided closure - /// - /// This function returns the reduced value - /// - /// # Examples - ///``` - /// use lamellar::array::prelude::*; - /// - /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - /// - /// let sum = array.blocking_local_iter().reduce(|acc,elem| acc+elem); - ///``` - fn blocking_reduce(&self, op: F) -> Option - where - // &'static Self: LocalIterator + 'static, - Self::Item: SyncSend + Copy, - F: Fn(Self::Item, Self::Item) -> Self::Item + SyncSend + Clone + 'static, - { - self.array().blocking_reduce(self, op) - } + // /// Reduces the elements of the local iterator using the provided closure + // /// + // /// This function returns the reduced value + // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// + // /// let world = LamellarWorldBuilder::new().build(); + // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + // /// + // /// let sum = array.blocking_local_iter().reduce(|acc,elem| acc+elem); + // ///``` + // fn blocking_reduce(&self, op: F) -> Option + // where + // // &'static Self: LocalIterator + 'static, + // Self::Item: SyncSend + Copy, + // F: Fn(Self::Item, Self::Item) -> Self::Item + SyncSend + Clone + 'static, + // { + // self.array().blocking_reduce(self, op) + // } /// Reduces the elements of the local iterator using the provided closure and specififed [Schedule][crate::array::iterator::Schedule] policy /// /// This function returns a future which needs to be driven to completion to retrieve the reduced value. - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][LocalIterReduceHandle::spawn] or [blocked on][LocalIterReduceHandle::block] /// # Examples ///``` /// use lamellar::array::prelude::*; @@ -629,7 +635,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// let req = array.local_iter().reduce_with_schedule(Schedule::Chunk(10),|acc,elem| acc+elem); /// let sum = array.block_on(req); //wait on the collect request to get the new array ///``` - #[must_use] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. 
Either await the returned future, or call 'spawn()' or 'block()' on it."] fn reduce_with_schedule( &self, sched: Schedule, @@ -643,36 +649,37 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { self.array().reduce_with_schedule(sched, self, op) } - /// Reduces the elements of the local iterator using the provided closure and specififed [Schedule][crate::array::iterator::Schedule] policy - /// - /// This function returns the reduced value - /// - /// # Examples - ///``` - /// use lamellar::array::prelude::*; - /// - /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - /// - /// let sum = array.local_iter().blocking_reduce_with_schedule(Schedule::Chunk(10),|acc,elem| acc+elem); - ///``` - fn blocking_reduce_with_schedule( - &self, - sched: Schedule, - op: F, - ) -> Option - where - // &'static Self: LocalIterator + 'static, - Self::Item: SyncSend + Copy, - F: Fn(Self::Item, Self::Item) -> Self::Item + SyncSend + Clone + 'static, - { - self.array().blocking_reduce_with_schedule(sched, self, op) - } + // /// Reduces the elements of the local iterator using the provided closure and specififed [Schedule][crate::array::iterator::Schedule] policy + // /// + // /// This function returns the reduced value + // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// + // /// let world = LamellarWorldBuilder::new().build(); + // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + // /// + // /// let sum = array.local_iter().blocking_reduce_with_schedule(Schedule::Chunk(10),|acc,elem| acc+elem); + // ///``` + // fn blocking_reduce_with_schedule( + // &self, + // sched: Schedule, + // op: F, + // ) -> Option + // where + // // &'static Self: LocalIterator + 'static, + // Self::Item: SyncSend + Copy, + // F: Fn(Self::Item, Self::Item) -> Self::Item + SyncSend + Clone + 'static, + // { + // self.array().blocking_reduce_with_schedule(sched, self, op) + // } /// Collects the elements of the local iterator into the specified container type /// /// This function returns a future which needs to be driven to completion to retrieve the new container. - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][LocalIterCollectHandle::spawn] or [blocked on][LocalIterCollectHandle::block] /// # Examples ///``` /// use lamellar::array::prelude::*; @@ -684,7 +691,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// let req = array.local_iter().map(elem.load()).filter(|elem| elem % 2 == 0).collect::>(Distribution::Cyclic); /// let new_array = array.block_on(req); ///``` - #[must_use] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. 
Either await the returned future, or call 'spawn()' or 'block()' on it."] fn collect(&self, d: Distribution) -> LocalIterCollectHandle where // &'static Self: LocalIterator + 'static, @@ -694,10 +701,34 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { self.array().collect(self, d) } - /// Collects the elements of the local iterator into the specified container type - /// - /// This function returns the new container + // /// Collects the elements of the local iterator into the specified container type + // /// + // /// This function returns the new container + // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// + // /// let world = LamellarWorldBuilder::new().build(); + // /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); + // /// + // /// let array_clone = array.clone(); + // /// let new_array = array.local_iter().map(elem.load()).filter(|elem| elem % 2 == 0).blocking_collect::>(Distribution::Cyclic); + // ///``` + // fn blocking_collect(&self, d: Distribution) ->A + // where + // // &'static Self: LocalIterator + 'static, + // Self::Item: Dist + ArrayOps, + // A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, + // { + // self.array().blocking_collect(self, d) + // } + + /// Collects the elements of the local iterator into the specified container type using the specified [Schedule][crate::array::iterator::Schedule] policy /// + /// This function returns a future which needs to be driven to completion to retrieve the new container. + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][LocalIterCollectHandle::spawn] or [blocked on][LocalIterCollectHandle::block] /// # Examples ///``` /// use lamellar::array::prelude::*; @@ -706,73 +737,50 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); /// /// let array_clone = array.clone(); - /// let new_array = array.local_iter().map(elem.load()).filter(|elem| elem % 2 == 0).blocking_collect::>(Distribution::Cyclic); + /// let req = array.local_iter().map(elem.load()).filter(|elem| elem % 2 == 0).collect_with_schedule::>(Scheduler::WorkStealing,Distribution::Cyclic); + /// let new_array = array.block_on(req); ///``` - fn blocking_collect(&self, d: Distribution) ->A + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] + fn collect_with_schedule( + &self, + sched: Schedule, + d: Distribution, + ) -> LocalIterCollectHandle where // &'static Self: LocalIterator + 'static, Self::Item: Dist + ArrayOps, A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { - self.array().blocking_collect(self, d) + self.array().collect_with_schedule(sched, self, d) } - /// Collects the elements of the local iterator into the specified container type using the specified [Schedule][crate::array::iterator::Schedule] policy - /// - /// This function returns a future which needs to be driven to completion to retrieve the new container. 
- /// - /// # Examples - ///``` - /// use lamellar::array::prelude::*; - /// - /// let world = LamellarWorldBuilder::new().build(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); - /// - /// let array_clone = array.clone(); - /// let req = array.local_iter().map(elem.load()).filter(|elem| elem % 2 == 0).collect_with_schedule::>(Scheduler::WorkStealing,Distribution::Cyclic); - /// let new_array = array.block_on(req); - ///``` - #[must_use] - fn collect_with_schedule( - &self, - sched: Schedule, - d: Distribution, - ) -> LocalIterCollectHandle - where - // &'static Self: LocalIterator + 'static, - Self::Item: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - self.array().collect_with_schedule(sched, self, d) - } - - /// Collects the elements of the local iterator into the specified container type using the specified [Schedule][crate::array::iterator::Schedule] policy - /// - /// This function returns the new container - /// - /// # Examples - ///``` - /// use lamellar::array::prelude::*; - /// - /// let world = LamellarWorldBuilder::new().build(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); - /// - /// let array_clone = array.clone(); - /// let new_array = array.local_iter().map(elem.load()).filter(|elem| elem % 2 == 0).blocking_collect_with_schedule::>(Scheduler::WorkStealing,Distribution::Cyclic); - /// - ///`` - fn blocking_collect_with_schedule( - &self, - sched: Schedule, - d: Distribution, - ) -> A - where - // &'static Self: LocalIterator + 'static, - Self::Item: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - self.array().blocking_collect_with_schedule(sched, self, d) - } + // /// Collects the elements of the local iterator into the specified container type using the specified [Schedule][crate::array::iterator::Schedule] policy + // /// + // /// This function returns the new container + // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// + // /// let world = LamellarWorldBuilder::new().build(); + // /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); + // /// + // /// let array_clone = array.clone(); + // /// let new_array = array.local_iter().map(elem.load()).filter(|elem| elem % 2 == 0).blocking_collect_with_schedule::>(Scheduler::WorkStealing,Distribution::Cyclic); + // /// + // ///`` + // fn blocking_collect_with_schedule( + // &self, + // sched: Schedule, + // d: Distribution, + // ) -> A + // where + // // &'static Self: LocalIterator + 'static, + // Self::Item: Dist + ArrayOps, + // A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, + // { + // self.array().blocking_collect_with_schedule(sched, self, d) + // } /// Collects the awaited elements of the local iterator into a new LamellarArray /// @@ -788,7 +796,8 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// Creating the new array potentially results in data transfers depending on the distribution mode and the fact there is no gaurantee /// that each PE will contribute an equal number of elements to the new array, and currently LamellarArrays /// distribute data across the PEs as evenly as possible. 
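// A minimal sketch, assuming a `ReadOnlyArray<usize>` named `array` as in the surrounding
// doc examples; it shows the three ways to drive the lazy handles returned by these
// local-iterator consumers (block on them, spawn them, or await / block_on them):
array.local_iter().for_each(|elem| println!("{elem}")).block(); // block the calling thread until the iteration finishes
let _task = array.local_iter().count().spawn();                 // or spawn it onto the runtime as a LamellarTask
let req = array.local_iter().sum();
let _sum = array.block_on(req);                                 // or drive the handle explicitly with block_on / .await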
- /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][LocalIterCollectHandle::spawn] or [blocked on][LocalIterCollectHandle::block] /// # Examples ///``` /// use lamellar::array::prelude::*; @@ -810,7 +819,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// .collect_async::,_>(Distribution::Cyclic); /// let _new_array = array.block_on(req); ///``` - #[must_use] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] fn collect_async(&self, d: Distribution) -> LocalIterCollectHandle where // &'static Self: DistributedIterator + 'static, @@ -821,49 +830,49 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { self.array().collect_async(self, d) } - /// Collects the awaited elements of the local iterator into a new LamellarArray - /// - /// Calling this function invokes an implicit barrier across all PEs in the Array. - /// - /// Each element from the iterator must return a Future - /// - /// Each thread will only drive a single future at a time. - /// - /// The function returns the new LamellarArray upon completion. - /// - /// Creating the new array potentially results in data transfers depending on the distribution mode and the fact there is no gaurantee - /// that each PE will contribute an equal number of elements to the new array, and currently LamellarArrays - /// distribute data across the PEs as evenly as possible. - /// - /// # Examples - ///``` - /// use lamellar::array::prelude::*; - /// // initialize a world and an atomic array - /// let world = LamellarWorldBuilder::new().build(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); - /// - /// // clone the array; this doesn't duplicate the underlying - /// // data but it does create a second pointer that we can - /// // discard when necessary - /// let array_clone = array.clone(); - /// - /// // run collect - /// let _new_array - /// = array_clone.local_iter().map( - /// move |elem| - /// array_clone - /// .fetch_add(elem.load(),1000)) - /// .blocking_collect_async::,_>(Distribution::Cyclic); - ///``` - fn blocking_collect_async(&self, d: Distribution) -> A - where - // &'static Self: DistributedIterator + 'static, - T: Dist + ArrayOps, - Self::Item: Future + Send + 'static, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - self.array().blocking_collect_async(self, d) - } + // /// Collects the awaited elements of the local iterator into a new LamellarArray + // /// + // /// Calling this function invokes an implicit barrier across all PEs in the Array. + // /// + // /// Each element from the iterator must return a Future + // /// + // /// Each thread will only drive a single future at a time. + // /// + // /// The function returns the new LamellarArray upon completion. + // /// + // /// Creating the new array potentially results in data transfers depending on the distribution mode and the fact there is no gaurantee + // /// that each PE will contribute an equal number of elements to the new array, and currently LamellarArrays + // /// distribute data across the PEs as evenly as possible. 
+ // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// // initialize a world and an atomic array + // /// let world = LamellarWorldBuilder::new().build(); + // /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); + // /// + // /// // clone the array; this doesn't duplicate the underlying + // /// // data but it does create a second pointer that we can + // /// // discard when necessary + // /// let array_clone = array.clone(); + // /// + // /// // run collect + // /// let _new_array + // /// = array_clone.local_iter().map( + // /// move |elem| + // /// array_clone + // /// .fetch_add(elem.load(),1000)) + // /// .blocking_collect_async::,_>(Distribution::Cyclic); + // ///``` + // fn blocking_collect_async(&self, d: Distribution) -> A + // where + // // &'static Self: DistributedIterator + 'static, + // T: Dist + ArrayOps, + // Self::Item: Future + Send + 'static, + // A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, + // { + // self.array().blocking_collect_async(self, d) + // } /// Collects the awaited elements of the local iterator into a new LamellarArray, using the provided [Schedule][crate::array::iterator::Schedule] policy /// @@ -879,7 +888,8 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// Creating the new array potentially results in data transfers depending on the distribution mode and the fact there is no gaurantee /// that each PE will contribute an equal number of elements to the new array, and currently LamellarArrays /// distribute data across the PEs as evenly as possible. - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][LocalIterCollectHandle::spawn] or [blocked on][LocalIterCollectHandle::block] /// # Examples ///``` /// use lamellar::array::prelude::*; @@ -901,7 +911,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// .collect_async_with_schedule::,_>(Scheduler::Dynamic, Distribution::Cyclic); /// let _new_array = array.block_on(req); ///``` - #[must_use] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] fn collect_async_with_schedule(&self, sched: Schedule, d: Distribution) -> LocalIterCollectHandle where // &'static Self: DistributedIterator + 'static, @@ -912,54 +922,55 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { self.array().collect_async_with_schedule(sched, self, d) } - /// Collects the awaited elements of the local iterator into a new LamellarArray,using the provided [Schedule][crate::array::iterator::Schedule] policy - /// - /// Calling this function invokes an implicit barrier across all PEs in the Array. - /// - /// Each element from the iterator must return a Future - /// - /// Each thread will only drive a single future at a time. - /// - /// The function returns the new LamellarArray upon completion. - /// - /// Creating the new array potentially results in data transfers depending on the distribution mode and the fact there is no gaurantee - /// that each PE will contribute an equal number of elements to the new array, and currently LamellarArrays - /// distribute data across the PEs as evenly as possible. 
- /// - /// # Examples - ///``` - /// use lamellar::array::prelude::*; - /// // initialize a world and an atomic array - /// let world = LamellarWorldBuilder::new().build(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); - /// - /// // clone the array; this doesn't duplicate the underlying - /// // data but it does create a second pointer that we can - /// // discard when necessary - /// let array_clone = array.clone(); - /// - /// // run collect - /// let _new_array - /// = array_clone.local_iter().map( - /// move |elem| - /// array_clone - /// .fetch_add(elem.load(),1000)) - /// .blocking_collect_async::,_>(Distribution::Cyclic); - ///``` - fn blocking_collect_async_with_schedule(&self, sched: Schedule, d: Distribution) -> A - where - // &'static Self: DistributedIterator + 'static, - T: Dist + ArrayOps, - Self::Item: Future + Send + 'static, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - { - self.array().blocking_collect_async_with_schedule(sched,self, d) - } + // /// Collects the awaited elements of the local iterator into a new LamellarArray,using the provided [Schedule][crate::array::iterator::Schedule] policy + // /// + // /// Calling this function invokes an implicit barrier across all PEs in the Array. + // /// + // /// Each element from the iterator must return a Future + // /// + // /// Each thread will only drive a single future at a time. + // /// + // /// The function returns the new LamellarArray upon completion. + // /// + // /// Creating the new array potentially results in data transfers depending on the distribution mode and the fact there is no gaurantee + // /// that each PE will contribute an equal number of elements to the new array, and currently LamellarArrays + // /// distribute data across the PEs as evenly as possible. 
+ // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// // initialize a world and an atomic array + // /// let world = LamellarWorldBuilder::new().build(); + // /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); + // /// + // /// // clone the array; this doesn't duplicate the underlying + // /// // data but it does create a second pointer that we can + // /// // discard when necessary + // /// let array_clone = array.clone(); + // /// + // /// // run collect + // /// let _new_array + // /// = array_clone.local_iter().map( + // /// move |elem| + // /// array_clone + // /// .fetch_add(elem.load(),1000)) + // /// .blocking_collect_async::,_>(Distribution::Cyclic); + // ///``` + // fn blocking_collect_async_with_schedule(&self, sched: Schedule, d: Distribution) -> A + // where + // // &'static Self: DistributedIterator + 'static, + // T: Dist + ArrayOps, + // Self::Item: Future + Send + 'static, + // A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, + // { + // self.array().blocking_collect_async_with_schedule(sched,self, d) + // } /// Counts the number of the elements of the local iterator /// /// This function returns a future which needs to be driven to completion to retrieve the number of elements in the local iterator - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][LocalIterCountHandle::spawn] or [blocked on][LocalIterCountHandle::block] /// # Examples ///``` /// use lamellar::array::prelude::*; @@ -970,32 +981,33 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// let req = array.local_iter().count(); /// let cnt = array.block_on(req); ///``` - #[must_use] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. 
Either await the returned future, or call 'spawn()' or 'block()' on it."] fn count(&self) -> LocalIterCountHandle { self.array().count(self) } - /// Counts the number of the elements of the local iterator - /// - /// This returns the number of elements in the local iterator - /// - /// # Examples - ///``` - /// use lamellar::array::prelude::*; - /// - /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - /// - /// let cnt = array.local_iter().blocking_count(); - ///``` - fn blocking_count(&self) -> usize { - self.array().blocking_count(self) - } + // /// Counts the number of the elements of the local iterator + // /// + // /// This returns the number of elements in the local iterator + // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// + // /// let world = LamellarWorldBuilder::new().build(); + // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + // /// + // /// let cnt = array.local_iter().blocking_count(); + // ///``` + // fn blocking_count(&self) -> usize { + // self.array().blocking_count(self) + // } /// Counts the number of the elements of the local iterator using the provided [Schedule][crate::array::iterator::Schedule] policy /// /// This function returns a future which needs to be driven to completion to retrieve the number of elements in the local iterator - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][LocalIterCountHandle::spawn] or [blocked on][LocalIterCountHandle::block] /// # Examples ///``` /// use lamellar::array::prelude::*; @@ -1006,27 +1018,27 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// let req = array.local_iter().count_with_schedule(Schedule::Dynamic); /// let cnt = array.block_on(req); ///``` - #[must_use] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. 
Either await the returned future, or call 'spawn()' or 'block()' on it."] fn count_with_schedule(&self, sched: Schedule) -> LocalIterCountHandle { self.array().count_with_schedule(sched, self) } - /// Counts the number of the elements of the local iterator using the provided [Schedule][crate::array::iterator::Schedule] policy - /// - /// This returns the number of elements in the local iterator - /// - /// # Examples - ///``` - /// use lamellar::array::prelude::*; - /// - /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - /// - /// let cnt = array.local_iter().blocking_count_with_schedule(Schedule::Dynamic); - ///``` - fn blocking_count_with_schedule(&self, sched: Schedule) -> usize { - self.array().blocking_count_with_schedule(sched, self) - } + // /// Counts the number of the elements of the local iterator using the provided [Schedule][crate::array::iterator::Schedule] policy + // /// + // /// This returns the number of elements in the local iterator + // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// + // /// let world = LamellarWorldBuilder::new().build(); + // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + // /// + // /// let cnt = array.local_iter().blocking_count_with_schedule(Schedule::Dynamic); + // ///``` + // fn blocking_count_with_schedule(&self, sched: Schedule) -> usize { + // self.array().blocking_count_with_schedule(sched, self) + // } /// Sums the elements of the local iterator. @@ -1036,7 +1048,8 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// An empty iterator returns the zero value of the type. /// /// This function returns a future which needs to be driven to completion to retrieve the sum - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][LocalIterSumHandle::spawn] or [blocked on][LocalIterSumHandle::block] /// # Examples ///``` /// use lamellar::array::prelude::*; @@ -1047,7 +1060,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// let req = array.local_iter().sum(); /// let sum = array.block_on(req); //wait on the collect request to get the new array ///``` - #[must_use] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] fn sum(&self) -> LocalIterSumHandle where Self::Item: SyncSend + std::iter::Sum + for<'a> std::iter::Sum<&'a Self::Item>, @@ -1055,29 +1068,29 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { self.array().sum(self) } - /// Sums the elements of the local iterator. - /// - /// Takes each element, adds them together, and returns the result. - /// - /// An empty iterator returns the zero value of the type. - /// - /// This function the sum upon completion. - /// - /// # Examples - ///``` - /// use lamellar::array::prelude::*; - /// - /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - /// - /// let req = array.local_iter().blocking_sum(); - ///``` - fn blocking_sum(&self) -> Self::Item - where - Self::Item: SyncSend + std::iter::Sum + for<'a> std::iter::Sum<&'a Self::Item>, - { - self.array().blocking_sum(self) - } + // /// Sums the elements of the local iterator. + // /// + // /// Takes each element, adds them together, and returns the result. 
+ // /// + // /// An empty iterator returns the zero value of the type. + // /// + // /// This function the sum upon completion. + // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// + // /// let world = LamellarWorldBuilder::new().build(); + // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + // /// + // /// let req = array.local_iter().blocking_sum(); + // ///``` + // fn blocking_sum(&self) -> Self::Item + // where + // Self::Item: SyncSend + std::iter::Sum + for<'a> std::iter::Sum<&'a Self::Item>, + // { + // self.array().blocking_sum(self) + // } /// Sums the elements of the local iterator, using the specified [Schedule][crate::array::iterator::Schedule] policy /// @@ -1086,7 +1099,8 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// An empty iterator returns the zero value of the type. /// /// This function returns a future which needs to be driven to completion to retrieve the sum - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][LocalIterSumHandle::spawn] or [blocked on][LocalIterSumHandle::block] /// # Examples ///``` /// use lamellar::array::prelude::*; @@ -1097,7 +1111,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// let req = array.local_iter().sum_with_schedule(Schedule::Guided); /// let sum = array.block_on(req); ///``` - #[must_use] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] fn sum_with_schedule(&self, sched: Schedule) -> LocalIterSumHandle where Self::Item: SyncSend + std::iter::Sum + for<'a> std::iter::Sum<&'a Self::Item>, @@ -1105,29 +1119,29 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { self.array().sum_with_schedule(sched, self) } - /// Sums the elements of the local iterator, using the specified [Schedule][crate::array::iterator::Schedule] policy - /// - /// Takes each element, adds them together, and returns the result. - /// - /// An empty iterator returns the zero value of the type. - /// - /// This function returns the sum upon completion. - /// - /// # Examples - ///``` - /// use lamellar::array::prelude::*; - /// - /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - /// - /// let sum = array.local_iter().blocking_sum_with_schedule(Schedule::Guided); - ///``` - fn blocking_sum_with_schedule(&self, sched: Schedule) -> Self::Item - where - Self::Item: SyncSend + std::iter::Sum + for<'a> std::iter::Sum<&'a Self::Item>, - { - self.array().blocking_sum_with_schedule(sched, self) - } + // /// Sums the elements of the local iterator, using the specified [Schedule][crate::array::iterator::Schedule] policy + // /// + // /// Takes each element, adds them together, and returns the result. + // /// + // /// An empty iterator returns the zero value of the type. + // /// + // /// This function returns the sum upon completion. 
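// A rough migration sketch, again assuming a `ReadOnlyArray<usize>` named `array` as in
// the doc examples: the removed blocking forms map onto the new lazy handles as follows.
let _sum = array.local_iter().sum().block();                      // stands in for the old blocking_sum()
let req = array.local_iter().sum_with_schedule(Schedule::Guided);
let _sum2 = array.block_on(req);                                  // stands in for blocking_sum_with_schedule(Schedule::Guided)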
+ // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// + // /// let world = LamellarWorldBuilder::new().build(); + // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + // /// + // /// let sum = array.local_iter().blocking_sum_with_schedule(Schedule::Guided); + // ///``` + // fn blocking_sum_with_schedule(&self, sched: Schedule) -> Self::Item + // where + // Self::Item: SyncSend + std::iter::Sum + for<'a> std::iter::Sum<&'a Self::Item>, + // { + // self.array().blocking_sum_with_schedule(sched, self) + // } } /// An interface for dealing with local iterators which are indexable, meaning it returns an iterator of known length diff --git a/src/array/iterator/local_iterator/consumer/collect.rs b/src/array/iterator/local_iterator/consumer/collect.rs index 0c0e35da..6ca03f95 100644 --- a/src/array/iterator/local_iterator/consumer/collect.rs +++ b/src/array/iterator/local_iterator/consumer/collect.rs @@ -9,6 +9,7 @@ use crate::lamellar_request::LamellarRequest; use crate::lamellar_task_group::TaskGroupLocalAmHandle; use crate::lamellar_team::LamellarTeamRT; use crate::memregion::Dist; +use crate::scheduler::LamellarTask; use core::marker::PhantomData; use futures_util::{ready, Future}; @@ -284,6 +285,13 @@ where state: State::Init(inner), } } + + pub fn block(self) -> A { + self.team.clone().block_on(self) + } + pub fn spawn(self) -> LamellarTask { + self.team.clone().scheduler.spawn_task(self) + } } #[pin_project(project = StateProj)] diff --git a/src/array/iterator/local_iterator/consumer/count.rs b/src/array/iterator/local_iterator/consumer/count.rs index cde28ba8..2fe94ca2 100644 --- a/src/array/iterator/local_iterator/consumer/count.rs +++ b/src/array/iterator/local_iterator/consumer/count.rs @@ -5,6 +5,7 @@ use crate::array::r#unsafe::private::UnsafeArrayInner; use crate::lamellar_request::LamellarRequest; use crate::lamellar_task_group::TaskGroupLocalAmHandle; use crate::lamellar_team::LamellarTeamRT; +use crate::scheduler::LamellarTask; use futures_util::{ready, Future}; use pin_project::pin_project; @@ -137,6 +138,13 @@ impl LocalIterCountHandle { state: State::Init(inner), } } + + pub fn block(self) -> usize { + self.team.clone().block_on(self) + } + pub fn spawn(self) -> LamellarTask { + self.team.clone().scheduler.spawn_task(self) + } } #[pin_project(project = StateProj)] diff --git a/src/array/iterator/local_iterator/consumer/for_each.rs b/src/array/iterator/local_iterator/consumer/for_each.rs index 3fbfd437..c99dc7a5 100644 --- a/src/array/iterator/local_iterator/consumer/for_each.rs +++ b/src/array/iterator/local_iterator/consumer/for_each.rs @@ -6,6 +6,7 @@ use crate::array::r#unsafe::private::UnsafeArrayInner; use crate::lamellar_request::LamellarRequest; use crate::lamellar_task_group::TaskGroupLocalAmHandle; use crate::lamellar_team::LamellarTeamRT; +use crate::scheduler::LamellarTask; use futures_util::{ready, Future}; use pin_project::pin_project; @@ -216,6 +217,13 @@ impl LocalIterForEachHandle { state: State::Init(reqs), } } + + pub fn block(self) { + self.team.clone().block_on(self); + } + pub fn spawn(self) -> LamellarTask<()> { + self.team.clone().scheduler.spawn_task(self) + } } #[pin_project(project = StateProj)] diff --git a/src/array/iterator/local_iterator/consumer/reduce.rs b/src/array/iterator/local_iterator/consumer/reduce.rs index dd44db36..dcc53bd2 100644 --- a/src/array/iterator/local_iterator/consumer/reduce.rs +++ b/src/array/iterator/local_iterator/consumer/reduce.rs @@ -6,6 +6,7 @@ use 
crate::array::r#unsafe::private::UnsafeArrayInner; use crate::lamellar_request::LamellarRequest; use crate::lamellar_task_group::TaskGroupLocalAmHandle; use crate::lamellar_team::LamellarTeamRT; +use crate::scheduler::LamellarTask; use futures_util::{ready, Future}; use pin_project::pin_project; @@ -150,7 +151,11 @@ pub struct LocalIterReduceHandle { state: State, } -impl LocalIterReduceHandle { +impl LocalIterReduceHandle +where + T: SyncSend + Copy + 'static, + F: Fn(T, T) -> T + SyncSend + Clone + 'static, +{ pub(crate) fn new( reqs: Pin> + Send>>, array: &UnsafeArrayInner, @@ -160,6 +165,13 @@ impl LocalIterReduceHandle { state: State::Init(reqs), } } + + pub fn block(self) -> Option { + self.team.clone().block_on(self) + } + pub fn spawn(self) -> LamellarTask> { + self.team.clone().scheduler.spawn_task(self) + } } #[pin_project(project = StateProj)] diff --git a/src/array/iterator/local_iterator/consumer/sum.rs b/src/array/iterator/local_iterator/consumer/sum.rs index d85d924c..2d7e0a76 100644 --- a/src/array/iterator/local_iterator/consumer/sum.rs +++ b/src/array/iterator/local_iterator/consumer/sum.rs @@ -6,6 +6,7 @@ use crate::array::r#unsafe::private::UnsafeArrayInner; use crate::lamellar_request::LamellarRequest; use crate::lamellar_task_group::TaskGroupLocalAmHandle; use crate::lamellar_team::LamellarTeamRT; +use crate::scheduler::LamellarTask; use futures_util::{ready, Future}; use pin_project::pin_project; @@ -140,7 +141,10 @@ pub struct LocalIterSumHandle { state: State, } -impl LocalIterSumHandle { +impl LocalIterSumHandle +where + T: SyncSend + std::iter::Sum + for<'a> std::iter::Sum<&'a T> + 'static, +{ pub(crate) fn new( inner: Pin> + Send>>, array: &UnsafeArrayInner, @@ -150,6 +154,13 @@ impl LocalIterSumHandle { state: State::Init(inner), } } + + pub fn block(self) -> T { + self.team.clone().block_on(self) + } + pub fn spawn(self) -> LamellarTask { + self.team.clone().scheduler.spawn_task(self) + } } #[pin_project(project = StateProj)] diff --git a/src/array/unsafe/iteration/distributed.rs b/src/array/unsafe/iteration/distributed.rs index 4e8e06f8..732b9672 100644 --- a/src/array/unsafe/iteration/distributed.rs +++ b/src/array/unsafe/iteration/distributed.rs @@ -3,9 +3,6 @@ use crate::array::iterator::distributed_iterator::*; use crate::array::iterator::private::Sealed; use crate::array::r#unsafe::{UnsafeArray, UnsafeArrayInner}; use crate::array::{ArrayOps, AsyncTeamFrom, Distribution, InnerArray}; -use crate::lamellar_request::LamellarRequest; -use crate::env_var::config; - use crate::array::iterator::Schedule; use crate::lamellar_team::LamellarTeamRT; use crate::memregion::Dist; @@ -31,7 +28,7 @@ impl InnerArray for UnsafeArrayInner { impl DistIteratorLauncher for UnsafeArray {} macro_rules! consumer_impl { - ($name:ident<$($generics:ident),*>($($arg:ident : $arg_ty:ty),*); [$return_type:ident$(<$($ret_gen:ty),*>)?]; [$($bounds:tt)+]; [$($am:tt)*]; [$(-> $($blocking_ret:tt)*)?] ) => { + ($name:ident<$($generics:ident),*>($($arg:ident : $arg_ty:ty),*); [$return_type:ident$(<$($ret_gen:ty),*>)?]; [$($bounds:tt)+]; [$($am:tt)*]; [ $($blocking_ret:tt)*] ) => { paste! { fn $name<$($generics),*>(&self, $($arg : $arg_ty),*) -> $return_type$(<$($ret_gen),*>)? where @@ -63,49 +60,69 @@ macro_rules! consumer_impl { $return_type::new(barrier,reqs_future,self) } - fn []<$($generics),*>(&self, $($arg : $arg_ty),*) $(-> $($blocking_ret)*)? 
- where - $($bounds)+ - { - - self.[](Schedule::Static, $($arg),*) - } - - - fn []<$($generics),*>( - &self, - sched: Schedule, - $($arg : $arg_ty),* - ) $(-> $($blocking_ret)*)? - where - $($bounds)+ - { - if std::thread::current().id() != *crate::MAIN_THREAD { - let name = stringify!{$name}; - let msg = format!(" - [LAMELLAR WARNING] You are calling `blocking_{name}[_with_schedule]` from within an async context which may lead to deadlock, it is recommended that you use `{name}[_with_schedule]().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() - ); - if let Some(val) = config().blocking_call_warning { - if val { - println!("{msg}"); - } - } else { - println!("{msg}"); - } - } - let am = $($am)*; - self.data.team.barrier.tasking_barrier(); - let inner = self.clone(); - let reqs = match sched { - Schedule::Static => inner.sched_static(am), - Schedule::Dynamic => inner.sched_dynamic(am), - Schedule::Chunk(size) => inner.sched_chunk(am,size), - Schedule::Guided => inner.sched_guided(am), - Schedule::WorkStealing => inner.sched_work_stealing(am), - }; - reqs.blocking_wait() - } + // fn []<$($generics),*>(&self, $($arg : $arg_ty),*) -> LamellarTask<$($blocking_ret)*> + // where + // $($bounds)+ + // { + + // self.[](Schedule::Static, $($arg),*) + // } + + + // fn []<$($generics),*>( + // &self, + // sched: Schedule, + // $($arg : $arg_ty),* + // ) -> LamellarTask<$($blocking_ret)*> + // where + // $($bounds)+ + // { + // self.data.team.scheduler.spawn_task(self.[<$name _with_schedule>](sched, $($arg),*)) + // } + + // fn []<$($generics),*>(&self, $($arg : $arg_ty),*) -> $($blocking_ret)* + // where + // $($bounds)+ + // { + + // self.[](Schedule::Static, $($arg),*) + // } + + + // fn []<$($generics),*>( + // &self, + // sched: Schedule, + // $($arg : $arg_ty),* + // ) -> $($blocking_ret)* + // where + // $($bounds)+ + // { + // if std::thread::current().id() != *crate::MAIN_THREAD { + // let name = stringify!{$name}; + // let msg = format!(" + // [LAMELLAR WARNING] You are calling `blocking_{name}[_with_schedule]` from within an async context which may lead to deadlock, it is recommended that you use `{name}[_with_schedule]().await;` instead! + // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + // ); + // if let Some(val) = config().blocking_call_warning { + // if val { + // println!("{msg}"); + // } + // } else { + // println!("{msg}"); + // } + // } + // let am = $($am)*; + // self.data.team.barrier.tasking_barrier(); + // let inner = self.clone(); + // let reqs = match sched { + // Schedule::Static => inner.sched_static(am), + // Schedule::Dynamic => inner.sched_dynamic(am), + // Schedule::Chunk(size) => inner.sched_chunk(am,size), + // Schedule::Guided => inner.sched_guided(am), + // Schedule::WorkStealing => inner.sched_work_stealing(am), + // }; + // reqs.blocking_wait() + // } } }; } @@ -128,7 +145,7 @@ impl DistIteratorLauncher for UnsafeArrayInner { Some(self.subarray_index_from_local(index * chunk_size)? 
/ chunk_size) } } - + consumer_impl!( for_each(iter: &I, op: F); [DistIterForEachHandle]; @@ -139,7 +156,7 @@ impl DistIteratorLauncher for UnsafeArrayInner { op, } ]; - []); + [()]); consumer_impl!( for_each_async(iter: &I, op: F); @@ -151,7 +168,7 @@ impl DistIteratorLauncher for UnsafeArrayInner { op, } ]; - [] + [()] ); consumer_impl!( @@ -164,7 +181,7 @@ impl DistIteratorLauncher for UnsafeArrayInner { op, } ]; - [-> Option]); + [Option]); consumer_impl!( collect( iter: &I, d: Distribution); @@ -177,7 +194,7 @@ impl DistIteratorLauncher for UnsafeArrayInner { _phantom: PhantomData, } ]; - [-> A]); + [A]); consumer_impl!( collect_async( iter: &I, d: Distribution); [DistIterCollectHandle]; @@ -189,7 +206,7 @@ impl DistIteratorLauncher for UnsafeArrayInner { _phantom: PhantomData, } ]; - [-> A]); + [A]); consumer_impl!( count( iter: &I); @@ -200,7 +217,7 @@ impl DistIteratorLauncher for UnsafeArrayInner { iter: iter.iter_clone(Sealed), } ]; - [-> usize]); + [usize]); consumer_impl!( sum(iter: &I); @@ -211,7 +228,7 @@ impl DistIteratorLauncher for UnsafeArrayInner { iter: iter.iter_clone(Sealed), } ]; - [-> I::Item]); + [I::Item]); fn team(&self) -> Pin> { self.data.team.clone() diff --git a/src/array/unsafe/iteration/local.rs b/src/array/unsafe/iteration/local.rs index 8e5ed072..7ef57f5f 100644 --- a/src/array/unsafe/iteration/local.rs +++ b/src/array/unsafe/iteration/local.rs @@ -3,8 +3,6 @@ use crate::array::iterator::local_iterator::*; use crate::array::iterator::private::*; use crate::array::r#unsafe::{UnsafeArray, UnsafeArrayInner}; use crate::array::{ArrayOps, AsyncTeamFrom, Distribution}; -use crate::lamellar_request::LamellarRequest; -use crate::env_var::config; use crate::array::iterator::Schedule; use crate::lamellar_team::LamellarTeamRT; @@ -50,48 +48,48 @@ macro_rules! consumer_impl { $return_type::new(reqs_future,self) } - fn []<$($generics),*>(&self, $($arg : $arg_ty),*) $(-> $($blocking_ret)*)? - where - $($bounds)+ - { - - self.[](Schedule::Static, $($arg),*) - } - - - fn []<$($generics),*>( - &self, - sched: Schedule, - $($arg : $arg_ty),* - ) $(-> $($blocking_ret)*)? - where - $($bounds)+ - { - if std::thread::current().id() != *crate::MAIN_THREAD { - let name = stringify!{$name}; - let msg = format!(" - [LAMELLAR WARNING] You are calling `blocking_{name}[_with_schedule]` from within an async context which may lead to deadlock, it is recommended that you use `{name}[_with_schedule]().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() - ); - if let Some(val) = config().blocking_call_warning { - if val { - println!("{msg}"); - } - } else { - println!("{msg}"); - } - } - let am = $($am)*; - let inner = self.clone(); - let reqs = match sched { - Schedule::Static => inner.sched_static(am), - Schedule::Dynamic => inner.sched_dynamic(am), - Schedule::Chunk(size) => inner.sched_chunk(am,size), - Schedule::Guided => inner.sched_guided(am), - Schedule::WorkStealing => inner.sched_work_stealing(am), - }; - reqs.blocking_wait() - } + // fn []<$($generics),*>(&self, $($arg : $arg_ty),*) $(-> $($blocking_ret)*)? + // where + // $($bounds)+ + // { + + // self.[](Schedule::Static, $($arg),*) + // } + + + // fn []<$($generics),*>( + // &self, + // sched: Schedule, + // $($arg : $arg_ty),* + // ) $(-> $($blocking_ret)*)? 
+ // where + // $($bounds)+ + // { + // if std::thread::current().id() != *crate::MAIN_THREAD { + // let name = stringify!{$name}; + // let msg = format!(" + // [LAMELLAR WARNING] You are calling `blocking_{name}[_with_schedule]` from within an async context which may lead to deadlock, it is recommended that you use `{name}[_with_schedule]().await;` instead! + // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + // ); + // if let Some(val) = config().blocking_call_warning { + // if val { + // println!("{msg}"); + // } + // } else { + // println!("{msg}"); + // } + // } + // let am = $($am)*; + // let inner = self.clone(); + // let reqs = match sched { + // Schedule::Static => inner.sched_static(am), + // Schedule::Dynamic => inner.sched_dynamic(am), + // Schedule::Chunk(size) => inner.sched_chunk(am,size), + // Schedule::Guided => inner.sched_guided(am), + // Schedule::WorkStealing => inner.sched_work_stealing(am), + // }; + // reqs.blocking_wait() + // } } }; } diff --git a/src/scheduler.rs b/src/scheduler.rs index d6cec689..e9111120 100755 --- a/src/scheduler.rs +++ b/src/scheduler.rs @@ -8,7 +8,7 @@ use crate::lamellae::{Des, Lamellae, SerializedData}; use enum_dispatch::enum_dispatch; use futures_util::Future; -use pin_project::{pin_project, pinned_drop}; +use pin_project::pin_project; use std::pin::{pin, Pin}; use std::sync::atomic::{AtomicU8, AtomicUsize, Ordering}; use std::sync::Arc; @@ -113,7 +113,6 @@ pub(crate) enum LamellarTaskInner { AsyncStdTask(async_std::task::JoinHandle), #[cfg(feature = "tokio-executor")] TokioTask(tokio::task::JoinHandle), - Dropped, } impl Drop for LamellarTaskInner { @@ -128,7 +127,6 @@ impl Drop for LamellarTaskInner { LamellarTaskInner::AsyncStdTask(_task) => {} #[cfg(feature = "tokio-executor")] LamellarTaskInner::TokioTask(task) => {} - LamellarTaskInner::Dropped => {} } } } @@ -148,7 +146,6 @@ impl Future for LamellarTaskInner { LamellarTaskInner::AsyncStdTask(task) => Pin::new_unchecked(task).poll(cx), #[cfg(feature = "tokio-executor")] LamellarTaskInner::TokioTask(task) => Pin::new_unchecked(task).poll(cx), - LamellarTaskInner::Dropped => unreachable!(), } } } diff --git a/tests/array/arithmetic_ops/add_test.rs b/tests/array/arithmetic_ops/add_test.rs index 96f7bfe7..ab6f102d 100644 --- a/tests/array/arithmetic_ops/add_test.rs +++ b/tests/array/arithmetic_ops/add_test.rs @@ -10,24 +10,28 @@ macro_rules! initialize_array { unsafe { $array .dist_iter_mut() - .blocking_for_each(move |x| *x = $init_val) + .for_each(move |x| *x = $init_val) + .block() }; }; (AtomicArray,$array:ident,$init_val:ident) => { $array .dist_iter() - .blocking_for_each(move |x| x.store($init_val)); + .for_each(move |x| x.store($init_val)) + .block(); // println!("----------------------------------------------"); }; (LocalLockArray,$array:ident,$init_val:ident) => { $array .dist_iter_mut() - .blocking_for_each(move |x| *x = $init_val); + .for_each(move |x| *x = $init_val) + .block(); }; (GlobalLockArray,$array:ident,$init_val:ident) => { $array .dist_iter_mut() - .blocking_for_each(move |x| *x = $init_val); + .for_each(move |x| *x = $init_val) + .block(); }; } @@ -332,16 +336,15 @@ macro_rules! 
input_test{ let _ = input_array.dist_iter_mut().enumerate().for_each(move |(i,x)| { // println!("i: {:?}",i); *x = i%array_total_len} - ); + ).block(); } else{ let _ = input_array.dist_iter_mut().enumerate().for_each(move |(i,x)| { //println!("i: {:?}",i); *x = i/num_pes} - ); + ).block(); } } - input_array.wait_all(); input_array.barrier(); input_array.print(); //individual T------------------------------ diff --git a/tests/array/arithmetic_ops/div_test.rs b/tests/array/arithmetic_ops/div_test.rs index da85201c..d3250255 100644 --- a/tests/array/arithmetic_ops/div_test.rs +++ b/tests/array/arithmetic_ops/div_test.rs @@ -2,24 +2,32 @@ use lamellar::array::prelude::*; macro_rules! initialize_array { (UnsafeArray,$array:ident,$init_val:ident) => { unsafe { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); } - $array.wait_all(); $array.barrier(); }; (AtomicArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter().for_each(move |x| x.store($init_val)); - $array.wait_all(); + $array + .dist_iter() + .for_each(move |x| x.store($init_val)) + .block(); $array.barrier(); }; (LocalLockArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); - $array.wait_all(); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); $array.barrier(); }; (GlobalLockArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); - $array.wait_all(); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); $array.barrier(); }; } @@ -58,7 +66,7 @@ macro_rules! max_updates { }; } -macro_rules! onesided_iter{ +macro_rules! onesided_iter { (GlobalLockArray,$array:ident) => { $array.blocking_read_lock().onesided_iter() }; diff --git a/tests/array/arithmetic_ops/fetch_add_test.rs b/tests/array/arithmetic_ops/fetch_add_test.rs index 04ec3ce8..489c7026 100644 --- a/tests/array/arithmetic_ops/fetch_add_test.rs +++ b/tests/array/arithmetic_ops/fetch_add_test.rs @@ -8,26 +8,34 @@ use rand::distributions::Uniform; macro_rules! initialize_array { (UnsafeArray,$array:ident,$init_val:ident) => { - let _ = unsafe { $array.dist_iter_mut().for_each(move |x| *x = $init_val) }; - $array.wait_all(); + unsafe { + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block() + }; $array.barrier(); }; (AtomicArray,$array:ident,$init_val:ident) => { - let _ = $array + $array .dist_iter() .enumerate() - .for_each(move |(_i, x)| x.store($init_val)); - $array.wait_all(); + .for_each(move |(_i, x)| x.store($init_val)) + .block(); $array.barrier(); }; (LocalLockArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); - $array.wait_all(); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); $array.barrier(); }; (GlobalLockArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); - $array.wait_all(); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); $array.barrier(); }; } @@ -309,37 +317,40 @@ macro_rules! fetch_add_test{ macro_rules! 
initialize_array2 { (UnsafeArray,$array:ident,$init_val:ident) => { #[allow(unused_unsafe)] - let _ = unsafe { + unsafe { $array .dist_iter_mut() .enumerate() .for_each(move |(i, x)| *x = i) + .block() }; - $array.wait_all(); $array.barrier(); }; (AtomicArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter().enumerate().for_each(move |(i, x)| { - // println!("{:?} {:?}", i, x.load()); - x.store(i) - }); - $array.wait_all(); + $array + .dist_iter() + .enumerate() + .for_each(move |(i, x)| { + // println!("{:?} {:?}", i, x.load()); + x.store(i) + }) + .block(); $array.barrier(); }; (LocalLockArray,$array:ident,$init_val:ident) => { - let _ = $array + $array .dist_iter_mut() .enumerate() - .for_each(move |(i, x)| *x = i); - $array.wait_all(); + .for_each(move |(i, x)| *x = i) + .block(); $array.barrier(); }; (GlobalLockArray,$array:ident,$init_val:ident) => { - let _ = $array + $array .dist_iter_mut() .enumerate() - .for_each(move |(i, x)| *x = i); - $array.wait_all(); + .for_each(move |(i, x)| *x = i) + .block(); $array.barrier(); }; } @@ -425,14 +436,12 @@ macro_rules! input_test{ initialize_array2!($array, array, init_val); if $dist == lamellar::array::Distribution::Block{ #[allow(unused_unsafe)] - let _ = unsafe { input_array.dist_iter_mut().enumerate().for_each(move |(i,x)| {/*println!("i: {:?}",i);*/ *x = i%array_total_len})}; + unsafe { input_array.dist_iter_mut().enumerate().for_each(move |(i,x)| {/*println!("i: {:?}",i);*/ *x = i%array_total_len}).block()}; } else{ #[allow(unused_unsafe)] - let _ = unsafe { input_array.dist_iter_mut().enumerate().for_each(move |(i,x)| {/*println!("i: {:?}",i);*/ *x = i/num_pes})}; + unsafe { input_array.dist_iter_mut().enumerate().for_each(move |(i,x)| {/*println!("i: {:?}",i);*/ *x = i/num_pes}).block()}; } - - array.wait_all(); array.barrier(); //individual T------------------------------ let mut reqs = vec![]; diff --git a/tests/array/arithmetic_ops/fetch_div_test.rs b/tests/array/arithmetic_ops/fetch_div_test.rs index 57b46924..17d136ff 100644 --- a/tests/array/arithmetic_ops/fetch_div_test.rs +++ b/tests/array/arithmetic_ops/fetch_div_test.rs @@ -2,28 +2,40 @@ use lamellar::array::prelude::*; macro_rules! initialize_array { (UnsafeArray,$array:ident,$init_val:ident) => { - let _ = unsafe { $array.dist_iter_mut().for_each(move |x| *x = $init_val) }; - $array.wait_all(); + let _ = unsafe { + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block() + }; $array.barrier(); }; (AtomicArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter().for_each(move |x| x.store($init_val)); - $array.wait_all(); + $array + .dist_iter() + .for_each(move |x| x.store($init_val)) + .block(); $array.barrier(); }; (GenericAtomicArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter().for_each(move |x| x.store($init_val)); - $array.wait_all(); + $array + .dist_iter() + .for_each(move |x| x.store($init_val)) + .block(); $array.barrier(); }; (LocalLockArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); - $array.wait_all(); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); $array.barrier(); }; (GlobalLockArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); - $array.wait_all(); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); $array.barrier(); }; } @@ -84,12 +96,12 @@ macro_rules! max_updates { }; } -macro_rules! onesided_iter{ +macro_rules! 
onesided_iter { (GlobalLockArray,$array:ident) => { $array.blocking_read_lock().onesided_iter() }; ($arraytype:ident,$array:ident) => { - $array.onesided_iter() + $array.onesided_iter() }; } diff --git a/tests/array/arithmetic_ops/fetch_mul_test.rs b/tests/array/arithmetic_ops/fetch_mul_test.rs index 900c0d2f..8862733e 100644 --- a/tests/array/arithmetic_ops/fetch_mul_test.rs +++ b/tests/array/arithmetic_ops/fetch_mul_test.rs @@ -2,23 +2,33 @@ use lamellar::array::prelude::*; macro_rules! initialize_array { (UnsafeArray,$array:ident,$init_val:ident) => { - let _ = unsafe { $array.dist_iter_mut().for_each(move |x| *x = $init_val) }; - $array.wait_all(); + unsafe { + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); + }; $array.barrier(); }; (AtomicArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter().for_each(move |x| x.store($init_val)); - $array.wait_all(); + $array + .dist_iter() + .for_each(move |x| x.store($init_val)) + .block(); $array.barrier(); }; (LocalLockArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); - $array.wait_all(); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); $array.barrier(); }; (GlobalLockArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); - $array.wait_all(); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); $array.barrier(); }; } @@ -81,12 +91,12 @@ macro_rules! max_updates { }; } -macro_rules! onesided_iter{ +macro_rules! onesided_iter { (GlobalLockArray,$array:ident) => { $array.blocking_read_lock().onesided_iter() }; ($arraytype:ident,$array:ident) => { - $array.onesided_iter() + $array.onesided_iter() }; } diff --git a/tests/array/arithmetic_ops/fetch_rem_test.rs b/tests/array/arithmetic_ops/fetch_rem_test.rs index 1f0801c7..e25b819f 100644 --- a/tests/array/arithmetic_ops/fetch_rem_test.rs +++ b/tests/array/arithmetic_ops/fetch_rem_test.rs @@ -2,28 +2,40 @@ use lamellar::array::prelude::*; macro_rules! initialize_array { (UnsafeArray,$array:ident,$init_val:ident) => { - unsafe { $array.dist_iter_mut().for_each(move |x| *x = $init_val) }; - $array.wait_all(); + unsafe { + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); + }; $array.barrier(); }; (AtomicArray,$array:ident,$init_val:ident) => { - $array.dist_iter().for_each(move |x| x.store($init_val)); - $array.wait_all(); + $array + .dist_iter() + .for_each(move |x| x.store($init_val)) + .block(); $array.barrier(); }; (GenericAtomicArray,$array:ident,$init_val:ident) => { - $array.dist_iter().for_each(move |x| x.store($init_val)); - $array.wait_all(); + $array + .dist_iter() + .for_each(move |x| x.store($init_val)) + .block(); $array.barrier(); }; (LocalLockArray,$array:ident,$init_val:ident) => { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); - $array.wait_all(); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); $array.barrier(); }; (GlobalLockArray,$array:ident,$init_val:ident) => { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); - $array.wait_all(); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); $array.barrier(); }; } @@ -84,12 +96,12 @@ macro_rules! max_updates { }; } -macro_rules! onesided_iter{ +macro_rules! 
onesided_iter { (GlobalLockArray,$array:ident) => { $array.blocking_read_lock().onesided_iter() }; ($arraytype:ident,$array:ident) => { - $array.onesided_iter() + $array.onesided_iter() }; } diff --git a/tests/array/arithmetic_ops/fetch_sub_test.rs b/tests/array/arithmetic_ops/fetch_sub_test.rs index da690674..af985cf2 100644 --- a/tests/array/arithmetic_ops/fetch_sub_test.rs +++ b/tests/array/arithmetic_ops/fetch_sub_test.rs @@ -6,24 +6,32 @@ use rand::distributions::Uniform; macro_rules! initialize_array { (UnsafeArray,$array:ident,$init_val:ident) => { unsafe { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); } - $array.wait_all(); $array.barrier(); }; (AtomicArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter().for_each(move |x| x.store($init_val)); - $array.wait_all(); + $array + .dist_iter() + .for_each(move |x| x.store($init_val)) + .block(); $array.barrier(); }; (LocalLockArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); - $array.wait_all(); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); $array.barrier(); }; (GlobalLockArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); - $array.wait_all(); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); $array.barrier(); }; } @@ -78,12 +86,12 @@ macro_rules! max_updates { }; } -macro_rules! onesided_iter{ +macro_rules! onesided_iter { (GlobalLockArray,$array:ident) => { $array.blocking_read_lock().onesided_iter() }; ($arraytype:ident,$array:ident) => { - $array.onesided_iter() + $array.onesided_iter() }; } diff --git a/tests/array/arithmetic_ops/mul_test.rs b/tests/array/arithmetic_ops/mul_test.rs index 4ae33385..c6abedca 100644 --- a/tests/array/arithmetic_ops/mul_test.rs +++ b/tests/array/arithmetic_ops/mul_test.rs @@ -2,24 +2,32 @@ use lamellar::array::prelude::*; macro_rules! initialize_array { (UnsafeArray,$array:ident,$init_val:ident) => { unsafe { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); } - $array.wait_all(); $array.barrier(); }; (AtomicArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter().for_each(move |x| x.store($init_val)); - $array.wait_all(); + $array + .dist_iter() + .for_each(move |x| x.store($init_val)) + .block(); $array.barrier(); }; (LocalLockArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); - $array.wait_all(); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); $array.barrier(); }; (GlobalLockArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); - $array.wait_all(); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); $array.barrier(); }; } @@ -66,12 +74,12 @@ macro_rules! max_updates { }; } -macro_rules! onesided_iter{ +macro_rules! 
onesided_iter { (GlobalLockArray,$array:ident) => { $array.blocking_read_lock().onesided_iter() }; ($arraytype:ident,$array:ident) => { - $array.onesided_iter() + $array.onesided_iter() }; } diff --git a/tests/array/arithmetic_ops/rem_test.rs b/tests/array/arithmetic_ops/rem_test.rs index 5eb1a9b1..9d4cf3ed 100644 --- a/tests/array/arithmetic_ops/rem_test.rs +++ b/tests/array/arithmetic_ops/rem_test.rs @@ -2,24 +2,32 @@ use lamellar::array::prelude::*; macro_rules! initialize_array { (UnsafeArray,$array:ident,$init_val:ident) => { unsafe { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); } - $array.wait_all(); $array.barrier(); }; (AtomicArray,$array:ident,$init_val:ident) => { - $array.dist_iter().for_each(move |x| x.store($init_val)); - $array.wait_all(); + $array + .dist_iter() + .for_each(move |x| x.store($init_val)) + .block(); $array.barrier(); }; (LocalLockArray,$array:ident,$init_val:ident) => { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); - $array.wait_all(); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); $array.barrier(); }; (GlobalLockArray,$array:ident,$init_val:ident) => { - $array.dist_iter_mut().for_each(move |x| *x = $init_val); - $array.wait_all(); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); $array.barrier(); }; } @@ -58,12 +66,12 @@ macro_rules! max_updates { }; } -macro_rules! onesided_iter{ +macro_rules! onesided_iter { (GlobalLockArray,$array:ident) => { $array.blocking_read_lock().onesided_iter() }; ($arraytype:ident,$array:ident) => { - $array.onesided_iter() + $array.onesided_iter() }; } diff --git a/tests/array/arithmetic_ops/sub_test.rs b/tests/array/arithmetic_ops/sub_test.rs index 899d1b68..f216540f 100644 --- a/tests/array/arithmetic_ops/sub_test.rs +++ b/tests/array/arithmetic_ops/sub_test.rs @@ -5,23 +5,33 @@ use rand::distributions::Uniform; macro_rules! initialize_array { (UnsafeArray,$array:ident,$init_val:ident) => { - let _ = unsafe { $array.dist_iter_mut().for_each(move |x| *x = $init_val) }; - $array.wait_all(); + let _ = unsafe { + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block() + }; $array.barrier(); }; (AtomicArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter().for_each(move |x| x.store($init_val)); - $array.wait_all(); + $array + .dist_iter() + .for_each(move |x| x.store($init_val)) + .block(); $array.barrier(); }; (LocalLockArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); - $array.wait_all(); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); $array.barrier(); }; (GlobalLockArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); - $array.wait_all(); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); $array.barrier(); }; } @@ -60,12 +70,12 @@ macro_rules! max_updates { }; } -macro_rules! onesided_iter{ +macro_rules! 
onesided_iter { (GlobalLockArray,$array:ident) => { $array.blocking_read_lock().onesided_iter() }; ($arraytype:ident,$array:ident) => { - $array.onesided_iter() + $array.onesided_iter() }; } diff --git a/tests/array/atomic_ops/compare_exchange_test.rs b/tests/array/atomic_ops/compare_exchange_test.rs index d2173a17..278a304b 100644 --- a/tests/array/atomic_ops/compare_exchange_test.rs +++ b/tests/array/atomic_ops/compare_exchange_test.rs @@ -2,23 +2,31 @@ use lamellar::array::prelude::*; macro_rules! initialize_array { (UnsafeArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); - $array.wait_all(); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); $array.barrier(); }; (AtomicArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter().for_each(move |x| x.store($init_val)); - $array.wait_all(); + $array + .dist_iter() + .for_each(move |x| x.store($init_val)) + .block(); $array.barrier(); }; (LocalLockArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); - $array.wait_all(); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); $array.barrier(); }; (GlobalLockArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); - $array.wait_all(); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); $array.barrier(); }; } @@ -209,8 +217,6 @@ macro_rules! compare_exchange_test{ } } - - macro_rules! compare_exchange_epsilon_test{ ($array:ident, $t:ty, $len:expr, $dist:ident) =>{ { diff --git a/tests/array/atomic_ops/load_store_test.rs b/tests/array/atomic_ops/load_store_test.rs index 8d300a18..3e84dfea 100644 --- a/tests/array/atomic_ops/load_store_test.rs +++ b/tests/array/atomic_ops/load_store_test.rs @@ -2,23 +2,31 @@ use lamellar::array::prelude::*; macro_rules! initialize_array { (UnsafeArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); - $array.wait_all(); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); $array.barrier(); }; (AtomicArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter().for_each(move |x| x.store($init_val)); - $array.wait_all(); + $array + .dist_iter() + .for_each(move |x| x.store($init_val)) + .block(); $array.barrier(); }; (LocalLockArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); - $array.wait_all(); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); $array.barrier(); }; (GlobalLockArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); - $array.wait_all(); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); $array.barrier(); }; } diff --git a/tests/array/atomic_ops/swap_test.rs b/tests/array/atomic_ops/swap_test.rs index bef4220b..49b1afb9 100644 --- a/tests/array/atomic_ops/swap_test.rs +++ b/tests/array/atomic_ops/swap_test.rs @@ -2,23 +2,31 @@ use lamellar::array::prelude::*; macro_rules! 
initialize_array { (UnsafeArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); - $array.wait_all(); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); $array.barrier(); }; (AtomicArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter().for_each(move |x| x.store($init_val)); - $array.wait_all(); + $array + .dist_iter() + .for_each(move |x| x.store($init_val)) + .block(); $array.barrier(); }; (LocalLockArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); - $array.wait_all(); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); $array.barrier(); }; (GlobalLockArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); - $array.wait_all(); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); $array.barrier(); }; } @@ -47,8 +55,6 @@ macro_rules! check_val { }; } - - macro_rules! swap{ ($array:ident, $t:ty, $len:expr, $dist:ident) =>{ { diff --git a/tests/array/bitwise_ops/and_test.rs b/tests/array/bitwise_ops/and_test.rs index 68e740dc..2006fcad 100644 --- a/tests/array/bitwise_ops/and_test.rs +++ b/tests/array/bitwise_ops/and_test.rs @@ -3,24 +3,32 @@ use lamellar::array::prelude::*; macro_rules! initialize_array { (UnsafeArray,$array:ident,$init_val:ident) => { unsafe { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); } - $array.wait_all(); $array.barrier(); }; (AtomicArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter().for_each(move |x| x.store($init_val)); - $array.wait_all(); + $array + .dist_iter() + .for_each(move |x| x.store($init_val)) + .block(); $array.barrier(); }; (LocalLockArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); - $array.wait_all(); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); $array.barrier(); }; (GlobalLockArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); - $array.wait_all(); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); $array.barrier(); }; } @@ -49,7 +57,7 @@ macro_rules! check_val { }; } -macro_rules! onesided_iter{ +macro_rules! onesided_iter { (GlobalLockArray,$array:ident) => { $array.blocking_read_lock().onesided_iter() }; @@ -58,8 +66,6 @@ macro_rules! onesided_iter{ }; } - - macro_rules! and_test{ ($array:ident, $t:ty, $len:expr, $dist:ident) =>{ { diff --git a/tests/array/bitwise_ops/fetch_and_test.rs b/tests/array/bitwise_ops/fetch_and_test.rs index c41007bb..3fbb760a 100644 --- a/tests/array/bitwise_ops/fetch_and_test.rs +++ b/tests/array/bitwise_ops/fetch_and_test.rs @@ -3,24 +3,32 @@ use lamellar::array::prelude::*; macro_rules! 
initialize_array { (UnsafeArray,$array:ident,$init_val:ident) => { unsafe { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); } - $array.wait_all(); $array.barrier(); }; (AtomicArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter().for_each(move |x| x.store($init_val)); - $array.wait_all(); + $array + .dist_iter() + .for_each(move |x| x.store($init_val)) + .block(); $array.barrier(); }; (LocalLockArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); - $array.wait_all(); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); $array.barrier(); }; (GlobalLockArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); - $array.wait_all(); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); $array.barrier(); }; } @@ -49,12 +57,12 @@ macro_rules! check_val { }; } -macro_rules! onesided_iter{ +macro_rules! onesided_iter { (GlobalLockArray,$array:ident) => { $array.blocking_read_lock().onesided_iter() }; ($arraytype:ident,$array:ident) => { - $array.onesided_iter() + $array.onesided_iter() }; } diff --git a/tests/array/bitwise_ops/fetch_or_test.rs b/tests/array/bitwise_ops/fetch_or_test.rs index 183ab086..6b220433 100644 --- a/tests/array/bitwise_ops/fetch_or_test.rs +++ b/tests/array/bitwise_ops/fetch_or_test.rs @@ -3,24 +3,32 @@ use lamellar::array::prelude::*; macro_rules! initialize_array { (UnsafeArray,$array:ident,$init_val:ident) => { unsafe { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); } - $array.wait_all(); $array.barrier(); }; (AtomicArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter().for_each(move |x| x.store($init_val)); - $array.wait_all(); + $array + .dist_iter() + .for_each(move |x| x.store($init_val)) + .block(); $array.barrier(); }; (LocalLockArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); - $array.wait_all(); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); $array.barrier(); }; (GlobalLockArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); - $array.wait_all(); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); $array.barrier(); }; } @@ -49,12 +57,12 @@ macro_rules! check_val { }; } -macro_rules! onesided_iter{ +macro_rules! onesided_iter { (GlobalLockArray,$array:ident) => { $array.blocking_read_lock().onesided_iter() }; ($arraytype:ident,$array:ident) => { - $array.onesided_iter() + $array.onesided_iter() }; } diff --git a/tests/array/bitwise_ops/fetch_xor_test.rs b/tests/array/bitwise_ops/fetch_xor_test.rs index 1c5bcfb6..0f203699 100644 --- a/tests/array/bitwise_ops/fetch_xor_test.rs +++ b/tests/array/bitwise_ops/fetch_xor_test.rs @@ -3,24 +3,32 @@ use lamellar::array::prelude::*; macro_rules! 
initialize_array { (UnsafeArray,$array:ident,$init_val:ident) => { unsafe { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); } - $array.wait_all(); $array.barrier(); }; (AtomicArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter().for_each(move |x| x.store($init_val)); - $array.wait_all(); + $array + .dist_iter() + .for_each(move |x| x.store($init_val)) + .block(); $array.barrier(); }; (LocalLockArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); - $array.wait_all(); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); $array.barrier(); }; (GlobalLockArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); - $array.wait_all(); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); $array.barrier(); }; } @@ -49,12 +57,12 @@ macro_rules! check_val { }; } -macro_rules! onesided_iter{ +macro_rules! onesided_iter { (GlobalLockArray,$array:ident) => { $array.blocking_read_lock().onesided_iter() }; ($arraytype:ident,$array:ident) => { - $array.onesided_iter() + $array.onesided_iter() }; } diff --git a/tests/array/bitwise_ops/or_test.rs b/tests/array/bitwise_ops/or_test.rs index 83e19d61..37c87dad 100644 --- a/tests/array/bitwise_ops/or_test.rs +++ b/tests/array/bitwise_ops/or_test.rs @@ -3,24 +3,32 @@ use lamellar::array::prelude::*; macro_rules! initialize_array { (UnsafeArray,$array:ident,$init_val:ident) => { unsafe { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); } - $array.wait_all(); $array.barrier(); }; (AtomicArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter().for_each(move |x| x.store($init_val)); - $array.wait_all(); + $array + .dist_iter() + .for_each(move |x| x.store($init_val)) + .block(); $array.barrier(); }; (LocalLockArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); - $array.wait_all(); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); $array.barrier(); }; (GlobalLockArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); - $array.wait_all(); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); $array.barrier(); }; } @@ -49,12 +57,12 @@ macro_rules! check_val { }; } -macro_rules! onesided_iter{ +macro_rules! onesided_iter { (GlobalLockArray,$array:ident) => { $array.blocking_read_lock().onesided_iter() }; ($arraytype:ident,$array:ident) => { - $array.onesided_iter() + $array.onesided_iter() }; } diff --git a/tests/array/bitwise_ops/xor_test.rs b/tests/array/bitwise_ops/xor_test.rs index 2a10eee8..55902542 100644 --- a/tests/array/bitwise_ops/xor_test.rs +++ b/tests/array/bitwise_ops/xor_test.rs @@ -3,24 +3,30 @@ use lamellar::array::prelude::*; macro_rules! 
initialize_array { (UnsafeArray,$array:ident,$init_val:ident) => { unsafe { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); + $array.dist_iter_mut().for_each(move |x| *x = $init_val); } $array.wait_all(); $array.barrier(); }; (AtomicArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter().for_each(move |x| x.store($init_val)); - $array.wait_all(); + $array + .dist_iter() + .for_each(move |x| x.store($init_val)) + .block(); $array.barrier(); }; (LocalLockArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); - $array.wait_all(); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); $array.barrier(); }; (GlobalLockArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); - $array.wait_all(); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); $array.barrier(); }; } @@ -49,13 +55,12 @@ macro_rules! check_val { }; } - -macro_rules! onesided_iter{ +macro_rules! onesided_iter { (GlobalLockArray,$array:ident) => { $array.blocking_read_lock().onesided_iter() }; ($arraytype:ident,$array:ident) => { - $array.onesided_iter() + $array.onesided_iter() }; } diff --git a/tests/array/rdma/blocking_get_test.rs b/tests/array/rdma/blocking_get_test.rs index 74217ca8..7d41d5e9 100644 --- a/tests/array/rdma/blocking_get_test.rs +++ b/tests/array/rdma/blocking_get_test.rs @@ -24,29 +24,30 @@ fn initialize_mem_region( macro_rules! initialize_array { (UnsafeArray,$array:ident,$t:ty) => { unsafe { - let _ = $array + $array .dist_iter_mut() .enumerate() - .for_each(move |(i, x)| *x = i as $t); + .for_each(move |(i, x)| *x = i as $t) + .block() } $array.wait_all(); }; (AtomicArray,$array:ident,$t:ty) => { - let _ = $array + $array .dist_iter() .enumerate() .for_each(move |(i, x)| x.store(i as $t)); $array.wait_all(); }; (LocalLockArray,$array:ident,$t:ty) => { - let _ = $array + $array .dist_iter_mut() .enumerate() .for_each(move |(i, x)| *x = i as $t); $array.wait_all(); }; (GlobalLockArray,$array:ident,$t:ty) => { - let _ = $array + $array .dist_iter_mut() .enumerate() .for_each(move |(i, x)| *x = i as $t); @@ -55,12 +56,11 @@ macro_rules! initialize_array { (ReadOnlyArray,$array:ident,$t:ty) => { let temp = $array.into_unsafe(); unsafe { - let _ = temp - .dist_iter_mut() + temp.dist_iter_mut() .enumerate() - .for_each(move |(i, x)| *x = i as $t); + .for_each(move |(i, x)| *x = i as $t) + .block(); } - temp.wait_all(); $array = temp.into_read_only(); }; } @@ -69,47 +69,47 @@ macro_rules! 
initialize_array_range { (UnsafeArray,$array:ident,$t:ty,$range:expr) => {{ let subarray = $array.sub_array($range); unsafe { - let _ = subarray + subarray .dist_iter_mut() .enumerate() - .for_each(move |(i, x)| *x = i as $t); + .for_each(move |(i, x)| *x = i as $t) + .block(); } - subarray.wait_all(); }}; (AtomicArray,$array:ident,$t:ty,$range:expr) => {{ let subarray = $array.sub_array($range); - let _ = subarray + subarray .dist_iter() .enumerate() - .for_each(move |(i, x)| x.store(i as $t)); - subarray.wait_all(); + .for_each(move |(i, x)| x.store(i as $t)) + .block(); }}; (LocalLockArray,$array:ident,$t:ty,$range:expr) => {{ let subarray = $array.sub_array($range); - let _ = subarray + subarray .dist_iter_mut() .enumerate() - .for_each(move |(i, x)| *x = i as $t); - subarray.wait_all(); + .for_each(move |(i, x)| *x = i as $t) + .block(); }}; (GlobalLockArray,$array:ident,$t:ty,$range:expr) => {{ let subarray = $array.sub_array($range); - let _ = subarray + subarray .dist_iter_mut() .enumerate() - .for_each(move |(i, x)| *x = i as $t); - subarray.wait_all(); + .for_each(move |(i, x)| *x = i as $t) + .block(); }}; (ReadOnlyArray,$array:ident,$t:ty,$range:expr) => {{ let temp = $array.into_unsafe(); let subarray = temp.sub_array($range); unsafe { - let _ = subarray + subarray .dist_iter_mut() .enumerate() - .for_each(move |(i, x)| *x = i as $t); + .for_each(move |(i, x)| *x = i as $t) + .block(); } - subarray.wait_all(); drop(subarray); $array = temp.into_read_only(); }}; diff --git a/tests/array/rdma/get_test.rs b/tests/array/rdma/get_test.rs index 886ecd6a..b03ca058 100644 --- a/tests/array/rdma/get_test.rs +++ b/tests/array/rdma/get_test.rs @@ -18,44 +18,43 @@ fn initialize_mem_region( macro_rules! initialize_array { (UnsafeArray,$array:ident,$t:ty) => { unsafe { - let _ = $array + $array .dist_iter_mut() .enumerate() - .for_each(move |(i, x)| *x = i as $t); - $array.wait_all(); + .for_each(move |(i, x)| *x = i as $t) + .block(); } }; (AtomicArray,$array:ident,$t:ty) => { - let _ = $array + $array .dist_iter() .enumerate() - .for_each(move |(i, x)| x.store(i as $t)); - $array.wait_all(); + .for_each(move |(i, x)| x.store(i as $t)) + .block(); }; (LocalLockArray,$array:ident,$t:ty) => { - let _ = $array + $array .dist_iter_mut() .enumerate() - .for_each(move |(i, x)| *x = i as $t); - $array.wait_all(); + .for_each(move |(i, x)| *x = i as $t) + .block(); }; (GlobalLockArray,$array:ident,$t:ty) => { - let _ = $array + $array .dist_iter_mut() .enumerate() - .for_each(move |(i, x)| *x = i as $t); - $array.wait_all(); + .for_each(move |(i, x)| *x = i as $t) + .block(); }; (ReadOnlyArray,$array:ident,$t:ty) => { // println!("into unsafe"); let temp = $array.into_unsafe(); // println!("unsafe"); unsafe { - let _ = temp - .dist_iter_mut() + temp.dist_iter_mut() .enumerate() - .for_each(move |(i, x)| *x = i as $t); - temp.wait_all(); + .for_each(move |(i, x)| *x = i as $t) + .block(); $array = temp.into_read_only(); } }; @@ -65,36 +64,36 @@ macro_rules! 
initialize_array_range { (UnsafeArray,$array:ident,$t:ty,$range:expr) => {{ unsafe { let subarray = $array.sub_array($range); - let _ = subarray + subarray .dist_iter_mut() .enumerate() - .for_each(move |(i, x)| *x = i as $t); - subarray.wait_all(); + .for_each(move |(i, x)| *x = i as $t) + .block(); } }}; (AtomicArray,$array:ident,$t:ty,$range:expr) => {{ let subarray = $array.sub_array($range); - let _ = subarray + subarray .dist_iter() .enumerate() - .for_each(move |(i, x)| x.store(i as $t)); - subarray.wait_all(); + .for_each(move |(i, x)| x.store(i as $t)) + .block(); }}; (LocalLockArray,$array:ident,$t:ty,$range:expr) => {{ let subarray = $array.sub_array($range); - let _ = subarray + subarray .dist_iter_mut() .enumerate() - .for_each(move |(i, x)| *x = i as $t); - subarray.wait_all(); + .for_each(move |(i, x)| *x = i as $t) + .block(); }}; (GlobalLockArray,$array:ident,$t:ty,$range:expr) => {{ let subarray = $array.sub_array($range); - let _ = subarray + subarray .dist_iter_mut() .enumerate() - .for_each(move |(i, x)| *x = i as $t); - subarray.wait_all(); + .for_each(move |(i, x)| *x = i as $t) + .block(); }}; (ReadOnlyArray,$array:ident,$t:ty,$range:expr) => {{ // println!("into unsafe"); @@ -102,12 +101,11 @@ macro_rules! initialize_array_range { // println!("unsafe"); unsafe { let subarray = temp.sub_array($range); - let _ = subarray + subarray .dist_iter_mut() .enumerate() - .for_each(move |(i, x)| *x = i as $t); - - subarray.wait_all(); + .for_each(move |(i, x)| *x = i as $t) + .block(); drop(subarray); } println!("into read only"); diff --git a/tests/array/rdma/put_test.rs b/tests/array/rdma/put_test.rs index 204a8d81..12c21a08 100644 --- a/tests/array/rdma/put_test.rs +++ b/tests/array/rdma/put_test.rs @@ -18,17 +18,29 @@ fn initialize_mem_region( macro_rules! initialize_array { (UnsafeArray,$array:ident,$init_val:ident) => { unsafe { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); } }; (AtomicArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter().for_each(move |x| x.store($init_val)); + $array + .dist_iter() + .for_each(move |x| x.store($init_val)) + .block(); }; (LocalLockArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); }; (GlobalLockArray,$array:ident,$init_val:ident) => { - let _ = $array.dist_iter_mut().for_each(move |x| *x = $init_val); + $array + .dist_iter_mut() + .for_each(move |x| *x = $init_val) + .block(); }; } From 64e78a1ab0dd24d223edf159b2d0b7ed97a797bd Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Thu, 25 Jul 2024 13:14:59 -0700 Subject: [PATCH 056/116] impl ActiveMessaging for arrays --- examples/array_examples/array_put_get.rs | 15 +- src/active_messaging.rs | 3 +- src/array.rs | 353 +++++++++++++++---- src/array/atomic.rs | 99 +++++- src/array/generic_atomic.rs | 82 ++++- src/array/generic_atomic/rdma.rs | 6 +- src/array/global_lock_atomic.rs | 103 ++++-- src/array/global_lock_atomic/rdma.rs | 14 +- src/array/local_lock_atomic.rs | 100 ++++-- src/array/local_lock_atomic/local_chunks.rs | 4 +- src/array/local_lock_atomic/rdma.rs | 6 +- src/array/native_atomic.rs | 82 ++++- src/array/native_atomic/rdma.rs | 6 +- src/array/read_only.rs | 138 ++++---- src/array/unsafe.rs | 158 +++++++-- src/array/unsafe/iteration/distributed.rs | 2 +- src/array/unsafe/iteration/local.rs | 2 +- src/array/unsafe/rdma.rs | 10 +- src/barrier.rs | 2 +- src/darc.rs | 4 +- src/darc/global_rw_darc.rs | 10 +- src/darc/local_rw_darc.rs | 8 +- src/lamellar_task_group.rs | 7 +- src/lamellar_team.rs | 29 +- src/lamellar_world.rs | 3 +- tests/array/arithmetic_ops/fetch_div_test.rs | 2 +- 26 files changed, 946 insertions(+), 302 deletions(-) diff --git a/examples/array_examples/array_put_get.rs b/examples/array_examples/array_put_get.rs index 2a06bcce..85c38674 100644 --- a/examples/array_examples/array_put_get.rs +++ b/examples/array_examples/array_put_get.rs @@ -1,10 +1,9 @@ use lamellar::array::prelude::*; use lamellar::memregion::prelude::*; -fn initialize_array(array: &UnsafeArray) { - unsafe { array.dist_iter_mut().for_each(|x| *x = 0).block() }; - array.wait_all(); - array.barrier(); +async fn initialize_array(array: &UnsafeArray) { + unsafe { array.dist_iter_mut().for_each(|x| *x = 0).await }; + array.async_barrier().await; } fn initialize_mem_region(memregion: &LamellarMemoryRegion) { @@ -32,8 +31,8 @@ fn main() { let cyclic_array = UnsafeArray::::new(world.team(), total_len, Distribution::Cyclic); let shared_mem_region = world.alloc_shared_mem_region(total_len).into(); //Convert into abstract LamellarMemoryRegion let local_mem_region = world.alloc_one_sided_mem_region(total_len).into(); - initialize_array(&block_array); - initialize_array(&cyclic_array); + initialize_array(&block_array).await; + initialize_array(&cyclic_array).await; initialize_mem_region(&shared_mem_region); initialize_mem_region(&local_mem_region); println!("data initialized"); @@ -84,8 +83,8 @@ fn main() { println!("put elapsed {:?}", start.elapsed().as_secs_f64()); world.async_barrier().await; - initialize_array(&block_array); - initialize_array(&cyclic_array); + initialize_array(&block_array).await; + initialize_array(&cyclic_array).await; // can use subregions block_array.print(); diff --git a/src/active_messaging.rs b/src/active_messaging.rs index 033870bd..6f012acc 100644 --- a/src/active_messaging.rs +++ b/src/active_messaging.rs @@ -632,6 +632,7 @@ //!``` //! Other than the addition of `#[AmData(static)]` the rest of the code as the previous example would be the same. 
+use crate::barrier::BarrierHandle; use crate::darc::__NetworkDarc; use crate::lamellae::{Lamellae, LamellaeRDMA, SerializedData}; use crate::lamellar_arch::IdError; @@ -1195,7 +1196,7 @@ pub trait ActiveMessaging { /// world_clone.async_barrier().await; //block until all PEs have entered the barrier /// }); ///``` - fn async_barrier(&self) -> impl Future + Send; + fn async_barrier(&self) -> BarrierHandle; #[doc(alias("One-sided", "onesided"))] /// Spawns a future on the worker threadpool diff --git a/src/array.rs b/src/array.rs index b8af8f52..e6eba6de 100644 --- a/src/array.rs +++ b/src/array.rs @@ -63,6 +63,7 @@ //! // export to Vec //! let vec = array.local_data().to_vec(); //! ``` +use crate::barrier::BarrierHandle; use crate::lamellar_env::LamellarEnv; use crate::memregion::{ one_sided::OneSidedMemoryRegion, @@ -71,7 +72,9 @@ use crate::memregion::{ LamellarMemoryRegion, RegisteredMemoryRegion, // RemoteMemoryRegion, }; +use crate::scheduler::LamellarTask; use crate::{active_messaging::*, LamellarTeam, LamellarTeamRT}; + // use crate::Darc; use async_trait::async_trait; use enum_dispatch::enum_dispatch; @@ -663,6 +666,121 @@ impl crate::active_messaging::DarcSerde for LamellarReadArray } } +impl ActiveMessaging for LamellarReadArray { + type SinglePeAmHandle = AmHandle; + type MultiAmHandle = MultiAmHandle; + type LocalAmHandle = LocalAmHandle; + fn exec_am_all(&self, am: F) -> Self::MultiAmHandle + where + F: RemoteActiveMessage + LamellarAM + Serde + AmDist, + { + match self { + LamellarReadArray::UnsafeArray(array) => array.exec_am_all(am), + LamellarReadArray::ReadOnlyArray(array) => array.exec_am_all(am), + LamellarReadArray::AtomicArray(array) => array.exec_am_all(am), + LamellarReadArray::LocalLockArray(array) => array.exec_am_all(am), + LamellarReadArray::GlobalLockArray(array) => array.exec_am_all(am), + } + } + fn exec_am_pe(&self, pe: usize, am: F) -> Self::SinglePeAmHandle + where + F: RemoteActiveMessage + LamellarAM + Serde + AmDist, + { + match self { + LamellarReadArray::UnsafeArray(array) => array.exec_am_pe(pe, am), + LamellarReadArray::ReadOnlyArray(array) => array.exec_am_pe(pe, am), + LamellarReadArray::AtomicArray(array) => array.exec_am_pe(pe, am), + LamellarReadArray::LocalLockArray(array) => array.exec_am_pe(pe, am), + LamellarReadArray::GlobalLockArray(array) => array.exec_am_pe(pe, am), + } + } + fn exec_am_local(&self, am: F) -> Self::LocalAmHandle + where + F: LamellarActiveMessage + LocalAM + 'static, + { + match self { + LamellarReadArray::UnsafeArray(array) => array.exec_am_local(am), + LamellarReadArray::ReadOnlyArray(array) => array.exec_am_local(am), + LamellarReadArray::AtomicArray(array) => array.exec_am_local(am), + LamellarReadArray::LocalLockArray(array) => array.exec_am_local(am), + LamellarReadArray::GlobalLockArray(array) => array.exec_am_local(am), + } + } + fn wait_all(&self) { + match self { + LamellarReadArray::UnsafeArray(array) => array.wait_all(), + LamellarReadArray::ReadOnlyArray(array) => array.wait_all(), + LamellarReadArray::AtomicArray(array) => array.wait_all(), + LamellarReadArray::LocalLockArray(array) => array.wait_all(), + LamellarReadArray::GlobalLockArray(array) => array.wait_all(), + } + } + fn await_all(&self) -> impl Future + Send { + let fut: Pin + Send>> = match self { + LamellarReadArray::UnsafeArray(array) => Box::pin(array.await_all()), + LamellarReadArray::ReadOnlyArray(array) => Box::pin(array.await_all()), + LamellarReadArray::AtomicArray(array) => Box::pin(array.await_all()), + 
LamellarReadArray::LocalLockArray(array) => Box::pin(array.await_all()), + LamellarReadArray::GlobalLockArray(array) => Box::pin(array.await_all()), + }; + fut + } + fn barrier(&self) { + match self { + LamellarReadArray::UnsafeArray(array) => array.barrier(), + LamellarReadArray::ReadOnlyArray(array) => array.barrier(), + LamellarReadArray::AtomicArray(array) => array.barrier(), + LamellarReadArray::LocalLockArray(array) => array.barrier(), + LamellarReadArray::GlobalLockArray(array) => array.barrier(), + } + } + fn async_barrier(&self) -> BarrierHandle { + match self { + LamellarReadArray::UnsafeArray(array) => array.async_barrier(), + LamellarReadArray::ReadOnlyArray(array) => array.async_barrier(), + LamellarReadArray::AtomicArray(array) => array.async_barrier(), + LamellarReadArray::LocalLockArray(array) => array.async_barrier(), + LamellarReadArray::GlobalLockArray(array) => array.async_barrier(), + } + } + fn spawn(&self, f: F) -> LamellarTask + where + F: Future + Send + 'static, + F::Output: Send, + { + match self { + LamellarReadArray::UnsafeArray(array) => array.spawn(f), + LamellarReadArray::ReadOnlyArray(array) => array.spawn(f), + LamellarReadArray::AtomicArray(array) => array.spawn(f), + LamellarReadArray::LocalLockArray(array) => array.spawn(f), + LamellarReadArray::GlobalLockArray(array) => array.spawn(f), + } + } + fn block_on(&self, f: F) -> F::Output { + match self { + LamellarReadArray::UnsafeArray(array) => array.block_on(f), + LamellarReadArray::ReadOnlyArray(array) => array.block_on(f), + LamellarReadArray::AtomicArray(array) => array.block_on(f), + LamellarReadArray::LocalLockArray(array) => array.block_on(f), + LamellarReadArray::GlobalLockArray(array) => array.block_on(f), + } + } + fn block_on_all(&self, iter: I) -> Vec<<::Item as Future>::Output> + where + I: IntoIterator, + ::Item: Future + Send + 'static, + <::Item as Future>::Output: Send, + { + match self { + LamellarReadArray::UnsafeArray(array) => array.block_on_all(iter), + LamellarReadArray::ReadOnlyArray(array) => array.block_on_all(iter), + LamellarReadArray::AtomicArray(array) => array.block_on_all(iter), + LamellarReadArray::LocalLockArray(array) => array.block_on_all(iter), + LamellarReadArray::GlobalLockArray(array) => array.block_on_all(iter), + } + } +} + /// Represents the array types that allow write operations #[enum_dispatch] #[derive(serde::Serialize, serde::Deserialize, Clone)] @@ -699,6 +817,111 @@ impl crate::active_messaging::DarcSerde for LamellarWriteArra } } +impl ActiveMessaging for LamellarWriteArray { + type SinglePeAmHandle = AmHandle; + type MultiAmHandle = MultiAmHandle; + type LocalAmHandle = LocalAmHandle; + fn exec_am_all(&self, am: F) -> Self::MultiAmHandle + where + F: RemoteActiveMessage + LamellarAM + Serde + AmDist, + { + match self { + LamellarWriteArray::UnsafeArray(array) => array.exec_am_all(am), + LamellarWriteArray::AtomicArray(array) => array.exec_am_all(am), + LamellarWriteArray::LocalLockArray(array) => array.exec_am_all(am), + LamellarWriteArray::GlobalLockArray(array) => array.exec_am_all(am), + } + } + fn exec_am_pe(&self, pe: usize, am: F) -> Self::SinglePeAmHandle + where + F: RemoteActiveMessage + LamellarAM + Serde + AmDist, + { + match self { + LamellarWriteArray::UnsafeArray(array) => array.exec_am_pe(pe, am), + LamellarWriteArray::AtomicArray(array) => array.exec_am_pe(pe, am), + LamellarWriteArray::LocalLockArray(array) => array.exec_am_pe(pe, am), + LamellarWriteArray::GlobalLockArray(array) => array.exec_am_pe(pe, am), + } + } + fn 
exec_am_local(&self, am: F) -> Self::LocalAmHandle + where + F: LamellarActiveMessage + LocalAM + 'static, + { + match self { + LamellarWriteArray::UnsafeArray(array) => array.exec_am_local(am), + LamellarWriteArray::AtomicArray(array) => array.exec_am_local(am), + LamellarWriteArray::LocalLockArray(array) => array.exec_am_local(am), + LamellarWriteArray::GlobalLockArray(array) => array.exec_am_local(am), + } + } + fn wait_all(&self) { + match self { + LamellarWriteArray::UnsafeArray(array) => array.wait_all(), + LamellarWriteArray::AtomicArray(array) => array.wait_all(), + LamellarWriteArray::LocalLockArray(array) => array.wait_all(), + LamellarWriteArray::GlobalLockArray(array) => array.wait_all(), + } + } + fn await_all(&self) -> impl Future + Send { + let fut: Pin + Send>> = match self { + LamellarWriteArray::UnsafeArray(array) => Box::pin(array.await_all()), + LamellarWriteArray::AtomicArray(array) => Box::pin(array.await_all()), + LamellarWriteArray::LocalLockArray(array) => Box::pin(array.await_all()), + LamellarWriteArray::GlobalLockArray(array) => Box::pin(array.await_all()), + }; + fut + } + fn barrier(&self) { + match self { + LamellarWriteArray::UnsafeArray(array) => array.barrier(), + LamellarWriteArray::AtomicArray(array) => array.barrier(), + LamellarWriteArray::LocalLockArray(array) => array.barrier(), + LamellarWriteArray::GlobalLockArray(array) => array.barrier(), + } + } + fn async_barrier(&self) -> BarrierHandle { + match self { + LamellarWriteArray::UnsafeArray(array) => array.async_barrier(), + LamellarWriteArray::AtomicArray(array) => array.async_barrier(), + LamellarWriteArray::LocalLockArray(array) => array.async_barrier(), + LamellarWriteArray::GlobalLockArray(array) => array.async_barrier(), + } + } + fn spawn(&self, f: F) -> LamellarTask + where + F: Future + Send + 'static, + F::Output: Send, + { + match self { + LamellarWriteArray::UnsafeArray(array) => array.spawn(f), + LamellarWriteArray::AtomicArray(array) => array.spawn(f), + LamellarWriteArray::LocalLockArray(array) => array.spawn(f), + LamellarWriteArray::GlobalLockArray(array) => array.spawn(f), + } + } + fn block_on(&self, f: F) -> F::Output { + match self { + LamellarWriteArray::UnsafeArray(array) => array.block_on(f), + LamellarWriteArray::AtomicArray(array) => array.block_on(f), + LamellarWriteArray::LocalLockArray(array) => array.block_on(f), + LamellarWriteArray::GlobalLockArray(array) => array.block_on(f), + } + } + fn block_on_all(&self, iter: I) -> Vec<<::Item as Future>::Output> + where + I: IntoIterator, + ::Item: Future + Send + 'static, + <::Item as Future>::Output: Send, + { + match self { + LamellarWriteArray::UnsafeArray(array) => array.block_on_all(iter), + LamellarWriteArray::AtomicArray(array) => array.block_on_all(iter), + LamellarWriteArray::LocalLockArray(array) => array.block_on_all(iter), + LamellarWriteArray::GlobalLockArray(array) => array.block_on_all(iter), + } + } +} + // impl LamellarArrayReduce for LamellarReadArray { // fn reduce(&self, reduction: &str) -> AmHandle { // match self { @@ -844,13 +1067,13 @@ pub(crate) mod private { pub(crate) trait ArrayExecAm { fn team(&self) -> Pin>; fn team_counters(&self) -> Arc; - fn exec_am_local(&self, am: F) -> LocalAmHandle + fn exec_am_local_tg(&self, am: F) -> LocalAmHandle where F: LamellarActiveMessage + LocalAM + 'static, { self.team().exec_am_local_tg(am, Some(self.team_counters())) } - fn exec_am_pe(&self, pe: usize, am: F) -> AmHandle + fn exec_am_pe_tg(&self, pe: usize, am: F) -> AmHandle where F: RemoteActiveMessage + 
LamellarAM + AmDist, { @@ -864,19 +1087,19 @@ pub(crate) mod private { // self.team() // .exec_arc_am_pe(pe, am, Some(self.team_counters())) // } - // fn exec_am_all(&self, am: F) -> MultiAmHandle - // where - // F: RemoteActiveMessage + LamellarAM + AmDist, - // { - // self.team().exec_am_all_tg(am, Some(self.team_counters())) - // } + fn exec_am_all_tg(&self, am: F) -> MultiAmHandle + where + F: RemoteActiveMessage + LamellarAM + AmDist, + { + self.team().exec_am_all_tg(am, Some(self.team_counters())) + } } } /// Represents a distributed array, providing some convenience functions for getting simple information about the array. /// This is mostly intended for use within the runtime (specifically for use in Proc Macros) but the available functions may be useful to endusers as well. #[enum_dispatch(LamellarReadArray,LamellarWriteArray)] -pub trait LamellarArray: private::LamellarArrayPrivate { +pub trait LamellarArray: private::LamellarArrayPrivate + ActiveMessaging { #[doc(alias("One-sided", "onesided"))] /// Returns the team used to construct this array, the PEs in the team represent the same PEs which have a slice of data of the array /// @@ -972,64 +1195,64 @@ pub trait LamellarArray: private::LamellarArrayPrivate { ///``` // fn use_distribution(self, distribution: Distribution) -> Self; - #[doc(alias = "Collective")] - /// Global synchronization method which blocks calling thread until all PEs in the owning Array data have entered the barrier - /// - /// # Collective Operation - /// Requires all PEs associated with the array to enter the barrier, otherwise deadlock will occur - /// - /// # Examples - ///``` - /// use lamellar::array::prelude::*; - /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic); - /// - /// array.barrier(); - ///``` - fn barrier(&self); + // #[doc(alias = "Collective")] + // /// Global synchronization method which blocks calling thread until all PEs in the owning Array data have entered the barrier + // /// + // /// # Collective Operation + // /// Requires all PEs associated with the array to enter the barrier, otherwise deadlock will occur + // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// let world = LamellarWorldBuilder::new().build(); + // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic); + // /// + // /// array.barrier(); + // ///``` + // fn barrier(&self); - #[doc(alias("One-sided", "onesided"))] - /// blocks calling thread until all remote tasks (e.g. element wise operations) - /// initiated by the calling PE have completed. - /// - /// # One-sided Operation - /// this is not a distributed synchronization primitive (i.e. it has no knowledge of a Remote PEs tasks), the calling thread will only wait for tasks - /// to finish that were initiated by the calling PE itself - /// - /// # Examples - ///``` - /// use lamellar::array::prelude::*; - /// let world = LamellarWorldBuilder::new().build(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic); - /// - /// for i in 0..100{ - /// array.add(i,1); - /// } - /// array.wait_all(); //block until the previous add operations have finished - ///``` - fn wait_all(&self); + // #[doc(alias("One-sided", "onesided"))] + // /// blocks calling thread until all remote tasks (e.g. element wise operations) + // /// initiated by the calling PE have completed. 
+ // /// + // /// # One-sided Operation + // /// this is not a distributed synchronization primitive (i.e. it has no knowledge of a Remote PEs tasks), the calling thread will only wait for tasks + // /// to finish that were initiated by the calling PE itself + // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// let world = LamellarWorldBuilder::new().build(); + // /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic); + // /// + // /// for i in 0..100{ + // /// array.add(i,1); + // /// } + // /// array.wait_all(); //block until the previous add operations have finished + // ///``` + // fn wait_all(&self); - #[doc(alias("One-sided", "onesided"))] - /// Run a future to completion on the current thread - /// - /// This function will block the caller until the given future has completed, the future is executed within the Lamellar threadpool - /// - /// Users can await any future, including those returned from lamellar remote operations - /// - /// # One-sided Operation - /// this is not a distributed synchronization primitive and only blocks the calling thread until the given future has completed on the calling PE - /// - /// # Examples - ///``` - /// use lamellar::array::prelude::*; - /// let world = LamellarWorldBuilder::new().build(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic); - /// - /// let request = array.fetch_add(10,1000); //fetch index 10 and add 1000 to it - /// let result = array.block_on(request); //block until am has executed - /// // we also could have used world.block_on() or team.block_on() - ///``` - fn block_on(&self, f: F) -> F::Output; + // #[doc(alias("One-sided", "onesided"))] + // /// Run a future to completion on the current thread + // /// + // /// This function will block the caller until the given future has completed, the future is executed within the Lamellar threadpool + // /// + // /// Users can await any future, including those returned from lamellar remote operations + // /// + // /// # One-sided Operation + // /// this is not a distributed synchronization primitive and only blocks the calling thread until the given future has completed on the calling PE + // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// let world = LamellarWorldBuilder::new().build(); + // /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic); + // /// + // /// let request = array.fetch_add(10,1000); //fetch index 10 and add 1000 to it + // /// let result = array.block_on(request); //block until am has executed + // /// // we also could have used world.block_on() or team.block_on() + // ///``` + // fn block_on(&self, f: F) -> F::Output; #[doc(alias("One-sided", "onesided"))] /// Given a global index, calculate the PE and offset on that PE where the element actually resides. 
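A minimal usage sketch of what this commit enables: with `ActiveMessaging` implemented for the array types, synchronization and task-driving calls (`wait_all`, `barrier`, `block_on`, `spawn`, `async_barrier`) can be invoked directly on an array handle rather than only on the world or team. The sketch below is illustrative only and not part of the diff; it reuses the constructors and operations shown in the doc examples elsewhere in this patch (`AtomicArray::new`, `fetch_add`), and the exact return types should be treated as assumptions.

use lamellar::array::prelude::*;

fn main() {
    let world = lamellar::LamellarWorldBuilder::new().build();
    let array = AtomicArray::<usize>::new(&world, 100, Distribution::Cyclic);

    // an element-wise operation returns a handle; the array itself can now drive it to completion
    let req = array.fetch_add(10, 1000);
    let _prev = array.block_on(req);

    // the same ActiveMessaging entry points that previously required the world/team handle
    array.wait_all();
    array.barrier();
}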
diff --git a/src/array/atomic.rs b/src/array/atomic.rs index e8ff5ef6..ddd4f92b 100644 --- a/src/array/atomic.rs +++ b/src/array/atomic.rs @@ -2,6 +2,7 @@ mod iteration; pub(crate) mod operations; pub(crate) mod rdma; +use crate::active_messaging::ActiveMessaging; use crate::array::generic_atomic::{GenericAtomicElement, LocalGenericAtomicElement}; use crate::array::iterator::distributed_iterator::DistIteratorLauncher; use crate::array::iterator::local_iterator::LocalIteratorLauncher; @@ -10,8 +11,11 @@ use crate::array::private::LamellarArrayPrivate; use crate::array::*; use crate::config; // use crate::darc::{Darc, DarcMode}; +use crate::barrier::BarrierHandle; use crate::lamellar_team::IntoLamellarTeam; use crate::memregion::Dist; +use crate::scheduler::LamellarTask; + use std::any::TypeId; use std::collections::HashSet; // use std::sync::atomic::Ordering; @@ -557,6 +561,91 @@ impl SubArray for AtomicArray { } } +impl ActiveMessaging for AtomicArray { + type SinglePeAmHandle = AmHandle; + type MultiAmHandle = MultiAmHandle; + type LocalAmHandle = LocalAmHandle; + fn exec_am_all(&self, am: F) -> Self::MultiAmHandle + where + F: RemoteActiveMessage + LamellarAM + Serde + AmDist, + { + match self { + AtomicArray::NativeAtomicArray(array) => array.exec_am_all(am), + AtomicArray::GenericAtomicArray(array) => array.exec_am_all(am), + } + } + fn exec_am_pe(&self, pe: usize, am: F) -> Self::SinglePeAmHandle + where + F: RemoteActiveMessage + LamellarAM + Serde + AmDist, + { + match self { + AtomicArray::NativeAtomicArray(array) => array.exec_am_pe(pe, am), + AtomicArray::GenericAtomicArray(array) => array.exec_am_pe(pe, am), + } + } + fn exec_am_local(&self, am: F) -> Self::LocalAmHandle + where + F: LamellarActiveMessage + LocalAM + 'static, + { + match self { + AtomicArray::NativeAtomicArray(array) => array.exec_am_local(am), + AtomicArray::GenericAtomicArray(array) => array.exec_am_local(am), + } + } + fn wait_all(&self) { + match self { + AtomicArray::NativeAtomicArray(array) => array.wait_all(), + AtomicArray::GenericAtomicArray(array) => array.wait_all(), + } + } + fn await_all(&self) -> impl Future + Send { + let fut: Pin + Send>> = match self { + AtomicArray::NativeAtomicArray(array) => Box::pin(array.await_all()), + AtomicArray::GenericAtomicArray(array) => Box::pin(array.await_all()), + }; + fut + } + fn barrier(&self) { + match self { + AtomicArray::NativeAtomicArray(array) => array.barrier(), + AtomicArray::GenericAtomicArray(array) => array.barrier(), + } + } + fn async_barrier(&self) -> BarrierHandle { + match self { + AtomicArray::NativeAtomicArray(array) => array.async_barrier(), + AtomicArray::GenericAtomicArray(array) => array.async_barrier(), + } + } + fn spawn(&self, f: F) -> LamellarTask + where + F: Future + Send + 'static, + F::Output: Send, + { + match self { + AtomicArray::NativeAtomicArray(array) => array.spawn(f), + AtomicArray::GenericAtomicArray(array) => array.spawn(f), + } + } + fn block_on(&self, f: F) -> F::Output { + match self { + AtomicArray::NativeAtomicArray(array) => array.block_on(f), + AtomicArray::GenericAtomicArray(array) => array.block_on(f), + } + } + fn block_on_all(&self, iter: I) -> Vec<<::Item as Future>::Output> + where + I: IntoIterator, + ::Item: Future + Send + 'static, + <::Item as Future>::Output: Send, + { + match self { + AtomicArray::NativeAtomicArray(array) => array.block_on_all(iter), + AtomicArray::GenericAtomicArray(array) => array.block_on_all(iter), + } + } +} + #[doc(hidden)] #[enum_dispatch] #[derive(serde::Serialize, 
serde::Deserialize, Clone)] @@ -1262,7 +1351,7 @@ impl AtomicArray { if std::thread::current().id() != *crate::MAIN_THREAD { let msg = format!(" [LAMELLAR WARNING] You are calling `AtomicArray::blocking_reduce` from within an async context which may lead to deadlock, it is recommended that you use `reduce(...).await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() ); match config().blocking_call_warning { Some(val) if val => println!("{msg}"), @@ -1360,7 +1449,7 @@ impl AtomicArray { if std::thread::current().id() != *crate::MAIN_THREAD { let msg = format!(" [LAMELLAR WARNING] You are calling `AtomicArray::blocking_sum` from within an async context which may lead to deadlock, it is recommended that you use `sum().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() ); match config().blocking_call_warning { Some(val) if val => println!("{msg}"), @@ -1454,7 +1543,7 @@ impl AtomicArray { if std::thread::current().id() != *crate::MAIN_THREAD { let msg = format!(" [LAMELLAR WARNING] You are calling `AtomicArray::blocking_prod` from within an async context which may lead to deadlock, it is recommended that you use `prod().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() ); match config().blocking_call_warning { Some(val) if val => println!("{msg}"), @@ -1541,7 +1630,7 @@ impl AtomicArray { if std::thread::current().id() != *crate::MAIN_THREAD { let msg = format!(" [LAMELLAR WARNING] You are calling `AtomicArray::blocking_max` from within an async context which may lead to deadlock, it is recommended that you use `max().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() ); match config().blocking_call_warning { Some(val) if val => println!("{msg}"), @@ -1627,7 +1716,7 @@ impl AtomicArray { if std::thread::current().id() != *crate::MAIN_THREAD { let msg = format!(" [LAMELLAR WARNING] You are calling `AtomicArray::blocking_min` from within an async context which may lead to deadlock, it is recommended that you use `min().await;` instead! 
- Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() ); match config().blocking_call_warning { Some(val) if val => println!("{msg}"), diff --git a/src/array/generic_atomic.rs b/src/array/generic_atomic.rs index 6ffa395d..4ba36c2b 100644 --- a/src/array/generic_atomic.rs +++ b/src/array/generic_atomic.rs @@ -3,12 +3,16 @@ pub(crate) mod operations; mod rdma; use crate::array::atomic::AtomicElement; // use crate::array::private::LamellarArrayPrivate; +use crate::array::private::ArrayExecAm; use crate::array::r#unsafe::{UnsafeByteArray, UnsafeByteArrayWeak}; use crate::array::*; +use crate::barrier::BarrierHandle; use crate::darc::Darc; use crate::darc::DarcMode; use crate::lamellar_team::{IntoLamellarTeam, LamellarTeamRT}; use crate::memregion::Dist; +use crate::scheduler::LamellarTask; + use parking_lot::{Mutex, MutexGuard}; use serde::ser::SerializeSeq; // use std::ops::{Deref, DerefMut}; @@ -566,10 +570,6 @@ impl GenericAtomicArray { .expect("invalid local index"); self.locks[index].lock() } - - // pub(crate) fn async_barrier(&self) -> impl std::future::Future + Send + '_ { - // self.array.async_barrier() - // } } impl GenericAtomicArray { @@ -721,6 +721,60 @@ impl private::LamellarArrayPrivate for GenericAtomicArray { } } +impl ActiveMessaging for GenericAtomicArray { + type SinglePeAmHandle = AmHandle; + type MultiAmHandle = MultiAmHandle; + type LocalAmHandle = LocalAmHandle; + fn exec_am_all(&self, am: F) -> Self::MultiAmHandle + where + F: RemoteActiveMessage + LamellarAM + Serde + AmDist, + { + self.array.exec_am_all_tg(am) + } + fn exec_am_pe(&self, pe: usize, am: F) -> Self::SinglePeAmHandle + where + F: RemoteActiveMessage + LamellarAM + Serde + AmDist, + { + self.array.exec_am_pe_tg(pe, am) + } + fn exec_am_local(&self, am: F) -> Self::LocalAmHandle + where + F: LamellarActiveMessage + LocalAM + 'static, + { + self.array.exec_am_local_tg(am) + } + fn wait_all(&self) { + self.array.wait_all() + } + fn await_all(&self) -> impl Future + Send { + self.array.await_all() + } + fn barrier(&self) { + self.array.barrier() + } + fn async_barrier(&self) -> BarrierHandle { + self.array.async_barrier() + } + fn spawn(&self, f: F) -> LamellarTask + where + F: Future + Send + 'static, + F::Output: Send, + { + self.array.spawn(f) + } + fn block_on(&self, f: F) -> F::Output { + self.array.block_on(f) + } + fn block_on_all(&self, iter: I) -> Vec<<::Item as Future>::Output> + where + I: IntoIterator, + ::Item: Future + Send + 'static, + <::Item as Future>::Output: Send, + { + self.array.block_on_all(iter) + } +} + impl LamellarArray for GenericAtomicArray { fn team_rt(&self) -> Pin> { self.array.team_rt().clone() @@ -737,17 +791,17 @@ impl LamellarArray for GenericAtomicArray { fn num_elems_local(&self) -> usize { self.array.num_elems_local() } - fn barrier(&self) { - self.array.barrier(); - } + // fn barrier(&self) { + // self.array.barrier(); + // } - fn wait_all(&self) { - self.array.wait_all() - // println!("done in wait all {:?}",std::time::SystemTime::now()); - } - fn block_on(&self, f: F) -> F::Output { - self.array.block_on(f) - } + // fn wait_all(&self) { + // self.array.wait_all() + // // println!("done in wait all {:?}",std::time::SystemTime::now()); + // } + // fn block_on(&self, f: F) -> F::Output { + // 
self.array.block_on(f) + // } fn pe_and_offset_for_global_index(&self, index: usize) -> Option<(usize, usize)> { self.array.pe_and_offset_for_global_index(index) } diff --git a/src/array/generic_atomic/rdma.rs b/src/array/generic_atomic/rdma.rs index 355845ca..0d182608 100644 --- a/src/array/generic_atomic/rdma.rs +++ b/src/array/generic_atomic/rdma.rs @@ -117,7 +117,7 @@ impl LamellarAm for InitGetAm { start_index: self.index, len: self.buf.len(), }; - reqs.push(self.array.exec_am_pe(pe, remote_am)); + reqs.push(self.array.exec_am_pe_tg(pe, remote_am)); } unsafe { match self.array.array.inner.distribution { @@ -248,7 +248,7 @@ impl LamellarAm for InitPutAm { [cur_index..(cur_index + u8_buf_len)] .to_vec(), }; - reqs.push(self.array.exec_am_pe(pe, remote_am)); + reqs.push(self.array.exec_am_pe_tg(pe, remote_am)); cur_index += u8_buf_len; } else { panic!("this should not be possible"); @@ -301,7 +301,7 @@ impl LamellarAm for InitPutAm { len: self.buf.len(), data: vec, }; - reqs.push(self.array.exec_am_pe(pe, remote_am)); + reqs.push(self.array.exec_am_pe_tg(pe, remote_am)); } } } diff --git a/src/array/global_lock_atomic.rs b/src/array/global_lock_atomic.rs index 6ebd5a6b..33f464e6 100644 --- a/src/array/global_lock_atomic.rs +++ b/src/array/global_lock_atomic.rs @@ -1,9 +1,11 @@ mod iteration; pub(crate) mod operations; mod rdma; +use crate::array::private::ArrayExecAm; use crate::array::private::LamellarArrayPrivate; use crate::array::r#unsafe::{UnsafeByteArray, UnsafeByteArrayWeak}; use crate::array::*; +use crate::barrier::BarrierHandle; use crate::config; use crate::darc::global_rw_darc::{ GlobalRwDarc, GlobalRwDarcCollectiveWriteGuard, GlobalRwDarcReadGuard, GlobalRwDarcWriteGuard, @@ -12,6 +14,7 @@ use crate::darc::DarcMode; use crate::lamellar_request::LamellarRequest; use crate::lamellar_team::{IntoLamellarTeam, LamellarTeamRT}; use crate::memregion::Dist; +use crate::scheduler::LamellarTask; use pin_project::pin_project; @@ -376,11 +379,11 @@ impl GlobalLockArray { if let Some(val) = config().blocking_call_warning { if val { println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_read_lock` from within an async context which may lead to deadlock, it is recommended that you use `read_lock().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture()); + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); } } else { println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_read_lock` from within an async context which may lead to deadlock, it is recommended that you use `read_lock().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture()); + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); } } let self_clone: GlobalLockArray = self.clone(); @@ -445,11 +448,11 @@ impl GlobalLockArray { if let Some(val) = config().blocking_call_warning { if val { println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_write_lock` from within an async context which may lead to deadlock, it is recommended that you use `write_lock().await;` instead! 
- Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture()); + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); } } else { println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_write_lock` from within an async context which may lead to deadlock, it is recommended that you use `write_lock().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture()); + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); } } let self_clone: GlobalLockArray = self.clone(); @@ -514,11 +517,11 @@ impl GlobalLockArray { if let Some(val) = config().blocking_call_warning { if val { println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_read_local_data` from within an async context which may lead to deadlock, it is recommended that you use `read_local_data().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture()); + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); } } else { println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_read_local_data` from within an async context which may lead to deadlock, it is recommended that you use `read_local_data().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture()); + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); } } let self_clone: GlobalLockArray = self.clone(); @@ -591,11 +594,11 @@ impl GlobalLockArray { if let Some(val) = config().blocking_call_warning { if val { println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_write_local_data` from within an async context which may lead to deadlock, it is recommended that you use `write_local_data().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture()); + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); } } else { println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_write_local_data` from within an async context which may lead to deadlock, it is recommended that you use `write_local_data().await;` instead! 
- Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture()); + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); } } let self_clone: GlobalLockArray = self.clone(); @@ -670,11 +673,11 @@ impl GlobalLockArray { if let Some(val) = config().blocking_call_warning { if val { println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_collective_write_local_data` from within an async context which may lead to deadlock, it is recommended that you use `collective_write_local_data().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture()); + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); } } else { println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_collective_write_local_data` from within an async context which may lead to deadlock, it is recommended that you use `collective_write_local_data().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture()); + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); } } let self_clone: GlobalLockArray = self.clone(); @@ -869,10 +872,6 @@ impl GlobalLockArray { // println!("GlobalLock into_read_only"); self.array.into() } - - // pub(crate) fn async_barrier(&self) -> impl std::future::Future + Send + '_ { - // self.array.async_barrier() - // } } impl GlobalLockArray { @@ -1062,6 +1061,60 @@ impl private::LamellarArrayPrivate for GlobalLockArray { } } +impl ActiveMessaging for GlobalLockArray { + type SinglePeAmHandle = AmHandle; + type MultiAmHandle = MultiAmHandle; + type LocalAmHandle = LocalAmHandle; + fn exec_am_all(&self, am: F) -> Self::MultiAmHandle + where + F: RemoteActiveMessage + LamellarAM + Serde + AmDist, + { + self.array.exec_am_all_tg(am) + } + fn exec_am_pe(&self, pe: usize, am: F) -> Self::SinglePeAmHandle + where + F: RemoteActiveMessage + LamellarAM + Serde + AmDist, + { + self.array.exec_am_pe_tg(pe, am) + } + fn exec_am_local(&self, am: F) -> Self::LocalAmHandle + where + F: LamellarActiveMessage + LocalAM + 'static, + { + self.array.exec_am_local_tg(am) + } + fn wait_all(&self) { + self.array.wait_all() + } + fn await_all(&self) -> impl Future + Send { + self.array.await_all() + } + fn barrier(&self) { + self.array.barrier() + } + fn async_barrier(&self) -> BarrierHandle { + self.array.async_barrier() + } + fn spawn(&self, f: F) -> LamellarTask + where + F: Future + Send + 'static, + F::Output: Send, + { + self.array.spawn(f) + } + fn block_on(&self, f: F) -> F::Output { + self.array.block_on(f) + } + fn block_on_all(&self, iter: I) -> Vec<<::Item as Future>::Output> + where + I: IntoIterator, + ::Item: Future + Send + 'static, + <::Item as Future>::Output: Send, + { + self.array.block_on_all(iter) + } +} + impl LamellarArray for GlobalLockArray { fn team_rt(&self) -> Pin> { self.array.team_rt().clone() @@ -1078,16 +1131,16 @@ impl LamellarArray for GlobalLockArray { fn num_elems_local(&self) -> 
usize { self.array.num_elems_local() } - fn barrier(&self) { - self.array.barrier(); - } - fn wait_all(&self) { - self.array.wait_all() - // println!("done in wait all {:?}",std::time::SystemTime::now()); - } - fn block_on(&self, f: F) -> F::Output { - self.array.block_on(f) - } + // fn barrier(&self) { + // self.array.barrier(); + // } + // fn wait_all(&self) { + // self.array.wait_all() + // // println!("done in wait all {:?}",std::time::SystemTime::now()); + // } + // fn block_on(&self, f: F) -> F::Output { + // self.array.block_on(f) + // } fn pe_and_offset_for_global_index(&self, index: usize) -> Option<(usize, usize)> { self.array.pe_and_offset_for_global_index(index) } @@ -1260,7 +1313,7 @@ impl GlobalLockReadGuard { if std::thread::current().id() != *crate::MAIN_THREAD { let msg = format!(" [LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_reduce` from within an async context which may lead to deadlock, it is recommended that you use `reduce(...).await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() ); match config().blocking_call_warning { Some(val) if val => println!("{msg}"), diff --git a/src/array/global_lock_atomic/rdma.rs b/src/array/global_lock_atomic/rdma.rs index 9fc9110e..7a19c9d4 100644 --- a/src/array/global_lock_atomic/rdma.rs +++ b/src/array/global_lock_atomic/rdma.rs @@ -22,7 +22,7 @@ impl LamellarArrayInternalGet for GlobalLockArray { index: usize, buf: U, ) -> ArrayRdmaHandle { - let req = self.exec_am_local(InitGetAm { + let req = self.exec_am_local_tg(InitGetAm { array: self.clone(), index: index, buf: buf.into(), @@ -34,7 +34,7 @@ impl LamellarArrayInternalGet for GlobalLockArray { } unsafe fn internal_at(&self, index: usize) -> ArrayRdmaAtHandle { let buf: OneSidedMemoryRegion = self.array.team_rt().alloc_one_sided_mem_region(1); - let req = self.exec_am_local(InitGetAm { + let req = self.exec_am_local_tg(InitGetAm { array: self.clone(), index: index, buf: buf.clone().into(), @@ -72,7 +72,7 @@ impl LamellarArrayInternalPut for GlobalLockArray { index: usize, buf: U, ) -> ArrayRdmaHandle { - let req = self.exec_am_local(InitPutAm { + let req = self.exec_am_local_tg(InitPutAm { array: self.clone(), index: index, buf: buf.into(), @@ -127,7 +127,7 @@ impl LamellarAm for InitGetAm { start_index: self.index, len: self.buf.len(), }; - reqs.push(self.array.exec_am_pe(pe, remote_am)); + reqs.push(self.array.exec_am_pe_tg(pe, remote_am)); } unsafe { match self.array.array.inner.distribution { @@ -232,7 +232,7 @@ impl LamellarAm for InitPutAm { .into(), pe: self.array.my_pe(), }; - reqs.push(self.array.exec_am_pe(pe, remote_am)); + reqs.push(self.array.exec_am_pe_tg(pe, remote_am)); } else { let remote_am = GlobalLockRemoteSmallPutAm { array: self.array.clone().into(), //inner of the indices we need to place data into @@ -242,7 +242,7 @@ impl LamellarAm for InitPutAm { [cur_index..(cur_index + u8_buf_len)] .to_vec(), }; - reqs.push(self.array.exec_am_pe(pe, remote_am)); + reqs.push(self.array.exec_am_pe_tg(pe, remote_am)); } cur_index += u8_buf_len; } else { @@ -297,7 +297,7 @@ impl LamellarAm for InitPutAm { len: self.buf.len(), data: vec, }; - reqs.push(self.array.exec_am_pe(pe, remote_am)); + reqs.push(self.array.exec_am_pe_tg(pe, remote_am)); } } } diff --git 
a/src/array/local_lock_atomic.rs b/src/array/local_lock_atomic.rs index 6be407b9..5e12f193 100644 --- a/src/array/local_lock_atomic.rs +++ b/src/array/local_lock_atomic.rs @@ -3,15 +3,19 @@ pub(crate) mod local_chunks; pub use local_chunks::{LocalLockLocalChunks, LocalLockLocalChunksMut}; pub(crate) mod operations; mod rdma; +use crate::array::private::ArrayExecAm; use crate::array::private::LamellarArrayPrivate; use crate::array::r#unsafe::{UnsafeByteArray, UnsafeByteArrayWeak}; use crate::array::*; +use crate::barrier::BarrierHandle; use crate::config; use crate::darc::local_rw_darc::LocalRwDarc; use crate::darc::DarcMode; use crate::lamellar_request::LamellarRequest; use crate::lamellar_team::{IntoLamellarTeam, LamellarTeamRT}; use crate::memregion::Dist; +use crate::scheduler::LamellarTask; + // use parking_lot::{ // lock_api::{ArcRwLockReadGuard, ArcRwLockWriteGuard}, // RawRwLock, @@ -350,11 +354,11 @@ impl LocalLockArray { if let Some(val) = config().blocking_call_warning { if val { println!("[LAMELLAR WARNING] You are calling `LocalLockArray::blocking_read_lock` from within an async context which may lead to deadlock, it is recommended that you use `read_lock().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture()); + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); } } else { println!("[LAMELLAR WARNING] You are calling `LocalLockArray::blocking_read_lock` from within an async context which may lead to deadlock, it is recommended that you use `read_lock().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture()); + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); } } let self_clone: LocalLockArray = self.clone(); @@ -418,11 +422,11 @@ impl LocalLockArray { if let Some(val) = config().blocking_call_warning { if val { println!("[LAMELLAR WARNING] You are calling `LocalLockArray::blocking_write_lock` from within an async context which may lead to deadlock, it is recommended that you use `write_lock().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture()); + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); } } else { println!("[LAMELLAR WARNING] You are calling `LocalLockArray::blocking_write_lock` from within an async context which may lead to deadlock, it is recommended that you use `write_lock().await;` instead! 
- Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture()); + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); } } let self_clone: LocalLockArray = self.clone(); @@ -484,11 +488,11 @@ impl LocalLockArray { if let Some(val) = config().blocking_call_warning { if val { println!("[LAMELLAR WARNING] You are calling `LocalLockArray::blocking_read_local_data` from within an async context which may lead to deadlock, it is recommended that you use `read_local_data().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture()); + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); } } else { println!("[LAMELLAR WARNING] You are calling `LocalLockArray::blocking_read_local_data` from within an async context which may lead to deadlock, it is recommended that you use `read_local_data().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture()); + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); } } let self_clone: LocalLockArray = self.clone(); @@ -556,11 +560,11 @@ impl LocalLockArray { if let Some(val) = config().blocking_call_warning { if val { println!("[LAMELLAR WARNING] You are calling `LocalLockArray::blocking_write_local_data` from within an async context which may lead to deadlock, it is recommended that you use `write_local_data().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture()); + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); } } else { println!("[LAMELLAR WARNING] You are calling `LocalLockArray::blocking_write_local_data` from within an async context which may lead to deadlock, it is recommended that you use `write_local_data().await;` instead! 
- Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture()); + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); } } let self_clone: LocalLockArray = self.clone(); @@ -755,10 +759,6 @@ impl LocalLockArray { // println!("readonly into_global_lock"); self.array.into() } - - // pub(crate) fn async_barrier(&self) -> impl std::future::Future + Send + '_ { - // self.array.async_barrier() - // } } impl LocalLockArray { @@ -948,6 +948,60 @@ impl private::LamellarArrayPrivate for LocalLockArray { } } +impl ActiveMessaging for LocalLockArray { + type SinglePeAmHandle = AmHandle; + type MultiAmHandle = MultiAmHandle; + type LocalAmHandle = LocalAmHandle; + fn exec_am_all(&self, am: F) -> Self::MultiAmHandle + where + F: RemoteActiveMessage + LamellarAM + Serde + AmDist, + { + self.array.exec_am_all_tg(am) + } + fn exec_am_pe(&self, pe: usize, am: F) -> Self::SinglePeAmHandle + where + F: RemoteActiveMessage + LamellarAM + Serde + AmDist, + { + self.array.exec_am_pe_tg(pe, am) + } + fn exec_am_local(&self, am: F) -> Self::LocalAmHandle + where + F: LamellarActiveMessage + LocalAM + 'static, + { + self.array.exec_am_local_tg(am) + } + fn wait_all(&self) { + self.array.wait_all() + } + fn await_all(&self) -> impl Future + Send { + self.array.await_all() + } + fn barrier(&self) { + self.array.barrier() + } + fn async_barrier(&self) -> BarrierHandle { + self.array.async_barrier() + } + fn spawn(&self, f: F) -> LamellarTask + where + F: Future + Send + 'static, + F::Output: Send, + { + self.array.spawn(f) + } + fn block_on(&self, f: F) -> F::Output { + self.array.block_on(f) + } + fn block_on_all(&self, iter: I) -> Vec<<::Item as Future>::Output> + where + I: IntoIterator, + ::Item: Future + Send + 'static, + <::Item as Future>::Output: Send, + { + self.array.block_on_all(iter) + } +} + impl LamellarArray for LocalLockArray { fn team_rt(&self) -> Pin> { self.array.team_rt().clone() @@ -964,16 +1018,16 @@ impl LamellarArray for LocalLockArray { fn num_elems_local(&self) -> usize { self.array.num_elems_local() } - fn barrier(&self) { - self.array.barrier(); - } - fn wait_all(&self) { - self.array.wait_all() - // println!("done in wait all {:?}",std::time::SystemTime::now()); - } - fn block_on(&self, f: F) -> F::Output { - self.array.block_on(f) - } + // fn barrier(&self) { + // self.array.barrier(); + // } + // fn wait_all(&self) { + // self.array.wait_all() + // // println!("done in wait all {:?}",std::time::SystemTime::now()); + // } + // fn block_on(&self, f: F) -> F::Output { + // self.array.block_on(f) + // } fn pe_and_offset_for_global_index(&self, index: usize) -> Option<(usize, usize)> { self.array.pe_and_offset_for_global_index(index) } @@ -1151,7 +1205,7 @@ impl LocalLockReadGuard { if std::thread::current().id() != *crate::MAIN_THREAD { let msg = format!(" [LAMELLAR WARNING] You are calling `LocalLockArray::blocking_reduce` from within an async context which may lead to deadlock, it is recommended that you use `reduce(...).await;` instead! 
- Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() ); match config().blocking_call_warning { Some(val) if val => println!("{msg}"), diff --git a/src/array/local_lock_atomic/local_chunks.rs b/src/array/local_lock_atomic/local_chunks.rs index db155cfc..47b6ee9c 100644 --- a/src/array/local_lock_atomic/local_chunks.rs +++ b/src/array/local_lock_atomic/local_chunks.rs @@ -269,7 +269,7 @@ impl LocalLockArray { if std::thread::current().id() != *crate::MAIN_THREAD { let msg = format!(" [LAMELLAR WARNING] You are calling `LocalLockArray::blocking_read_local_chunks` from within an async context which may lead to deadlock, it is recommended that you use `read_local_chunks().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() ); match config().blocking_call_warning { Some(val) if val => println!("{msg}"), @@ -340,7 +340,7 @@ impl LocalLockArray { if std::thread::current().id() != *crate::MAIN_THREAD { let msg = format!(" [LAMELLAR WARNING] You are calling `LocalLockArray::blocking_write_local_chunks` from within an async context which may lead to deadlock, it is recommended that you use `write_local_chunks().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() ); match config().blocking_call_warning { Some(val) if val => println!("{msg}"), diff --git a/src/array/local_lock_atomic/rdma.rs b/src/array/local_lock_atomic/rdma.rs index 5c98dbf0..71958071 100644 --- a/src/array/local_lock_atomic/rdma.rs +++ b/src/array/local_lock_atomic/rdma.rs @@ -120,7 +120,7 @@ impl LamellarAm for InitGetAm { start_index: self.index, len: self.buf.len(), }; - reqs.push(self.array.exec_am_pe(pe, remote_am)); + reqs.push(self.array.exec_am_pe_tg(pe, remote_am)); } unsafe { match self.array.array.inner.distribution { @@ -230,7 +230,7 @@ impl LamellarAm for InitPutAm { [cur_index..(cur_index + u8_buf_len)] .to_vec(), }; - reqs.push(self.array.exec_am_pe(pe, remote_am)); + reqs.push(self.array.exec_am_pe_tg(pe, remote_am)); cur_index += u8_buf_len; } else { panic!("this should not be possible"); @@ -283,7 +283,7 @@ impl LamellarAm for InitPutAm { len: self.buf.len(), data: vec, }; - reqs.push(self.array.exec_am_pe(pe, remote_am)); + reqs.push(self.array.exec_am_pe_tg(pe, remote_am)); } } } diff --git a/src/array/native_atomic.rs b/src/array/native_atomic.rs index 83946563..ff664ea4 100644 --- a/src/array/native_atomic.rs +++ b/src/array/native_atomic.rs @@ -6,9 +6,13 @@ use crate::array::atomic::AtomicElement; use crate::array::r#unsafe::{UnsafeByteArray, UnsafeByteArrayWeak}; use crate::array::*; // use crate::darc::Darc; +use crate::array::private::ArrayExecAm; +use crate::barrier::BarrierHandle; use crate::darc::DarcMode; use 
crate::lamellar_team::{IntoLamellarTeam, LamellarTeamRT}; use crate::memregion::Dist; +use crate::scheduler::LamellarTask; + use serde::ser::SerializeSeq; use std::any::TypeId; use std::ops::{ @@ -963,10 +967,6 @@ impl NativeAtomicArray { // println!("native into_read_only"); self.array.into() } - - // pub(crate) fn async_barrier(&self) -> impl std::future::Future + Send + '_ { - // self.array.async_barrier() - // } } impl TeamFrom<(Vec, Distribution)> for NativeAtomicArray { @@ -1112,6 +1112,60 @@ impl private::LamellarArrayPrivate for NativeAtomicArray { } } +impl ActiveMessaging for NativeAtomicArray { + type SinglePeAmHandle = AmHandle; + type MultiAmHandle = MultiAmHandle; + type LocalAmHandle = LocalAmHandle; + fn exec_am_all(&self, am: F) -> Self::MultiAmHandle + where + F: RemoteActiveMessage + LamellarAM + Serde + AmDist, + { + self.array.exec_am_all_tg(am) + } + fn exec_am_pe(&self, pe: usize, am: F) -> Self::SinglePeAmHandle + where + F: RemoteActiveMessage + LamellarAM + Serde + AmDist, + { + self.array.exec_am_pe_tg(pe, am) + } + fn exec_am_local(&self, am: F) -> Self::LocalAmHandle + where + F: LamellarActiveMessage + LocalAM + 'static, + { + self.array.exec_am_local_tg(am) + } + fn wait_all(&self) { + self.array.wait_all() + } + fn await_all(&self) -> impl Future + Send { + self.array.await_all() + } + fn barrier(&self) { + self.array.barrier() + } + fn async_barrier(&self) -> BarrierHandle { + self.array.async_barrier() + } + fn spawn(&self, f: F) -> LamellarTask + where + F: Future + Send + 'static, + F::Output: Send, + { + self.array.spawn(f) + } + fn block_on(&self, f: F) -> F::Output { + self.array.block_on(f) + } + fn block_on_all(&self, iter: I) -> Vec<<::Item as Future>::Output> + where + I: IntoIterator, + ::Item: Future + Send + 'static, + <::Item as Future>::Output: Send, + { + self.array.block_on_all(iter) + } +} + //#[doc(hidden)] impl LamellarArray for NativeAtomicArray { fn team_rt(&self) -> Pin> { @@ -1129,16 +1183,16 @@ impl LamellarArray for NativeAtomicArray { fn num_elems_local(&self) -> usize { self.array.num_elems_local() } - fn barrier(&self) { - self.array.barrier(); - } - fn wait_all(&self) { - self.array.wait_all() - // println!("done in wait all {:?}",std::time::SystemTime::now()); - } - fn block_on(&self, f: F) -> F::Output { - self.array.block_on(f) - } + // fn barrier(&self) { + // self.array.barrier(); + // } + // fn wait_all(&self) { + // self.array.wait_all() + // // println!("done in wait all {:?}",std::time::SystemTime::now()); + // } + // fn block_on(&self, f: F) -> F::Output { + // self.array.block_on(f) + // } fn pe_and_offset_for_global_index(&self, index: usize) -> Option<(usize, usize)> { self.array.pe_and_offset_for_global_index(index) } diff --git a/src/array/native_atomic/rdma.rs b/src/array/native_atomic/rdma.rs index b9093c5d..76ab43f5 100644 --- a/src/array/native_atomic/rdma.rs +++ b/src/array/native_atomic/rdma.rs @@ -116,7 +116,7 @@ impl LamellarAm for InitGetAm { start_index: self.index, len: self.buf.len(), }; - reqs.push(self.array.exec_am_pe(pe, remote_am)); + reqs.push(self.array.exec_am_pe_tg(pe, remote_am)); } unsafe { match self.array.array.inner.distribution { @@ -229,7 +229,7 @@ impl LamellarAm for InitPutAm { [cur_index..(cur_index + u8_buf_len)] .to_vec(), }; - reqs.push(self.array.exec_am_pe(pe, remote_am)); + reqs.push(self.array.exec_am_pe_tg(pe, remote_am)); cur_index += u8_buf_len; } else { panic!("this should not be possible"); @@ -282,7 +282,7 @@ impl LamellarAm for InitPutAm { len: self.buf.len(), data: 
vec, }; - reqs.push(self.array.exec_am_pe(pe, remote_am)); + reqs.push(self.array.exec_am_pe_tg(pe, remote_am)); } } } diff --git a/src/array/read_only.rs b/src/array/read_only.rs index 210fbf67..f79d22ba 100644 --- a/src/array/read_only.rs +++ b/src/array/read_only.rs @@ -2,67 +2,17 @@ mod iteration; pub(crate) mod local_chunks; pub use local_chunks::ReadOnlyLocalChunks; mod rdma; +use crate::array::private::ArrayExecAm; use crate::array::private::LamellarArrayPrivate; use crate::array::*; +use crate::barrier::BarrierHandle; use crate::config; use crate::darc::DarcMode; use crate::lamellar_team::{IntoLamellarTeam, LamellarTeamRT}; use crate::memregion::Dist; -use std::sync::Arc; - -// type BufFn = fn(ReadOnlyByteArrayWeak) -> Arc; - -// type MultiMultiFn = fn(ReadOnlyByteArray,ArrayOpCmd,Vec) -> LamellarArcAm; -// type MultiSingleFn = fn(ReadOnlyByteArray,ArrayOpCmd,Vec,Vec) -> LamellarArcAm; - -// lazy_static! { -// pub(crate) static ref BUFOPS: HashMap = { -// let mut map = HashMap::new(); -// for op in crate::inventory::iter:: { -// map.insert(op.id.clone(), op.op); -// } -// map -// }; - -// pub(crate) static ref MULTIMULTIOPS: HashMap = { -// let mut map = HashMap::new(); -// for op in crate::inventory::iter:: { -// map.insert(op.id.clone(), op.op); -// } -// map -// }; - -// pub(crate) static ref MULTISINGLEOPS: HashMap = { -// let mut map = HashMap::new(); -// for op in crate::inventory::iter:: { -// map.insert(op.id.clone(), op.op); -// } -// map -// }; +use crate::scheduler::LamellarTask; -// } - -// //#[doc(hidden)] -// pub struct ReadOnlyArrayOpBuf { -// pub id: TypeId, -// pub op: BufFn, -// } - -// //#[doc(hidden)] -// pub struct ReadOnlyArrayMultiMultiOps { -// pub id: TypeId, -// pub op: MultiMultiFn, -// } - -// //#[doc(hidden)] -// pub struct ReadOnlyArrayMultiSingleOps { -// pub id: TypeId, -// pub op: MultiSingleFn, -// } - -// crate::inventory::collect!(ReadOnlyArrayOpBuf); -// crate::inventory::collect!(ReadOnlyArrayMultiMultiOps); -// crate::inventory::collect!(ReadOnlyArrayMultiSingleOps); +use std::sync::Arc; /// A safe abstraction of a distributed array, providing only read access. #[lamellar_impl::AmDataRT(Clone, Debug)] @@ -336,10 +286,6 @@ impl ReadOnlyArray { // println!("readonly into_global_lock"); self.array.into() } - - // pub(crate) fn async_barrier(&self) -> impl std::future::Future + Send + '_ { - // self.array.async_barrier() - // } } impl ReadOnlyArray { @@ -553,7 +499,7 @@ impl ReadOnlyArray { if std::thread::current().id() != *crate::MAIN_THREAD { let msg = format!(" [LAMELLAR WARNING] You are calling `ReadOnlyArray::blocking_reduce` from within an async context which may lead to deadlock, it is recommended that you use `reduce(...).await;` instead! 
- Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() ); match config().blocking_call_warning { Some(val) if val => println!("{msg}"), @@ -807,6 +753,60 @@ impl private::LamellarArrayPrivate for ReadOnlyArray { } } +impl ActiveMessaging for ReadOnlyArray { + type SinglePeAmHandle = AmHandle; + type MultiAmHandle = MultiAmHandle; + type LocalAmHandle = LocalAmHandle; + fn exec_am_all(&self, am: F) -> Self::MultiAmHandle + where + F: RemoteActiveMessage + LamellarAM + Serde + AmDist, + { + self.array.exec_am_all_tg(am) + } + fn exec_am_pe(&self, pe: usize, am: F) -> Self::SinglePeAmHandle + where + F: RemoteActiveMessage + LamellarAM + Serde + AmDist, + { + self.array.exec_am_pe_tg(pe, am) + } + fn exec_am_local(&self, am: F) -> Self::LocalAmHandle + where + F: LamellarActiveMessage + LocalAM + 'static, + { + self.array.exec_am_local_tg(am) + } + fn wait_all(&self) { + self.array.wait_all() + } + fn await_all(&self) -> impl Future + Send { + self.array.await_all() + } + fn barrier(&self) { + self.array.barrier() + } + fn async_barrier(&self) -> BarrierHandle { + self.array.async_barrier() + } + fn spawn(&self, f: F) -> LamellarTask + where + F: Future + Send + 'static, + F::Output: Send, + { + self.array.spawn(f) + } + fn block_on(&self, f: F) -> F::Output { + self.array.block_on(f) + } + fn block_on_all(&self, iter: I) -> Vec<<::Item as Future>::Output> + where + I: IntoIterator, + ::Item: Future + Send + 'static, + <::Item as Future>::Output: Send, + { + self.array.block_on_all(iter) + } +} + impl LamellarArray for ReadOnlyArray { fn team_rt(&self) -> Pin> { self.array.team_rt().clone() @@ -823,17 +823,17 @@ impl LamellarArray for ReadOnlyArray { fn num_elems_local(&self) -> usize { self.array.num_elems_local() } - fn barrier(&self) { - self.array.barrier(); - } + // fn barrier(&self) { + // self.array.barrier(); + // } - fn wait_all(&self) { - self.array.wait_all() - // println!("done in wait all {:?}",std::time::SystemTime::now()); - } - fn block_on(&self, f: F) -> F::Output { - self.array.block_on(f) - } + // fn wait_all(&self) { + // self.array.wait_all() + // // println!("done in wait all {:?}",std::time::SystemTime::now()); + // } + // fn block_on(&self, f: F) -> F::Output { + // self.array.block_on(f) + // } fn pe_and_offset_for_global_index(&self, index: usize) -> Option<(usize, usize)> { self.array.pe_and_offset_for_global_index(index) } diff --git a/src/array/unsafe.rs b/src/array/unsafe.rs index 17711f2c..663f8168 100644 --- a/src/array/unsafe.rs +++ b/src/array/unsafe.rs @@ -5,6 +5,7 @@ pub(crate) mod local_chunks; pub(crate) mod operations; mod rdma; +use crate::active_messaging::ActiveMessaging; use crate::active_messaging::*; // use crate::array::r#unsafe::operations::BUFOPS; use crate::array::private::{ArrayExecAm, LamellarArrayPrivate}; @@ -16,6 +17,7 @@ use crate::env_var::config; use crate::lamellae::AllocationType; use crate::lamellar_team::{IntoLamellarTeam, LamellarTeamRT}; use crate::memregion::{Dist, MemoryRegion}; +use crate::scheduler::LamellarTask; use crate::LamellarTaskGroup; use core::marker::PhantomData; @@ -706,7 +708,7 @@ impl UnsafeArray { self.inner.data.team.tasking_barrier(); } - pub(crate) fn async_barrier(&self) -> impl std::future::Future + Send + '_ { + pub(crate) fn 
async_barrier(&self) -> BarrierHandle { self.inner.data.team.async_barrier() } } @@ -820,7 +822,7 @@ impl TeamFrom<(&Vec, Distribution)> for UnsafeArray { let msg = format!(" [LAMELLAR WARNING] You are calling `Array::team_from` from within an async context which may lead to deadlock, this is unintended and likely a Runtime bug. Please open a github issue at https://github.com/pnnl/lamellar-runtime/issues including a backtrace if possible. - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() ); match config().blocking_call_warning { Some(val) if val => println!("{msg}"), @@ -989,31 +991,37 @@ impl LamellarArrayPrivate for UnsafeArray { } } -impl LamellarArray for UnsafeArray { - fn team_rt(&self) -> Pin> { - self.inner.data.team.clone() - } - - // fn my_pe(&self) -> usize { - // self.inner.data.my_pe - // } - - // fn num_pes(&self) -> usize { - // self.inner.data.num_pes - // } - - fn len(&self) -> usize { - self.inner.size - } - - fn num_elems_local(&self) -> usize { - self.inner.num_elems_local() - } - - fn barrier(&self) { - self.inner.data.team.tasking_barrier(); +impl ActiveMessaging for UnsafeArray { + type SinglePeAmHandle = AmHandle; + type MultiAmHandle = MultiAmHandle; + type LocalAmHandle = LocalAmHandle; + fn exec_am_all(&self, am: F) -> Self::MultiAmHandle + where + F: RemoteActiveMessage + LamellarAM + Serde + AmDist, + { + self.inner + .data + .team + .exec_am_all_tg(am, Some(self.team_counters())) + } + fn exec_am_pe(&self, pe: usize, am: F) -> Self::SinglePeAmHandle + where + F: RemoteActiveMessage + LamellarAM + Serde + AmDist, + { + self.inner + .data + .team + .exec_am_pe_tg(pe, am, Some(self.team_counters())) + } + fn exec_am_local(&self, am: F) -> Self::LocalAmHandle + where + F: LamellarActiveMessage + LocalAM + 'static, + { + self.inner + .data + .team + .exec_am_local_tg(am, Some(self.team_counters())) } - fn wait_all(&self) { let mut temp_now = Instant::now(); // let mut first = true; @@ -1051,12 +1059,104 @@ impl LamellarArray for UnsafeArray { } } self.inner.data.task_group.wait_all(); - // println!("done in wait all {:?}",std::time::SystemTime::now()); } - + fn await_all(&self) -> impl Future + Send { + self.await_all() + } + fn barrier(&self) { + self.inner.data.team.barrier() + } + fn async_barrier(&self) -> BarrierHandle { + self.inner.data.team.async_barrier() + } + fn spawn(&self, f: F) -> LamellarTask + where + F: Future + Send + 'static, + F::Output: Send, + { + self.inner.data.team.scheduler.spawn_task(f) + } fn block_on(&self, f: F) -> F::Output { self.inner.data.team.scheduler.block_on(f) } + fn block_on_all(&self, iter: I) -> Vec<<::Item as Future>::Output> + where + I: IntoIterator, + ::Item: Future + Send + 'static, + <::Item as Future>::Output: Send, + { + self.inner.data.team.block_on_all(iter) + } +} + +impl LamellarArray for UnsafeArray { + fn team_rt(&self) -> Pin> { + self.inner.data.team.clone() + } + + // fn my_pe(&self) -> usize { + // self.inner.data.my_pe + // } + + // fn num_pes(&self) -> usize { + // self.inner.data.num_pes + // } + + fn len(&self) -> usize { + self.inner.size + } + + fn num_elems_local(&self) -> usize { + self.inner.num_elems_local() + } + + // fn barrier(&self) { + // self.inner.data.team.tasking_barrier(); + // } + + // fn wait_all(&self) 
{ + // let mut temp_now = Instant::now(); + // // let mut first = true; + // while self + // .inner + // .data + // .array_counters + // .outstanding_reqs + // .load(Ordering::SeqCst) + // > 0 + // || self.inner.data.req_cnt.load(Ordering::SeqCst) > 0 + // { + // // std::thread::yield_now(); + // // self.inner.data.team.flush(); + // self.inner.data.team.scheduler.exec_task(); //mmight as well do useful work while we wait + // if temp_now.elapsed().as_secs_f64() > config().deadlock_timeout { + // //|| first{ + // println!( + // "in array wait_all mype: {:?} cnt: {:?} {:?} {:?}", + // self.inner.data.team.world_pe, + // self.inner + // .data + // .array_counters + // .send_req_cnt + // .load(Ordering::SeqCst), + // self.inner + // .data + // .array_counters + // .outstanding_reqs + // .load(Ordering::SeqCst), + // self.inner.data.req_cnt.load(Ordering::SeqCst) + // ); + // temp_now = Instant::now(); + // // first = false; + // } + // } + // self.inner.data.task_group.wait_all(); + // // println!("done in wait all {:?}",std::time::SystemTime::now()); + // } + + // fn block_on(&self, f: F) -> F::Output { + // self.inner.data.team.scheduler.block_on(f) + // } //#[tracing::instrument(skip_all)] fn pe_and_offset_for_global_index(&self, index: usize) -> Option<(usize, usize)> { @@ -1285,7 +1385,7 @@ impl UnsafeArray { if std::thread::current().id() != *crate::MAIN_THREAD { let msg = format!(" [LAMELLAR WARNING] You are calling `UnsafeArray::blocking_reduce` from within an async context which may lead to deadlock, it is recommended that you use `reduce(...).await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() ); match config().blocking_call_warning { Some(val) if val => println!("{msg}"), diff --git a/src/array/unsafe/iteration/distributed.rs b/src/array/unsafe/iteration/distributed.rs index 732b9672..d8c4d751 100644 --- a/src/array/unsafe/iteration/distributed.rs +++ b/src/array/unsafe/iteration/distributed.rs @@ -101,7 +101,7 @@ macro_rules! consumer_impl { // let name = stringify!{$name}; // let msg = format!(" // [LAMELLAR WARNING] You are calling `blocking_{name}[_with_schedule]` from within an async context which may lead to deadlock, it is recommended that you use `{name}[_with_schedule]().await;` instead! - // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() // ); // if let Some(val) = config().blocking_call_warning { // if val { diff --git a/src/array/unsafe/iteration/local.rs b/src/array/unsafe/iteration/local.rs index 7ef57f5f..b2212ee8 100644 --- a/src/array/unsafe/iteration/local.rs +++ b/src/array/unsafe/iteration/local.rs @@ -69,7 +69,7 @@ macro_rules! consumer_impl { // let name = stringify!{$name}; // let msg = format!(" // [LAMELLAR WARNING] You are calling `blocking_{name}[_with_schedule]` from within an async context which may lead to deadlock, it is recommended that you use `{name}[_with_schedule]().await;` instead! 
- // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() // ); // if let Some(val) = config().blocking_call_warning { // if val { diff --git a/src/array/unsafe/rdma.rs b/src/array/unsafe/rdma.rs index 142a6149..c290b62f 100644 --- a/src/array/unsafe/rdma.rs +++ b/src/array/unsafe/rdma.rs @@ -99,7 +99,7 @@ impl UnsafeArray { }, pe: self.inner.data.my_pe, }; - reqs.push_back(self.exec_am_pe(pe, am)); + reqs.push_back(self.exec_am_pe_tg(pe, am)); } else { let am = UnsafeSmallPutAm { array: self.clone().into(), @@ -113,7 +113,7 @@ impl UnsafeArray { .to_vec() }, }; - reqs.push_back(self.exec_am_pe(pe, am)); + reqs.push_back(self.exec_am_pe_tg(pe, am)); } } ArrayRdmaCmd::GetAm => { @@ -219,7 +219,7 @@ impl UnsafeArray { data: unsafe { temp_memreg.to_base::().into() }, pe: self.inner.data.my_pe, }; - reqs.push_back(self.exec_am_pe(pe, am)); + reqs.push_back(self.exec_am_pe_tg(pe, am)); } else { let am = UnsafeSmallPutAm { array: self.clone().into(), @@ -234,7 +234,7 @@ impl UnsafeArray { .to_vec() }, }; - reqs.push_back(self.exec_am_pe(pe, am)); + reqs.push_back(self.exec_am_pe_tg(pe, am)); } if pe + 1 == num_pes { overflow += 1; @@ -1083,7 +1083,7 @@ impl LamellarAm for InitSmallGetAm { start_index: self.index, len: self.buf.len(), }; - reqs.push(self.array.exec_am_pe(pe, remote_am)); + reqs.push(self.array.exec_am_pe_tg(pe, remote_am)); } unsafe { match self.array.inner.distribution { diff --git a/src/barrier.rs b/src/barrier.rs index 6b771901..e15ded71 100644 --- a/src/barrier.rs +++ b/src/barrier.rs @@ -453,7 +453,7 @@ impl Barrier { // } #[pin_project] -pub(crate) struct BarrierHandle { +pub struct BarrierHandle { barrier_buf: Arc>>, arch: Arc, lamellae: Arc, diff --git a/src/darc.rs b/src/darc.rs index 1302f32c..1820974e 100644 --- a/src/darc.rs +++ b/src/darc.rs @@ -1370,7 +1370,7 @@ impl Darc { if std::thread::current().id() != *crate::MAIN_THREAD { let msg = format!(" [LAMELLAR WARNING] You are calling `Darc::blocking_into_localrw` from within an async context which may lead to deadlock, it is recommended that you use `into_localrw().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() ); if let Some(val) = config().blocking_call_warning { if val { @@ -1476,7 +1476,7 @@ impl Darc { if std::thread::current().id() != *crate::MAIN_THREAD { let msg = format!(" [LAMELLAR WARNING] You are calling `Darc::blocking_into_globalrw` from within an async context which may lead to deadlock, it is recommended that you use `into_globalrw().await;` instead! 
- Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() ); match config().blocking_call_warning { Some(val) if val => println!("{msg}"), diff --git a/src/darc/global_rw_darc.rs b/src/darc/global_rw_darc.rs index b74c4b49..f483b21d 100644 --- a/src/darc/global_rw_darc.rs +++ b/src/darc/global_rw_darc.rs @@ -762,7 +762,7 @@ impl GlobalRwDarc { if std::thread::current().id() != *crate::MAIN_THREAD { let msg = format!(" [LAMELLAR WARNING] You are calling `GlobalRwDarc::blocking_read` from within an async context which may lead to deadlock, it is recommended that you use `read().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() ); match config().blocking_call_warning { Some(val) if val => println!("{msg}"), @@ -826,7 +826,7 @@ impl GlobalRwDarc { if std::thread::current().id() != *crate::MAIN_THREAD { let msg = format!(" [LAMELLAR WARNING] You are calling `GlobalRwDarc::blocking_write` from within an async context which may lead to deadlock, it is recommended that you use `write().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() ); match config().blocking_call_warning { Some(val) if val => println!("{msg}"), @@ -903,7 +903,7 @@ impl GlobalRwDarc { if std::thread::current().id() != *crate::MAIN_THREAD { let msg = format!(" [LAMELLAR WARNING] You are calling `GlobalRwDarc::blocking_collective_write` from within an async context which may lead to deadlock, it is recommended that you use `collective_write().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() ); match config().blocking_call_warning { Some(val) if val => println!("{msg}"), @@ -1048,7 +1048,7 @@ impl GlobalRwDarc { if std::thread::current().id() != *crate::MAIN_THREAD { let msg = format!(" [LAMELLAR WARNING] You are calling `GlobalRwDarc::blocking_into_darc` from within an async context which may lead to deadlock, it is recommended that you use `into_darc().await;` instead! 
- Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() ); match config().blocking_call_warning { Some(val) if val => println!("{msg}"), @@ -1146,7 +1146,7 @@ impl GlobalRwDarc { if std::thread::current().id() != *crate::MAIN_THREAD { let msg = format!(" [LAMELLAR WARNING] You are calling `GlobalRwDarc::blocking_into_localrw` from within an async context which may lead to deadlock, it is recommended that you use `into_localrw().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() ); match config().blocking_call_warning { Some(val) if val => println!("{msg}"), diff --git a/src/darc/local_rw_darc.rs b/src/darc/local_rw_darc.rs index 379ffb80..3215eee0 100644 --- a/src/darc/local_rw_darc.rs +++ b/src/darc/local_rw_darc.rs @@ -170,7 +170,7 @@ impl LocalRwDarc { if std::thread::current().id() != *crate::MAIN_THREAD { let msg = format!(" [LAMELLAR WARNING] You are calling `LocalRwDarc::blocking_read` from within an async context which may lead to deadlock, it is recommended that you use `read().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() ); match config().blocking_call_warning { Some(val) if val => println!("{msg}"), @@ -276,7 +276,7 @@ impl LocalRwDarc { if std::thread::current().id() != *crate::MAIN_THREAD { let msg = format!(" [LAMELLAR WARNING] You are calling `LocalRwDarc::blocking_write` from within an async context which may lead to deadlock, it is recommended that you use `write().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() ); match config().blocking_call_warning { Some(val) if val => println!("{msg}"), @@ -456,7 +456,7 @@ impl LocalRwDarc { if std::thread::current().id() != *crate::MAIN_THREAD { let msg = format!(" [LAMELLAR WARNING] You are calling `LocalRwDarc::blocking_into_globalrw` from within an async context which may lead to deadlock, it is recommended that you use `into_globalrw().await;` instead! 
- Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() ); match config().blocking_call_warning { Some(val) if val => println!("{msg}"), @@ -575,7 +575,7 @@ impl LocalRwDarc { if std::thread::current().id() != *crate::MAIN_THREAD { let msg = format!(" [LAMELLAR WARNING] You are calling `LocalRwDarc::blocking_into_darc` from within an async context which may lead to deadlock, it is recommended that you use `into_darc().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture() + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() ); match config().blocking_call_warning { Some(val) if val => println!("{msg}"), diff --git a/src/lamellar_task_group.rs b/src/lamellar_task_group.rs index abe0e0ef..08552158 100644 --- a/src/lamellar_task_group.rs +++ b/src/lamellar_task_group.rs @@ -1,5 +1,6 @@ use crate::active_messaging::registered_active_message::{AmId, AMS_EXECS, AMS_IDS, AM_ID_START}; use crate::active_messaging::*; +use crate::barrier::BarrierHandle; use crate::env_var::config; use crate::lamellae::Des; use crate::lamellar_arch::LamellarArchRT; @@ -550,7 +551,7 @@ impl ActiveMessaging for LamellarTaskGroup { self.team.barrier(); } - fn async_barrier(&self) -> impl std::future::Future + Send { + fn async_barrier(&self) -> BarrierHandle { self.team.async_barrier() } @@ -676,11 +677,11 @@ impl LamellarTaskGroup { if let Some(val) = config().blocking_call_warning { if val { println!("[LAMELLAR WARNING] You are calling wait_all from within an async context, it is recommended that you use `await_all().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture()); + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); } } else { println!("[LAMELLAR WARNING] You are calling wait_all from within an async context, it is recommended that you use `await_all().await;` instead! 
- Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture()); + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); } exec_task = false; } diff --git a/src/lamellar_team.rs b/src/lamellar_team.rs index 6983750a..7ff125ad 100644 --- a/src/lamellar_team.rs +++ b/src/lamellar_team.rs @@ -1,6 +1,6 @@ use crate::active_messaging::handle::AmHandleInner; use crate::active_messaging::*; -use crate::barrier::Barrier; +use crate::barrier::{Barrier, BarrierHandle}; use crate::env_var::config; use crate::lamellae::{AllocationType, Lamellae, LamellaeComm, LamellaeRDMA}; use crate::lamellar_arch::{GlobalArch, IdError, LamellarArch, LamellarArchEnum, LamellarArchRT}; @@ -399,7 +399,7 @@ impl LamellarTeam { /// //do some work /// world.barrier(); //block until all PEs have entered the barrier ///``` - pub fn async_barrier(&self) -> impl std::future::Future + Send + '_ { + pub fn async_barrier(&self) -> BarrierHandle { assert!(self.panic.load(Ordering::SeqCst) == 0); self.team.async_barrier() @@ -518,7 +518,7 @@ impl ActiveMessaging for Arc { self.team.barrier(); } - fn async_barrier(&self) -> impl std::future::Future + Send { + fn async_barrier(&self) -> BarrierHandle { assert!(self.panic.load(Ordering::SeqCst) == 0); self.team.async_barrier() @@ -1371,11 +1371,11 @@ impl LamellarTeamRT { if let Some(val) = config().blocking_call_warning { if val { println!("[LAMELLAR WARNING] You are calling wait_all from within an async context, it is recommended that you use `await_all().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture()); + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); } } else { println!("[LAMELLAR WARNING] You are calling wait_all from within an async context, it is recommended that you use `await_all().await;` instead! 
- Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {:?}", std::backtrace::Backtrace::capture()); + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); } exec_task = false; } @@ -1433,6 +1433,21 @@ impl LamellarTeamRT { self.scheduler.block_on(f) } + pub(crate) fn block_on_all( + &self, + iter: I, + ) -> Vec<<::Item as Future>::Output> + where + I: IntoIterator, + ::Item: Future + Send + 'static, + <::Item as Future>::Output: Send, + { + assert!(self.panic.load(Ordering::SeqCst) == 0); + self.scheduler.block_on(join_all( + iter.into_iter().map(|task| self.scheduler.spawn_task(task)), + )) + } + //#[tracing::instrument(skip_all)] pub(crate) fn barrier(&self) { self.barrier.barrier(); @@ -1442,8 +1457,8 @@ impl LamellarTeamRT { pub(crate) fn tasking_barrier(&self) { self.barrier.tasking_barrier(); } - pub(crate) async fn async_barrier(&self) { - self.barrier.async_barrier().await; + pub(crate) fn async_barrier(&self) -> BarrierHandle { + self.barrier.barrier_handle() } pub(crate) fn flush(&self) { diff --git a/src/lamellar_world.rs b/src/lamellar_world.rs index 6d91eaf1..109b18d2 100644 --- a/src/lamellar_world.rs +++ b/src/lamellar_world.rs @@ -1,3 +1,4 @@ +use crate::barrier::BarrierHandle; use crate::lamellae::{create_lamellae, Backend, Lamellae, LamellaeComm, LamellaeInit}; use crate::lamellar_arch::LamellarArch; use crate::lamellar_env::LamellarEnv; @@ -81,7 +82,7 @@ impl ActiveMessaging for LamellarWorld { self.team.barrier(); } - fn async_barrier(&self) -> impl std::future::Future + Send { + fn async_barrier(&self) -> BarrierHandle { self.team.async_barrier() } diff --git a/tests/array/arithmetic_ops/fetch_div_test.rs b/tests/array/arithmetic_ops/fetch_div_test.rs index 17d136ff..569e67fe 100644 --- a/tests/array/arithmetic_ops/fetch_div_test.rs +++ b/tests/array/arithmetic_ops/fetch_div_test.rs @@ -2,7 +2,7 @@ use lamellar::array::prelude::*; macro_rules! initialize_array { (UnsafeArray,$array:ident,$init_val:ident) => { - let _ = unsafe { + unsafe { $array .dist_iter_mut() .for_each(move |x| *x = $init_val) From b4f68d61458a4d71c5c7b7e7d4198e70e093cc11 Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Thu, 25 Jul 2024 13:17:33 -0700 Subject: [PATCH 057/116] formatting --- examples/hello_world/hello_world_am.rs | 3 - examples/team_examples/custom_team_arch.rs | 3 +- src/array/iterator/distributed_iterator.rs | 99 +++++++++++----------- src/array/iterator/local_iterator.rs | 65 +++++++------- src/array/unsafe/iteration/distributed.rs | 20 ++--- src/array/unsafe/iteration/local.rs | 15 ++-- src/array/unsafe/operations.rs | 3 +- 7 files changed, 103 insertions(+), 105 deletions(-) diff --git a/examples/hello_world/hello_world_am.rs b/examples/hello_world/hello_world_am.rs index 301a5ef6..1e3ea685 100644 --- a/examples/hello_world/hello_world_am.rs +++ b/examples/hello_world/hello_world_am.rs @@ -37,6 +37,3 @@ fn main() { //wait for the request to complete world.block_on(request); } //when world drops there is an implicit world.barrier() that occurs - - - \ No newline at end of file diff --git a/examples/team_examples/custom_team_arch.rs b/examples/team_examples/custom_team_arch.rs index 72d3c00f..74957329 100644 --- a/examples/team_examples/custom_team_arch.rs +++ b/examples/team_examples/custom_team_arch.rs @@ -79,7 +79,8 @@ impl LamellarArch for BlockStridedArch { let block = parent_pe / self.block_size; let start_block = self.start_pe / self.block_size; let remainder = parent_pe % self.block_size; - if block >= start_block && (block - start_block) % self.stride == 0 + if block >= start_block + && (block - start_block) % self.stride == 0 && self.start_pe <= *parent_pe && *parent_pe <= self.end_pe { diff --git a/src/array/iterator/distributed_iterator.rs b/src/array/iterator/distributed_iterator.rs index a0f86480..5d64b18c 100644 --- a/src/array/iterator/distributed_iterator.rs +++ b/src/array/iterator/distributed_iterator.rs @@ -36,14 +36,11 @@ use take::*; pub(crate) use consumer::*; +use crate::active_messaging::SyncSend; use crate::array::iterator::{private::*, Schedule}; -use crate::array::{ - operations::ArrayOps, AsyncTeamFrom, Distribution, InnerArray, - LamellarArray, -}; +use crate::array::{operations::ArrayOps, AsyncTeamFrom, Distribution, InnerArray, LamellarArray}; use crate::memregion::Dist; use crate::LamellarTeamRT; -use crate::active_messaging::SyncSend; use futures_util::Future; use paste::paste; @@ -126,37 +123,37 @@ pub trait DistIteratorLauncher: InnerArray { [()] ); consumer_impl!( - for_each_async(iter: &I, op: F); + for_each_async(iter: &I, op: F); [DistIterForEachHandle]; [I: DistributedIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, Fut: Future + Send + 'static]; [()]); consumer_impl!( - reduce(iter: &I, op: F); + reduce(iter: &I, op: F); [DistIterReduceHandle]; [I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static]; [Option]); consumer_impl!( - collect(iter: &I, d: Distribution); + collect(iter: &I, d: Distribution); [DistIterCollectHandle]; [I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static]; [A]); consumer_impl!( - collect_async(iter: &I, d: Distribution); + collect_async(iter: &I, d: Distribution); [DistIterCollectHandle]; [I: DistributedIterator + 'static, I::Item: Future + Send + 'static,B: Dist + ArrayOps,A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static,]; [A]); consumer_impl!( - count(iter: &I); + count(iter: &I); [DistIterCountHandle]; [I: DistributedIterator + 'static ]; [usize]); consumer_impl!( - sum(iter: &I); + sum(iter: &I); [DistIterSumHandle]; [I: 
DistributedIterator + 'static, I::Item: Dist + ArrayOps + std::iter::Sum, ]; [I::Item]); @@ -191,7 +188,6 @@ pub trait DistIteratorLauncher: InnerArray { fn team(&self) -> Pin> { self.as_inner().team() } - } /// An interface for dealing with distributed iterators (intended as a parallel and distributed version of the standard iterator trait) @@ -363,8 +359,6 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { Monotonic::new(self, 0) } - - /// Calls a closure on each element of a Distributed Iterator in parallel and distributed on each PE (which owns data of the iterated array). /// /// Calling this function invokes an implicit barrier across all PEs in the Array @@ -402,7 +396,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { // /// This call utilizes the [Schedule::Static][crate::array::iterator::Schedule] policy. // /// // /// This function returns a future which can be used to poll for completion of the iteration. - // /// # Note + // /// # Note // /// Calling this function launches the iteration regardless of if the returned future is used or not. // /// // /// # Examples @@ -416,7 +410,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { // /// .dist_iter() // /// .for_each(move |elem| println!("{:?} {elem}",std::thread::current().id())); // /// array.wait_all(); //wait for the iteration to complete - // /// + // /// // ///``` // #[must_use = "The iteration has already been launched. Await this future to wait for completion and retrieve the result. // You can use 'let _ = spawn_[iterator]` to supress the warning, but likely will want to also call '.wait_all()' at @@ -448,7 +442,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { // /// .blocking_for_each(move |elem| println!("{:?} {elem}",std::thread::current().id())) // /// ); // ///``` - // fn blocking_for_each(&self, op: F) + // fn blocking_for_each(&self, op: F) // where // F: Fn(Self::Item) + SyncSend + Clone + 'static, // { @@ -485,7 +479,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// fut.await; /// } ///``` - #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] fn for_each_async(&self, op: F) -> DistIterForEachHandle where F: Fn(Self::Item) -> Fut + SyncSend + Clone + 'static, @@ -503,7 +497,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { // /// Each thread will only drive a single future at a time. // /// // /// This function returns a future which can be used to poll for completion of the iteration. - // /// # Note + // /// # Note // /// Calling this function launches the iteration regardless of if the returned future is used or not. // /// // /// # Examples @@ -588,7 +582,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// /// array.block_on(array.dist_iter().for_each_with_schedule(Schedule::WorkStealing, |elem| println!("{:?} {elem}",std::thread::current().id()))); ///``` - #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. 
Either await the returned future, or call 'spawn()' or 'block()' on it "] fn for_each_with_schedule(&self, sched: Schedule, op: F) -> DistIterForEachHandle where F: Fn(Self::Item) + SyncSend + Clone + 'static, @@ -601,7 +595,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { // /// Calling this function invokes an implicit barrier across all PEs in the Array // /// // /// This function returns a future which can be used to poll for completion of the iteration. - // /// # Note + // /// # Note // /// Calling this function launches the iteration regardless of if the returned future is used or not. // /// // /// # Examples @@ -672,7 +666,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// }); /// array.block_on(iter); ///``` - #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] fn for_each_async_with_schedule(&self, sched: Schedule, op: F) -> DistIterForEachHandle where F: Fn(Self::Item) -> Fut + SyncSend + Clone + 'static, @@ -691,7 +685,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { // /// Each thread will only drive a single future at a time. // /// // /// This function returns a future which can be used to poll for completion of the iteration. - // /// # Note + // /// # Note // /// Calling this function launches the iteration regardless of if the returned future is used or not. // /// // /// # Examples @@ -766,7 +760,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// let req = array.dist_iter().reduce(|acc,elem| acc+elem); /// let sum = array.block_on(req); //wait on the collect request to get the new array ///``` - #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] fn reduce(&self, op: F) -> DistIterReduceHandle where // &'static Self: LocalIterator + 'static, @@ -781,7 +775,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { // /// This function returns a future which needs to be driven to completion to retrieve the reduced value. // /// // /// This call utilizes the [Schedule::Static][crate::array::iterator::Schedule] policy. - // /// # Note + // /// # Note // /// Calling this function launches the iteration regardless of if the returned future is used or not. // /// # Examples // ///``` @@ -845,7 +839,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// let req = array.dist_iter().reduce_with_schedule(Schedule::Static,|acc,elem| acc+elem); /// let sum = array.block_on(req); //wait on the collect request to get the new array ///``` - #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. 
Either await the returned future, or call 'spawn()' or 'block()' on it "] fn reduce_with_schedule(&self, sched: Schedule, op: F) -> DistIterReduceHandle where // &'static Self: LocalIterator + 'static, @@ -928,7 +922,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// .collect::>(Distribution::Block); /// let new_array = array.block_on(req); //wait on the collect request to get the new array ///``` - #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] fn collect(&self, d: Distribution) -> DistIterCollectHandle where // &'static Self: DistributedIterator + 'static, @@ -971,7 +965,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { // self.array().blocking_collect(self, d) // } - /// Collects the elements of the distributed iterator into a new LamellarArray, using the provided [Schedule][crate::array::iterator::Schedule] policy + /// Collects the elements of the distributed iterator into a new LamellarArray, using the provided [Schedule][crate::array::iterator::Schedule] policy /// /// Calling this function invokes an implicit barrier across all PEs in the Array. /// @@ -996,17 +990,21 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// .collect::>(Distribution::Block); /// let new_array = array.block_on(req); //wait on the collect request to get the new array ///``` - #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] - fn collect_with_schedule(&self,sched: Schedule, d: Distribution) -> DistIterCollectHandle + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] + fn collect_with_schedule( + &self, + sched: Schedule, + d: Distribution, + ) -> DistIterCollectHandle where // &'static Self: DistributedIterator + 'static, Self::Item: Dist + ArrayOps, A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { - self.array().collect_with_schedule(sched,self, d) + self.array().collect_with_schedule(sched, self, d) } - // /// Collects the elements of the distributed iterator into a new LamellarArray, using the provided [Schedule][crate::array::iterator::Schedule] policy + // /// Collects the elements of the distributed iterator into a new LamellarArray, using the provided [Schedule][crate::array::iterator::Schedule] policy // /// // /// Calling this function invokes an implicit barrier across all PEs in the Array. // /// @@ -1075,7 +1073,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// .collect_async::,_>(Distribution::Cyclic); /// let _new_array = array.block_on(req); ///``` - #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. 
Either await the returned future, or call 'spawn()' or 'block()' on it "] fn collect_async(&self, d: Distribution) -> DistIterCollectHandle where // &'static Self: DistributedIterator + 'static, @@ -1130,7 +1128,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { // self.array().blocking_collect_async(self, d) // } - /// Collects the awaited elements of the distributed iterator into a new LamellarArray, using the provided [Schedule][crate::array::iterator::Schedule] policy + /// Collects the awaited elements of the distributed iterator into a new LamellarArray, using the provided [Schedule][crate::array::iterator::Schedule] policy /// /// Calling this function invokes an implicit barrier across all PEs in the Array. /// @@ -1167,8 +1165,12 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// .collect_async_with_schedule::,_>(Scheduler::Dynamic, Distribution::Cyclic); /// let _new_array = array.block_on(req); ///``` - #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] - fn collect_async_with_schedule(&self, sched: Schedule, d: Distribution) -> DistIterCollectHandle + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] + fn collect_async_with_schedule( + &self, + sched: Schedule, + d: Distribution, + ) -> DistIterCollectHandle where // &'static Self: DistributedIterator + 'static, T: Dist + ArrayOps, @@ -1178,7 +1180,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { self.array().collect_async_with_schedule(sched, self, d) } - // /// Collects the awaited elements of the distributed iterator into a new LamellarArray,using the provided [Schedule][crate::array::iterator::Schedule] policy + // /// Collects the awaited elements of the distributed iterator into a new LamellarArray,using the provided [Schedule][crate::array::iterator::Schedule] policy // /// // /// Calling this function invokes an implicit barrier across all PEs in the Array. // /// @@ -1223,7 +1225,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { // } /// Counts the number of the elements of the distriubted iterator - /// + /// /// Calling this function invokes an implicit barrier and distributed reduction across all PEs in the Array. /// /// This function returns a future which needs to be driven to completion to retrieve count. @@ -1239,7 +1241,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// let req = array.dist_iter().filter(|elem| elem < 10).count(); /// let cnt = array.block_on(req); //wait on the collect request to get the new array ///``` - #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] fn count(&self) -> DistIterCountHandle { self.array().count(self) } @@ -1247,7 +1249,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { // /// Counts the number of the elements of the distributed iterator // /// // /// Calling this function invokes an implicit barrier and distributed reduction across all PEs in the Array. - // /// + // /// // /// This function returns the count upon completion. 
// /// // /// # Examples @@ -1264,7 +1266,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { // } /// Counts the number of the elements of the distriubted iterator, using the provided [Schedule][crate::array::iterator::Schedule] policy - /// + /// /// Calling this function invokes an implicit barrier and distributed reduction across all PEs in the Array. /// /// This function returns a future which needs to be driven to completion to retrieve count. @@ -1284,11 +1286,10 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { self.array().count_with_schedule(sched, self) } - // /// Counts the number of the elements of the distributed iterator, using the provided [Schedule][crate::array::iterator::Schedule] policy // /// // /// Calling this function invokes an implicit barrier and distributed reduction across all PEs in the Array. - // /// + // /// // /// This function returns the count upon completion. // /// // /// # Examples @@ -1307,7 +1308,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// Sums the elements of the distributed iterator. /// /// Takes each element, adds them together, and returns the result. - /// + /// /// Calling this function invokes an implicit barrier and distributed reduction across all PEs in the Array. /// /// An empty iterator returns the zero value of the type. @@ -1325,7 +1326,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// let req = array.dist_iter().sum(); /// let sum = array.block_on(req); //wait on the collect request to get the new array ///``` - #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] fn sum(&self) -> DistIterSumHandle where Self::Item: Dist + ArrayOps + std::iter::Sum, @@ -1336,7 +1337,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { // /// Sums the elements of the distributed iterator. // /// // /// Takes each element, adds them together, and returns the result. - // /// + // /// // /// Calling this function invokes an implicit barrier and distributed reduction across all PEs in the Array. // /// // /// An empty iterator returns the zero value of the type. @@ -1362,7 +1363,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// Sums the elements of the distributed iterator, using the specified [Schedule][crate::array::iterator::Schedule] policy /// /// Takes each element, adds them together, and returns the result. - /// + /// /// Calling this function invokes an implicit barrier and distributed reduction across all PEs in the Array. /// /// An empty iterator returns the zero value of the type. @@ -1380,7 +1381,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// let req = array.dist_iter().sum_with_schedule(Schedule::Guided); /// let sum = array.block_on(req); //wait on the collect request to get the new array ///``` - #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. 
Either await the returned future, or call 'spawn()' or 'block()' on it "] fn sum_with_schedule(&self, sched: Schedule) -> DistIterSumHandle where Self::Item: Dist + ArrayOps + std::iter::Sum, @@ -1391,7 +1392,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { // /// Sums the elements of the distributed iterator, using the specified [Schedule][crate::array::iterator::Schedule] policy // /// // /// Takes each element, adds them together, and returns the result. - // /// + // /// // /// Calling this function invokes an implicit barrier and distributed reduction across all PEs in the Array. // /// // /// An empty iterator returns the zero value of the type. diff --git a/src/array/iterator/local_iterator.rs b/src/array/iterator/local_iterator.rs index 126d1e75..889f5a4d 100644 --- a/src/array/iterator/local_iterator.rs +++ b/src/array/iterator/local_iterator.rs @@ -35,9 +35,7 @@ use zip::*; pub(crate) use consumer::*; use crate::array::iterator::{private::*, Schedule}; -use crate::array::{ - operations::ArrayOps, AsyncTeamFrom, Distribution, InnerArray, LamellarArray, -}; +use crate::array::{operations::ArrayOps, AsyncTeamFrom, Distribution, InnerArray, LamellarArray}; use crate::memregion::Dist; use crate::LamellarTeamRT; @@ -45,10 +43,10 @@ use crate::active_messaging::SyncSend; use enum_dispatch::enum_dispatch; use futures_util::Future; +use paste::paste; use std::marker::PhantomData; use std::pin::Pin; use std::sync::Arc; -use paste::paste; macro_rules! consumer_impl { ($name:ident<$($generics:ident),*>($($arg:ident : $arg_ty:ty),*); [$($return_type: tt)*]; [$($bounds:tt)+] ; [$(-> $($blocking_ret:tt)*)? ]) => { @@ -98,7 +96,6 @@ macro_rules! consumer_impl { #[doc(hidden)] #[enum_dispatch] pub trait LocalIteratorLauncher: InnerArray { - consumer_impl!( for_each(iter: &I, op: F); [LocalIterForEachHandle]; @@ -106,37 +103,37 @@ pub trait LocalIteratorLauncher: InnerArray { [] ); consumer_impl!( - for_each_async(iter: &I, op: F); + for_each_async(iter: &I, op: F); [LocalIterForEachHandle]; [I: LocalIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, Fut: Future + Send + 'static]; []); consumer_impl!( - reduce(iter: &I, op: F); + reduce(iter: &I, op: F); [LocalIterReduceHandle]; [I: LocalIterator + 'static, I::Item: SyncSend + Copy, F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static]; [-> Option]); consumer_impl!( - collect(iter: &I, d: Distribution); + collect(iter: &I, d: Distribution); [LocalIterCollectHandle]; [I: LocalIterator + 'static, I::Item: Dist + ArrayOps, A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static]; [-> A]); consumer_impl!( - collect_async(iter: &I, d: Distribution); + collect_async(iter: &I, d: Distribution); [LocalIterCollectHandle]; [I: LocalIterator + 'static, I::Item: Future + Send + 'static,B: Dist + ArrayOps,A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static,]; [-> A]); consumer_impl!( - count(iter: &I); + count(iter: &I); [LocalIterCountHandle]; [I: LocalIterator + 'static ]; [-> usize]); consumer_impl!( - sum(iter: &I); + sum(iter: &I); [LocalIterSumHandle]; [I: LocalIterator + 'static, I::Item: SyncSend + std::iter::Sum + for<'a> std::iter::Sum<&'a I::Item> , ]; [-> I::Item]); @@ -358,7 +355,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// .for_each(move |elem| println!("{:?} {elem}",std::thread::current().id())) /// ); ///``` - #[must_use = "this iteration adapter is lazy and does nothing unless awaited. 
Either await the returned future, or call 'spawn()' or 'block()' on it."] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] fn for_each(&self, op: F) -> LocalIterForEachHandle where F: Fn(Self::Item) + SyncSend + Clone + 'static, @@ -379,11 +376,11 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { // /// let world = LamellarWorldBuilder::new().build(); // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); // /// - // /// + // /// // /// array // /// .local_iter() // /// .blocking_for_each(move |elem| println!("{:?} {elem}",std::thread::current().id())); - // /// + // /// // ///``` // fn blocking_for_each(&self, op: F) // where @@ -408,7 +405,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// array.local_iter().for_each_with_schedule(Schedule::WorkStealing, |elem| println!("{:?} {elem}",std::thread::current().id())); /// array.wait_all(); ///``` - #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] fn for_each_with_schedule(&self, sched: Schedule, op: F) -> LocalIterForEachHandle where F: Fn(Self::Item) + SyncSend + Clone + 'static, @@ -429,7 +426,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { // /// // /// array.local_iter().blocking_for_each_with_schedule(Schedule::WorkStealing, |elem| println!("{:?} {elem}",std::thread::current().id())); // ///``` - // fn blocking_for_each_with_schedule(&self, sched: Schedule, op: F) + // fn blocking_for_each_with_schedule(&self, sched: Schedule, op: F) // where // F: Fn(Self::Item) + SyncSend + Clone + 'static, // { @@ -467,7 +464,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// fut.await; /// } ///``` - #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] fn for_each_async(&self, op: F) -> LocalIterForEachHandle where F: Fn(Self::Item) -> Fut + SyncSend + Clone + 'static, @@ -504,7 +501,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { // /// fut.await; // /// } // ///``` - // fn blocking_for_each_async(&self, op: F) + // fn blocking_for_each_async(&self, op: F) // where // F: Fn(Self::Item) -> Fut + SyncSend + Clone + 'static, // Fut: Future + Send + 'static, @@ -512,7 +509,6 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { // self.array().blocking_for_each_async(self, op) // } - /// Calls a closure on each element of a Local Iterator in parallel on the calling PE (the PE must have some local data of the array) using the specififed [Schedule][crate::array::iterator::Schedule] policy. /// /// The supplied closure must return a future. @@ -536,7 +532,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// }); /// array.wait_all(); ///``` - #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. 
Either await the returned future, or call 'spawn()' or 'block()' on it."] fn for_each_async_with_schedule(&self, sched: Schedule, op: F) -> LocalIterForEachHandle where F: Fn(Self::Item) -> Fut + SyncSend + Clone + 'static, @@ -588,7 +584,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// let req = array.local_iter().reduce(|acc,elem| acc+elem); /// let sum = array.block_on(req); //wait on the collect request to get the new array ///``` - #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] fn reduce(&self, op: F) -> LocalIterReduceHandle where // &'static Self: LocalIterator + 'static, @@ -635,7 +631,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// let req = array.local_iter().reduce_with_schedule(Schedule::Chunk(10),|acc,elem| acc+elem); /// let sum = array.block_on(req); //wait on the collect request to get the new array ///``` - #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] fn reduce_with_schedule( &self, sched: Schedule, @@ -691,7 +687,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// let req = array.local_iter().map(elem.load()).filter(|elem| elem % 2 == 0).collect::>(Distribution::Cyclic); /// let new_array = array.block_on(req); ///``` - #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] fn collect(&self, d: Distribution) -> LocalIterCollectHandle where // &'static Self: LocalIterator + 'static, @@ -740,7 +736,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// let req = array.local_iter().map(elem.load()).filter(|elem| elem % 2 == 0).collect_with_schedule::>(Scheduler::WorkStealing,Distribution::Cyclic); /// let new_array = array.block_on(req); ///``` - #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] fn collect_with_schedule( &self, sched: Schedule, @@ -819,7 +815,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// .collect_async::,_>(Distribution::Cyclic); /// let _new_array = array.block_on(req); ///``` - #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. 
Either await the returned future, or call 'spawn()' or 'block()' on it."] fn collect_async(&self, d: Distribution) -> LocalIterCollectHandle where // &'static Self: DistributedIterator + 'static, @@ -874,7 +870,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { // self.array().blocking_collect_async(self, d) // } - /// Collects the awaited elements of the local iterator into a new LamellarArray, using the provided [Schedule][crate::array::iterator::Schedule] policy + /// Collects the awaited elements of the local iterator into a new LamellarArray, using the provided [Schedule][crate::array::iterator::Schedule] policy /// /// Calling this function invokes an implicit barrier across all PEs in the Array. /// @@ -911,8 +907,12 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// .collect_async_with_schedule::,_>(Scheduler::Dynamic, Distribution::Cyclic); /// let _new_array = array.block_on(req); ///``` - #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] - fn collect_async_with_schedule(&self, sched: Schedule, d: Distribution) -> LocalIterCollectHandle + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] + fn collect_async_with_schedule( + &self, + sched: Schedule, + d: Distribution, + ) -> LocalIterCollectHandle where // &'static Self: DistributedIterator + 'static, T: Dist + ArrayOps, @@ -922,7 +922,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { self.array().collect_async_with_schedule(sched, self, d) } - // /// Collects the awaited elements of the local iterator into a new LamellarArray,using the provided [Schedule][crate::array::iterator::Schedule] policy + // /// Collects the awaited elements of the local iterator into a new LamellarArray,using the provided [Schedule][crate::array::iterator::Schedule] policy // /// // /// Calling this function invokes an implicit barrier across all PEs in the Array. // /// @@ -981,7 +981,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// let req = array.local_iter().count(); /// let cnt = array.block_on(req); ///``` - #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] fn count(&self) -> LocalIterCountHandle { self.array().count(self) } @@ -1018,7 +1018,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// let req = array.local_iter().count_with_schedule(Schedule::Dynamic); /// let cnt = array.block_on(req); ///``` - #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] fn count_with_schedule(&self, sched: Schedule) -> LocalIterCountHandle { self.array().count_with_schedule(sched, self) } @@ -1040,7 +1040,6 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { // self.array().blocking_count_with_schedule(sched, self) // } - /// Sums the elements of the local iterator. /// /// Takes each element, adds them together, and returns the result. 
@@ -1060,7 +1059,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// let req = array.local_iter().sum(); /// let sum = array.block_on(req); //wait on the collect request to get the new array ///``` - #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] + #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] fn sum(&self) -> LocalIterSumHandle where Self::Item: SyncSend + std::iter::Sum + for<'a> std::iter::Sum<&'a Self::Item>, diff --git a/src/array/unsafe/iteration/distributed.rs b/src/array/unsafe/iteration/distributed.rs index d8c4d751..ce281b56 100644 --- a/src/array/unsafe/iteration/distributed.rs +++ b/src/array/unsafe/iteration/distributed.rs @@ -1,9 +1,9 @@ use crate::active_messaging::SyncSend; use crate::array::iterator::distributed_iterator::*; use crate::array::iterator::private::Sealed; +use crate::array::iterator::Schedule; use crate::array::r#unsafe::{UnsafeArray, UnsafeArrayInner}; use crate::array::{ArrayOps, AsyncTeamFrom, Distribution, InnerArray}; -use crate::array::iterator::Schedule; use crate::lamellar_team::LamellarTeamRT; use crate::memregion::Dist; @@ -100,7 +100,7 @@ macro_rules! consumer_impl { // if std::thread::current().id() != *crate::MAIN_THREAD { // let name = stringify!{$name}; // let msg = format!(" - // [LAMELLAR WARNING] You are calling `blocking_{name}[_with_schedule]` from within an async context which may lead to deadlock, it is recommended that you use `{name}[_with_schedule]().await;` instead! + // [LAMELLAR WARNING] You are calling `blocking_{name}[_with_schedule]` from within an async context which may lead to deadlock, it is recommended that you use `{name}[_with_schedule]().await;` instead! // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() // ); // if let Some(val) = config().blocking_call_warning { @@ -145,9 +145,9 @@ impl DistIteratorLauncher for UnsafeArrayInner { Some(self.subarray_index_from_local(index * chunk_size)? 
/ chunk_size) } } - + consumer_impl!( - for_each(iter: &I, op: F); + for_each(iter: &I, op: F); [DistIterForEachHandle]; [I: DistributedIterator + 'static, F: Fn(I::Item) + SyncSend + Clone + 'static]; [ @@ -159,7 +159,7 @@ impl DistIteratorLauncher for UnsafeArrayInner { [()]); consumer_impl!( - for_each_async(iter: &I, op: F); + for_each_async(iter: &I, op: F); [DistIterForEachHandle]; [I: DistributedIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, Fut: Future + Send + 'static]; [ @@ -172,7 +172,7 @@ impl DistIteratorLauncher for UnsafeArrayInner { ); consumer_impl!( - reduce( iter: &I, op: F); + reduce( iter: &I, op: F); [DistIterReduceHandle]; [I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static]; [ @@ -184,7 +184,7 @@ impl DistIteratorLauncher for UnsafeArrayInner { [Option]); consumer_impl!( - collect( iter: &I, d: Distribution); + collect( iter: &I, d: Distribution); [DistIterCollectHandle]; [I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static,]; [ @@ -196,7 +196,7 @@ impl DistIteratorLauncher for UnsafeArrayInner { ]; [A]); consumer_impl!( - collect_async( iter: &I, d: Distribution); + collect_async( iter: &I, d: Distribution); [DistIterCollectHandle]; [I: DistributedIterator + 'static, I::Item: Future + Send + 'static,B: Dist + ArrayOps,A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static,]; [ @@ -209,7 +209,7 @@ impl DistIteratorLauncher for UnsafeArrayInner { [A]); consumer_impl!( - count( iter: &I); + count( iter: &I); [DistIterCountHandle]; [I: DistributedIterator + 'static ]; [ @@ -220,7 +220,7 @@ impl DistIteratorLauncher for UnsafeArrayInner { [usize]); consumer_impl!( - sum(iter: &I); + sum(iter: &I); [DistIterSumHandle]; [I: DistributedIterator + 'static, I::Item: Dist + ArrayOps + std::iter::Sum, ]; [ diff --git a/src/array/unsafe/iteration/local.rs b/src/array/unsafe/iteration/local.rs index b2212ee8..53cc482f 100644 --- a/src/array/unsafe/iteration/local.rs +++ b/src/array/unsafe/iteration/local.rs @@ -10,9 +10,9 @@ use crate::memregion::Dist; use core::marker::PhantomData; use futures_util::Future; +use paste::paste; use std::pin::Pin; use std::sync::Arc; -use paste::paste; impl LocalIteratorLauncher for UnsafeArray {} @@ -68,7 +68,7 @@ macro_rules! consumer_impl { // if std::thread::current().id() != *crate::MAIN_THREAD { // let name = stringify!{$name}; // let msg = format!(" - // [LAMELLAR WARNING] You are calling `blocking_{name}[_with_schedule]` from within an async context which may lead to deadlock, it is recommended that you use `{name}[_with_schedule]().await;` instead! + // [LAMELLAR WARNING] You are calling `blocking_{name}[_with_schedule]` from within an async context which may lead to deadlock, it is recommended that you use `{name}[_with_schedule]().await;` instead! 
// Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() // ); // if let Some(val) = config().blocking_call_warning { @@ -138,9 +138,8 @@ impl LocalIteratorLauncher for UnsafeArrayInner { [] ); - consumer_impl!( - reduce( iter: &I, op: F); + reduce( iter: &I, op: F); [LocalIterReduceHandle]; [I: LocalIterator + 'static, I::Item: SyncSend + Copy, F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static]; [ @@ -153,7 +152,7 @@ impl LocalIteratorLauncher for UnsafeArrayInner { ); consumer_impl!( - collect( iter: &I, d: Distribution); + collect( iter: &I, d: Distribution); [LocalIterCollectHandle]; [I: LocalIterator + 'static, I::Item: Dist + ArrayOps, A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static,]; [ @@ -167,7 +166,7 @@ impl LocalIteratorLauncher for UnsafeArrayInner { ); consumer_impl!( - collect_async( iter: &I, d: Distribution); + collect_async( iter: &I, d: Distribution); [LocalIterCollectHandle]; [I: LocalIterator + 'static, I::Item: Future + Send + 'static,B: Dist + ArrayOps,A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static,]; [ @@ -181,7 +180,7 @@ impl LocalIteratorLauncher for UnsafeArrayInner { ); consumer_impl!( - count( iter: &I); + count( iter: &I); [LocalIterCountHandle]; [I: LocalIterator + 'static ]; [ @@ -193,7 +192,7 @@ impl LocalIteratorLauncher for UnsafeArrayInner { ); consumer_impl!( - sum(iter: &I); + sum(iter: &I); [LocalIterSumHandle]; [I: LocalIterator + 'static, I::Item: SyncSend + std::iter::Sum + for<'a> std::iter::Sum<&'a I::Item> , ]; [ diff --git a/src/array/unsafe/operations.rs b/src/array/unsafe/operations.rs index 0ef8d9ee..d43087f4 100644 --- a/src/array/unsafe/operations.rs +++ b/src/array/unsafe/operations.rs @@ -687,7 +687,8 @@ impl UnsafeArray { IndexSize::U64 => std::mem::size_of::>(), IndexSize::Usize => std::mem::size_of::>(), }; - let num_per_batch = (config().am_size_threshold as f32 / idx_val_bytes as f32).ceil() as usize; + let num_per_batch = + (config().am_size_threshold as f32 / idx_val_bytes as f32).ceil() as usize; let bytes_per_batch = num_per_batch * idx_val_bytes; let num_pes = self.inner.data.team.num_pes(); From ec59bdd502c4ca6f38fd0112bee92c2810573b99 Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Thu, 25 Jul 2024 13:33:20 -0700 Subject: [PATCH 058/116] formatting --- src/active_messaging/handle.rs | 26 +++++++++++++++++++++++++- src/array.rs | 11 +++++++++++ src/array/handle.rs | 25 +++++++++++++++++++++++-- 3 files changed, 59 insertions(+), 3 deletions(-) diff --git a/src/active_messaging/handle.rs b/src/active_messaging/handle.rs index 55991064..b70aef62 100644 --- a/src/active_messaging/handle.rs +++ b/src/active_messaging/handle.rs @@ -17,7 +17,7 @@ use crate::{ lamellae::Des, lamellar_request::{InternalResult, LamellarRequest, LamellarRequestAddResult}, memregion::one_sided::MemRegionHandleInner, - scheduler::Scheduler, + scheduler::{LamellarTask, Scheduler}, Darc, LamellarArchRT, }; @@ -122,6 +122,14 @@ impl AmHandle { } } } + + pub fn spawn(self) -> LamellarTask { + self.inner.scheduler.spawn_task(self) + } + + pub fn block(self) -> T { + self.inner.scheduler.block_on(self) + } } impl LamellarRequest for AmHandle { @@ -209,6 +217,14 @@ impl LocalAmHandle { } } } + + pub fn spawn(self) -> LamellarTask { + self.inner.scheduler.spawn_task(self) + } + + pub fn block(self) -> T { + self.inner.scheduler.block_on(self) + } } impl From> for AmHandle { @@ -364,6 +380,14 @@ impl MultiAmHandle { } } } + + pub fn spawn(self) -> LamellarTask> { + self.inner.scheduler.spawn_task(self) + } + + pub fn block(self) -> Vec { + self.inner.scheduler.block_on(self) + } } impl LamellarRequest for MultiAmHandle { diff --git a/src/array.rs b/src/array.rs index e6eba6de..e82663ec 100644 --- a/src/array.rs +++ b/src/array.rs @@ -641,6 +641,17 @@ impl LamellarByteArray { LamellarByteArray::GlobalLockArray(_) => std::any::TypeId::of::(), } } + pub(crate) fn team(&self) -> &Pin> { + match self { + LamellarByteArray::UnsafeArray(array) => array.team(), + LamellarByteArray::ReadOnlyArray(array) => array.team(), + LamellarByteArray::AtomicArray(array) => array.team(), + LamellarByteArray::NativeAtomicArray(array) => array.team(), + LamellarByteArray::GenericAtomicArray(array) => array.team(), + LamellarByteArray::LocalLockArray(array) => array.team(), + LamellarByteArray::GlobalLockArray(array) => array.team(), + } + } } impl crate::active_messaging::DarcSerde for LamellarReadArray { diff --git a/src/array/handle.rs b/src/array/handle.rs index 210058d1..56eae47d 100644 --- a/src/array/handle.rs +++ b/src/array/handle.rs @@ -11,15 +11,26 @@ use crate::{ active_messaging::{AmHandle, LocalAmHandle}, array::LamellarByteArray, lamellar_request::LamellarRequest, + scheduler::LamellarTask, Dist, OneSidedMemoryRegion, RegisteredMemoryRegion, }; /// a task handle for an array rdma (put/get) operation pub struct ArrayRdmaHandle { - pub(crate) _array: LamellarByteArray, //prevents prematurely performing a local drop + pub(crate) array: LamellarByteArray, //prevents prematurely performing a local drop pub(crate) reqs: VecDeque>, } +impl ArrayRdmaHandle { + pub fn spawn(self) -> LamellarTask<()> { + self.array.team().spawn_task(self) + } + + pub fn block(self) -> () { + self.array.team().block_on(self) + } +} + impl LamellarRequest for ArrayRdmaHandle { fn blocking_wait(mut self) -> Self::Output { for req in self.reqs.drain(0..) 
{ @@ -56,11 +67,21 @@ impl Future for ArrayRdmaHandle { /// a task handle for an array rdma 'at' operation #[pin_project] pub struct ArrayRdmaAtHandle { - pub(crate) _array: LamellarByteArray, //prevents prematurely performing a local drop + pub(crate) array: LamellarByteArray, //prevents prematurely performing a local drop pub(crate) req: Option>, pub(crate) buf: OneSidedMemoryRegion, } +impl ArrayRdmaAtHandle { + pub fn spawn(self) -> LamellarTask { + self.array.team().spawn_task(self) + } + + pub fn block(self) -> T { + self.array.team().block_on(self) + } +} + impl LamellarRequest for ArrayRdmaAtHandle { fn blocking_wait(self) -> Self::Output { match self.req { From d562b8239883cd72caf1a69d97f97e38922aa2ba Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Thu, 25 Jul 2024 13:33:46 -0700 Subject: [PATCH 059/116] Revert "formatting" This reverts commit ec59bdd502c4ca6f38fd0112bee92c2810573b99. --- src/active_messaging/handle.rs | 26 +------------------------- src/array.rs | 11 ----------- src/array/handle.rs | 25 ++----------------------- 3 files changed, 3 insertions(+), 59 deletions(-) diff --git a/src/active_messaging/handle.rs b/src/active_messaging/handle.rs index b70aef62..55991064 100644 --- a/src/active_messaging/handle.rs +++ b/src/active_messaging/handle.rs @@ -17,7 +17,7 @@ use crate::{ lamellae::Des, lamellar_request::{InternalResult, LamellarRequest, LamellarRequestAddResult}, memregion::one_sided::MemRegionHandleInner, - scheduler::{LamellarTask, Scheduler}, + scheduler::Scheduler, Darc, LamellarArchRT, }; @@ -122,14 +122,6 @@ impl AmHandle { } } } - - pub fn spawn(self) -> LamellarTask { - self.inner.scheduler.spawn_task(self) - } - - pub fn block(self) -> T { - self.inner.scheduler.block_on(self) - } } impl LamellarRequest for AmHandle { @@ -217,14 +209,6 @@ impl LocalAmHandle { } } } - - pub fn spawn(self) -> LamellarTask { - self.inner.scheduler.spawn_task(self) - } - - pub fn block(self) -> T { - self.inner.scheduler.block_on(self) - } } impl From> for AmHandle { @@ -380,14 +364,6 @@ impl MultiAmHandle { } } } - - pub fn spawn(self) -> LamellarTask> { - self.inner.scheduler.spawn_task(self) - } - - pub fn block(self) -> Vec { - self.inner.scheduler.block_on(self) - } } impl LamellarRequest for MultiAmHandle { diff --git a/src/array.rs b/src/array.rs index e82663ec..e6eba6de 100644 --- a/src/array.rs +++ b/src/array.rs @@ -641,17 +641,6 @@ impl LamellarByteArray { LamellarByteArray::GlobalLockArray(_) => std::any::TypeId::of::(), } } - pub(crate) fn team(&self) -> &Pin> { - match self { - LamellarByteArray::UnsafeArray(array) => array.team(), - LamellarByteArray::ReadOnlyArray(array) => array.team(), - LamellarByteArray::AtomicArray(array) => array.team(), - LamellarByteArray::NativeAtomicArray(array) => array.team(), - LamellarByteArray::GenericAtomicArray(array) => array.team(), - LamellarByteArray::LocalLockArray(array) => array.team(), - LamellarByteArray::GlobalLockArray(array) => array.team(), - } - } } impl crate::active_messaging::DarcSerde for LamellarReadArray { diff --git a/src/array/handle.rs b/src/array/handle.rs index 56eae47d..210058d1 100644 --- a/src/array/handle.rs +++ b/src/array/handle.rs @@ -11,26 +11,15 @@ use crate::{ active_messaging::{AmHandle, LocalAmHandle}, array::LamellarByteArray, lamellar_request::LamellarRequest, - scheduler::LamellarTask, Dist, OneSidedMemoryRegion, RegisteredMemoryRegion, }; /// a task handle for an array rdma (put/get) operation pub struct ArrayRdmaHandle { - pub(crate) array: LamellarByteArray, //prevents 
prematurely performing a local drop + pub(crate) _array: LamellarByteArray, //prevents prematurely performing a local drop pub(crate) reqs: VecDeque>, } -impl ArrayRdmaHandle { - pub fn spawn(self) -> LamellarTask<()> { - self.array.team().spawn_task(self) - } - - pub fn block(self) -> () { - self.array.team().block_on(self) - } -} - impl LamellarRequest for ArrayRdmaHandle { fn blocking_wait(mut self) -> Self::Output { for req in self.reqs.drain(0..) { @@ -67,21 +56,11 @@ impl Future for ArrayRdmaHandle { /// a task handle for an array rdma 'at' operation #[pin_project] pub struct ArrayRdmaAtHandle { - pub(crate) array: LamellarByteArray, //prevents prematurely performing a local drop + pub(crate) _array: LamellarByteArray, //prevents prematurely performing a local drop pub(crate) req: Option>, pub(crate) buf: OneSidedMemoryRegion, } -impl ArrayRdmaAtHandle { - pub fn spawn(self) -> LamellarTask { - self.array.team().spawn_task(self) - } - - pub fn block(self) -> T { - self.array.team().block_on(self) - } -} - impl LamellarRequest for ArrayRdmaAtHandle { fn blocking_wait(self) -> Self::Output { match self.req { From 481772a51c2ef32f9bf2e304253658e2c1f41981 Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Thu, 25 Jul 2024 22:31:00 -0700 Subject: [PATCH 060/116] enforcing additional type checks on AM return types --- .../active_message_examples/am_batch_tests.rs | 308 ++++---- examples/active_message_examples/am_local.rs | 26 +- .../am_local_memregions.rs | 60 +- examples/array_examples/array_am.rs | 2 +- examples/array_examples/array_batch_add.rs | 36 +- examples/array_examples/array_ops.rs | 146 ++-- .../array_examples/atomic_compare_exchange.rs | 3 +- examples/array_examples/dist_array_reduce.rs | 2 +- examples/bandwidths/am_bw.rs | 2 +- examples/bandwidths/am_bw_get.rs | 6 +- examples/bandwidths/atomic_array_get_bw.rs | 2 +- examples/bandwidths/atomic_array_put_bw.rs | 2 +- .../global_lock_atomic_array_get_bw.rs | 2 +- .../global_lock_atomic_array_put_bw.rs | 2 +- .../local_lock_atomic_array_get_bw.rs | 2 +- .../local_lock_atomic_array_put_bw.rs | 2 +- examples/bandwidths/readonly_array_get_bw.rs | 2 +- examples/bandwidths/task_group_am_bw.rs | 4 +- examples/bandwidths/unsafe_array_get_bw.rs | 2 +- examples/bandwidths/unsafe_array_put_bw.rs | 2 +- examples/bandwidths/unsafe_array_store_bw.rs | 2 +- examples/darc_examples/darc.rs | 4 +- examples/darc_examples/stress_test.rs | 60 +- examples/hello_world/hello_world_array.rs | 2 +- examples/kernels/dft_proxy.rs | 42 +- examples/kernels/parallel_array_gemm.rs | 2 +- examples/kernels/serial_array_gemm.rs | 2 +- examples/misc/ping_pong.rs | 34 +- examples/rdma_examples/rdma_am.rs | 12 +- examples/team_examples/custom_team_arch.rs | 4 +- examples/team_examples/random_team.rs | 2 +- examples/team_examples/team_am.rs | 20 +- impl/src/gen_am.rs | 26 +- run_examples.sh | 2 +- src/active_messaging.rs | 17 +- src/active_messaging/handle.rs | 42 +- src/array.rs | 134 +--- src/array/atomic.rs | 283 +------- src/array/generic_atomic/rdma.rs | 10 +- src/array/global_lock_atomic.rs | 210 +----- src/array/global_lock_atomic/rdma.rs | 10 +- src/array/handle.rs | 37 +- src/array/iterator/distributed_iterator.rs | 669 +----------------- .../distributed_iterator/consumer/collect.rs | 10 +- .../distributed_iterator/consumer/count.rs | 8 + .../distributed_iterator/consumer/for_each.rs | 7 + .../distributed_iterator/consumer/reduce.rs | 7 + .../distributed_iterator/consumer/sum.rs | 7 + src/array/iterator/local_iterator.rs | 423 +---------- 
.../local_iterator/consumer/collect.rs | 9 +- .../iterator/local_iterator/consumer/count.rs | 7 + .../local_iterator/consumer/for_each.rs | 7 + .../local_iterator/consumer/reduce.rs | 8 + .../iterator/local_iterator/consumer/sum.rs | 8 +- src/array/local_lock_atomic.rs | 201 +----- src/array/local_lock_atomic/rdma.rs | 10 +- src/array/native_atomic/rdma.rs | 10 +- src/array/operations/handle.rs | 99 ++- src/array/read_only.rs | 166 +---- src/array/unsafe.rs | 199 +----- src/array/unsafe/iteration/distributed.rs | 166 ++--- src/array/unsafe/iteration/local.rs | 80 +-- src/array/unsafe/operations.rs | 2 +- src/array/unsafe/rdma.rs | 10 +- src/lamellar_task_group.rs | 41 ++ src/lamellar_team.rs | 9 + src/scheduler.rs | 7 +- tests/array/arithmetic_ops/add_test.rs | 42 +- tests/array/arithmetic_ops/div_test.rs | 6 +- tests/array/arithmetic_ops/mul_test.rs | 6 +- tests/array/arithmetic_ops/sub_test.rs | 12 +- tests/array/atomic_ops/load_store_test.rs | 6 +- tests/array/bitwise_ops/and_test.rs | 6 +- tests/array/bitwise_ops/or_test.rs | 6 +- tests/array/bitwise_ops/xor_test.rs | 6 +- tests/array/rdma/get_test.rs | 6 +- tests/array/rdma/put_test.rs | 6 +- 77 files changed, 1067 insertions(+), 2775 deletions(-) diff --git a/examples/active_message_examples/am_batch_tests.rs b/examples/active_message_examples/am_batch_tests.rs index 9fb93b2a..de12c2e0 100644 --- a/examples/active_message_examples/am_batch_tests.rs +++ b/examples/active_message_examples/am_batch_tests.rs @@ -142,91 +142,111 @@ fn main() { // let am_type = 7; match am_type { 0 => { - let _ = world.exec_am_all(AmEmpty {}); + let _ = world.exec_am_all(AmEmpty {}).spawn(); cnts[0] += 1; } //batch msg ,batch unit return 1 => { - let _ = world.exec_am_all(AmEmptyReturnAmEmpty {}); + let _ = world.exec_am_all(AmEmptyReturnAmEmpty {}).spawn(); cnts[1] += 1; } //batch msg, batch return am 2 => { - let _ = world.exec_am_all(AmNoReturn { - my_pe: my_pe, - index: i, - data: vec![i; 1], - }); + let _ = world + .exec_am_all(AmNoReturn { + my_pe: my_pe, + index: i, + data: vec![i; 1], + }) + .spawn(); cnts[2] += 1; } //batch msg ,batch unit return 3 => { - let _ = world.exec_am_all(AmNoReturn { - my_pe: my_pe, - index: i, - data: vec![i; len1], - }); + let _ = world + .exec_am_all(AmNoReturn { + my_pe: my_pe, + index: i, + data: vec![i; len1], + }) + .spawn(); cnts[3] += 1; } //direct msg , batch unit return 4 => { - let _ = world.exec_am_all(AmReturnVec { - my_pe: my_pe, - vec_size: 1, - data: vec![i; 1], - }); + let _ = world + .exec_am_all(AmReturnVec { + my_pe: my_pe, + vec_size: 1, + data: vec![i; 1], + }) + .spawn(); cnts[4] += 1; } //batch message, batch return 5 => { - let _ = world.exec_am_all(AmReturnVec { - my_pe: my_pe, - vec_size: 1, - data: vec![i; len1], - }); + let _ = world + .exec_am_all(AmReturnVec { + my_pe: my_pe, + vec_size: 1, + data: vec![i; len1], + }) + .spawn(); cnts[5] += 1; } //direct msg, batch return 6 => { - let _ = world.exec_am_all(AmReturnVec { - my_pe: my_pe, - vec_size: 100000, - data: vec![i; 1], - }); + let _ = world + .exec_am_all(AmReturnVec { + my_pe: my_pe, + vec_size: 100000, + data: vec![i; 1], + }) + .spawn(); cnts[6] += 1; } //batch message, direct return 7 => { - let _ = world.exec_am_all(AmReturnVec { - my_pe: my_pe, - vec_size: 100000, - data: vec![i; len1], - }); + let _ = world + .exec_am_all(AmReturnVec { + my_pe: my_pe, + vec_size: 100000, + data: vec![i; len1], + }) + .spawn(); cnts[7] += 1; } //direct msg, direct return 8 => { - let _ = world.exec_am_all(InitialAMVec { - val1: 1, - val2: 
hostname::get().unwrap().to_string_lossy().to_string(), - vec: vec![i; 1], - }); + let _ = world + .exec_am_all(InitialAMVec { + val1: 1, + val2: hostname::get().unwrap().to_string_lossy().to_string(), + vec: vec![i; 1], + }) + .spawn(); cnts[8] += 1; } //batch msg ,batch return 9 => { - let _ = world.exec_am_all(InitialAMVec { - val1: 1, - val2: hostname::get().unwrap().to_string_lossy().to_string(), - vec: vec![i; len1], - }); + let _ = world + .exec_am_all(InitialAMVec { + val1: 1, + val2: hostname::get().unwrap().to_string_lossy().to_string(), + vec: vec![i; len1], + }) + .spawn(); cnts[9] += 1; } //direct msg , batch return 10 => { - let _ = world.exec_am_all(InitialAMVec { - val1: 100000, - val2: hostname::get().unwrap().to_string_lossy().to_string(), - vec: vec![i; 1], - }); + let _ = world + .exec_am_all(InitialAMVec { + val1: 100000, + val2: hostname::get().unwrap().to_string_lossy().to_string(), + vec: vec![i; 1], + }) + .spawn(); cnts[10] += 1; } //batch message, direct return _ => { - let _ = world.exec_am_all(InitialAMVec { - val1: 100000, - val2: hostname::get().unwrap().to_string_lossy().to_string(), - vec: vec![i; len1], - }); + let _ = world + .exec_am_all(InitialAMVec { + val1: 100000, + val2: hostname::get().unwrap().to_string_lossy().to_string(), + vec: vec![i; len1], + }) + .spawn(); cnts[11] += 1; } //direct msg, direct return } @@ -235,121 +255,141 @@ fn main() { // let am_type = 7; match am_type { 0 => { - let _ = world.exec_am_pe(pe, AmEmpty {}); + let _ = world.exec_am_pe(pe, AmEmpty {}).spawn(); cnts[0] += 1; } //batch msg ,batch unit return 1 => { - let _ = world.exec_am_pe(pe, AmEmptyReturnAmEmpty {}); + let _ = world.exec_am_pe(pe, AmEmptyReturnAmEmpty {}).spawn(); cnts[1] += 1; } //batch msg, batch return am 2 => { - let _ = world.exec_am_pe( - pe, - AmNoReturn { - my_pe: my_pe, - index: i, - data: vec![i; 1], - }, - ); + let _ = world + .exec_am_pe( + pe, + AmNoReturn { + my_pe: my_pe, + index: i, + data: vec![i; 1], + }, + ) + .spawn(); cnts[2] += 1; } //batch msg ,batch unit return 3 => { - let _ = world.exec_am_pe( - pe, - AmNoReturn { - my_pe: my_pe, - index: i, - data: vec![i; len1], - }, - ); + let _ = world + .exec_am_pe( + pe, + AmNoReturn { + my_pe: my_pe, + index: i, + data: vec![i; len1], + }, + ) + .spawn(); cnts[3] += 1; } //direct msg , batch unit return 4 => { - let _ = world.exec_am_pe( - pe, - AmReturnVec { - my_pe: my_pe, - vec_size: 1, - data: vec![i; 1], - }, - ); + let _ = world + .exec_am_pe( + pe, + AmReturnVec { + my_pe: my_pe, + vec_size: 1, + data: vec![i; 1], + }, + ) + .spawn(); cnts[4] += 1; } //batch message, batch return 5 => { - let _ = world.exec_am_pe( - pe, - AmReturnVec { - my_pe: my_pe, - vec_size: 1, - data: vec![i; len1], - }, - ); + let _ = world + .exec_am_pe( + pe, + AmReturnVec { + my_pe: my_pe, + vec_size: 1, + data: vec![i; len1], + }, + ) + .spawn(); cnts[5] += 1; } //direct msg, batch return 6 => { - let _ = world.exec_am_pe( - pe, - AmReturnVec { - my_pe: my_pe, - vec_size: len2, - data: vec![i; 1], - }, - ); + let _ = world + .exec_am_pe( + pe, + AmReturnVec { + my_pe: my_pe, + vec_size: len2, + data: vec![i; 1], + }, + ) + .spawn(); cnts[6] += 1; } //batch message, direct return 7 => { - let _ = world.exec_am_pe( - pe, - AmReturnVec { - my_pe: my_pe, - vec_size: len2, - data: vec![i; len1], - }, - ); + let _ = world + .exec_am_pe( + pe, + AmReturnVec { + my_pe: my_pe, + vec_size: len2, + data: vec![i; len1], + }, + ) + .spawn(); cnts[7] += 1; } //direct msg, direct return 8 => { - let _ = world.exec_am_pe( - 
pe, - InitialAMVec { - val1: 1, - val2: hostname::get().unwrap().to_string_lossy().to_string(), - vec: vec![i; 1], - }, - ); + let _ = world + .exec_am_pe( + pe, + InitialAMVec { + val1: 1, + val2: hostname::get().unwrap().to_string_lossy().to_string(), + vec: vec![i; 1], + }, + ) + .spawn(); cnts[8] += 1; } //batch msg ,batch return 9 => { - let _ = world.exec_am_pe( - pe, - InitialAMVec { - val1: 1, - val2: hostname::get().unwrap().to_string_lossy().to_string(), - vec: vec![i; len1], - }, - ); + let _ = world + .exec_am_pe( + pe, + InitialAMVec { + val1: 1, + val2: hostname::get().unwrap().to_string_lossy().to_string(), + vec: vec![i; len1], + }, + ) + .spawn(); cnts[9] += 1; } //direct msg , batch return 10 => { - let _ = world.exec_am_pe( - pe, - InitialAMVec { - val1: len2, - val2: hostname::get().unwrap().to_string_lossy().to_string(), - vec: vec![i; 1], - }, - ); + let _ = world + .exec_am_pe( + pe, + InitialAMVec { + val1: len2, + val2: hostname::get().unwrap().to_string_lossy().to_string(), + vec: vec![i; 1], + }, + ) + .spawn(); cnts[10] += 1; } //batch message, direct return _ => { - let _ = world.exec_am_pe( - pe, - InitialAMVec { - val1: len2, - val2: hostname::get().unwrap().to_string_lossy().to_string(), - vec: vec![i; len1], - }, - ); + let _ = world + .exec_am_pe( + pe, + InitialAMVec { + val1: len2, + val2: hostname::get().unwrap().to_string_lossy().to_string(), + vec: vec![i; len1], + }, + ) + .spawn(); cnts[11] += 1; } //direct msg, direct return } diff --git a/examples/active_message_examples/am_local.rs b/examples/active_message_examples/am_local.rs index c38e8b99..f7ca8240 100644 --- a/examples/active_message_examples/am_local.rs +++ b/examples/active_message_examples/am_local.rs @@ -64,7 +64,7 @@ impl LamellarAM for AmReturnUsize { std::thread::sleep(Duration::from_millis(1000)); i = self.index.fetch_add(1, Ordering::Relaxed); } - println!("\t{:?} leaving, sum{:?}", self.my_id, sum); + println!("\t{:?} leaving, sum{:?}", self.my_id, sum,); sum } } @@ -88,22 +88,26 @@ fn main() { println!("---------------------------------------------------------------"); println!("Testing local am no return"); for i in 0..map.len() { - let _ = world.exec_am_local(AmNoReturn { - my_id: i, - data: map.clone(), - index: index.clone(), - }); + let _ = world + .exec_am_local(AmNoReturn { + my_id: i, + data: map.clone(), + index: index.clone(), + }) + .spawn(); } world.wait_all(); println!("-----------------------------------"); println!("---------------------------------------------------------------"); println!("Testing local am no return"); for i in 0..map.len() { - let _ = world.exec_am_local(AmReturnUsize { - my_id: i, - data: map.clone(), - index: index.clone(), - }); + let _ = world + .exec_am_local(AmReturnUsize { + my_id: i, + data: map.clone(), + index: index.clone(), + }) + .spawn(); } world.wait_all(); println!("-----------------------------------"); diff --git a/examples/active_message_examples/am_local_memregions.rs b/examples/active_message_examples/am_local_memregions.rs index 5733c8c7..36e1a2bf 100644 --- a/examples/active_message_examples/am_local_memregions.rs +++ b/examples/active_message_examples/am_local_memregions.rs @@ -30,15 +30,17 @@ impl LamellarAM for DataAM { for _i in 0..self.width { let pe = pes.sample(&mut rng); // println!("sending {:?} to {:?}",path,pe); - let _ = lamellar::team.exec_am_pe( - pe, - DataAM { - array: self.array.clone(), - depth: self.depth - 1, - width: self.width, - path: path.clone(), - }, - ); + let _ = lamellar::team + .exec_am_pe( + 
pe, + DataAM { + array: self.array.clone(), + depth: self.depth - 1, + width: self.width, + path: path.clone(), + }, + ) + .spawn(); } } } @@ -86,24 +88,28 @@ fn main() { let width = 5; for _i in 0..width { let pe = pes.sample(&mut rng) / 2; //since both teams consist of half the number of pes as the world - let _ = first_half_team.exec_am_pe( - pe, - DataAM { - array: array.clone(), - depth: 5, - width: width, - path: vec![my_pe], - }, - ); - let _ = odd_team.exec_am_pe( - pe, - DataAM { - array: array.clone(), - depth: 5, - width: width, - path: vec![my_pe], - }, - ); + let _ = first_half_team + .exec_am_pe( + pe, + DataAM { + array: array.clone(), + depth: 5, + width: width, + path: vec![my_pe], + }, + ) + .spawn(); + let _ = odd_team + .exec_am_pe( + pe, + DataAM { + array: array.clone(), + depth: 5, + width: width, + path: vec![my_pe], + }, + ) + .spawn(); } world.wait_all(); world.barrier(); diff --git a/examples/array_examples/array_am.rs b/examples/array_examples/array_am.rs index 145824d2..a101d056 100644 --- a/examples/array_examples/array_am.rs +++ b/examples/array_examples/array_am.rs @@ -106,7 +106,7 @@ fn main() { array: array.clone(), orig_pe: my_pe, index: index, - }); + }).spawn(); index += 1; } diff --git a/examples/array_examples/array_batch_add.rs b/examples/array_examples/array_batch_add.rs index 698e28f1..ab054043 100644 --- a/examples/array_examples/array_batch_add.rs +++ b/examples/array_examples/array_batch_add.rs @@ -38,7 +38,7 @@ fn main() { array.barrier(); let timer = std::time::Instant::now(); - let _ = array.batch_add(indices.clone(), 1); + let _ = array.batch_add(indices.clone(), 1).spawn(); if my_pe == 0 { println!("{:?}", timer.elapsed()); } @@ -53,7 +53,7 @@ fn main() { array.barrier(); let mut timer = std::time::Instant::now(); - let _ = array.batch_add(indices.clone(), 1); + let _ = array.batch_add(indices.clone(), 1).spawn(); if my_pe == 0 { println!("{:?}", timer.elapsed()); } @@ -74,24 +74,28 @@ fn main() { if bufs[pe].len() == num_per_batch { let mut buf = Vec::with_capacity(num_per_batch); std::mem::swap(&mut bufs[pe], &mut buf); - let _ = world.exec_am_pe( - pe, - AddAm { - array: array.clone(), - indices: buf, - }, - ); + let _ = world + .exec_am_pe( + pe, + AddAm { + array: array.clone(), + indices: buf, + }, + ) + .spawn(); } } for (pe, buf) in bufs.drain(..).enumerate() { if buf.len() > 0 { - let _ = world.exec_am_pe( - pe, - AddAm { - array: array.clone(), - indices: buf, - }, - ); + let _ = world + .exec_am_pe( + pe, + AddAm { + array: array.clone(), + indices: buf, + }, + ) + .spawn(); } } if my_pe == 0 { diff --git a/examples/array_examples/array_ops.rs b/examples/array_examples/array_ops.rs index f0e952f7..af52afa4 100644 --- a/examples/array_examples/array_ops.rs +++ b/examples/array_examples/array_ops.rs @@ -92,7 +92,7 @@ fn test_add( array.print(); array.barrier(); for i in 0..array.len() { - let _ = array.add(i, add_val); + let _ = array.add(i, add_val).spawn(); } array.wait_all(); array.barrier(); @@ -129,7 +129,7 @@ fn test_sub( array.print(); array.barrier(); for i in 0..array.len() { - let _ = array.sub(i, sub_val); + let _ = array.sub(i, sub_val).spawn(); } array.wait_all(); array.barrier(); @@ -160,7 +160,7 @@ fn test_mul( array.print(); array.barrier(); for i in 0..array.len() { - let _ = array.mul(i, mul_val); + let _ = array.mul(i, mul_val).spawn(); } array.wait_all(); array.barrier(); @@ -192,7 +192,7 @@ fn test_div( array.print(); array.barrier(); for i in 0..array.len() { - let _ = array.div(i, div_val); + let _ = 
array.div(i, div_val).spawn(); } array.wait_all(); array.barrier(); @@ -224,7 +224,7 @@ fn test_rem( array.print(); array.barrier(); for i in 0..array.len() { - let _ = array.rem(i, rem_val); + let _ = array.rem(i, rem_val).spawn(); } array.wait_all(); array.barrier(); @@ -256,7 +256,7 @@ fn test_and( array.print(); array.barrier(); for i in 0..array.len() { - let _ = array.bit_or(i, or_val); + let _ = array.bit_or(i, or_val).spawn(); } array.wait_all(); array.barrier(); @@ -332,7 +332,7 @@ fn test_xor( array.print(); array.barrier(); for i in 0..array.len() { - let _ = array.bit_xor(i, xor_val); + let _ = array.bit_xor(i, xor_val).spawn(); } array.wait_all(); array.barrier(); @@ -372,7 +372,7 @@ fn test_store_load( array.print(); array.barrier(); for i in (my_pe..array.len()).step_by(num_pes) { - let _ = array.store(i, store_val); + let _ = array.store(i, store_val).spawn(); } array.wait_all(); array.barrier(); @@ -405,7 +405,7 @@ fn test_shl( array.print(); array.barrier(); for i in 0..array.len() { - let _ = array.shl(i, shl_val); + let _ = array.shl(i, shl_val).spawn(); } array.wait_all(); array.barrier(); @@ -437,7 +437,7 @@ fn test_shr( array.print(); array.barrier(); for i in 0..array.len() { - let _ = array.shr(i, shr_val); + let _ = array.shr(i, shr_val).spawn(); } array.wait_all(); array.barrier(); @@ -475,25 +475,27 @@ fn main() { Custom { int: 0, float: 0.0 }, Custom { int: 1, float: 1.0 }, ); - let _ = (&array_u8).add(3, 1); + let _ = (&array_u8).add(3, 1).spawn(); array_u8.wait_all(); array_u8.barrier(); array_u8.print(); array_u8.barrier(); - let _ = (&array_i128).add(3, 1); + let _ = (&array_i128).add(3, 1).spawn(); array_i128.wait_all(); array_i128.barrier(); array_i128.print(); array_i128.barrier(); - let _ = (&array_f64).add(3, 1.0); + let _ = (&array_f64).add(3, 1.0).spawn(); array_f64.wait_all(); array_f64.barrier(); array_f64.print(); array_f64.barrier(); - let _ = (&array_custom).add(3, Custom { int: 1, float: 1.0 }); + let _ = (&array_custom) + .add(3, Custom { int: 1, float: 1.0 }) + .spawn(); array_custom.wait_all(); array_custom.barrier(); array_custom.print(); @@ -511,25 +513,27 @@ fn main() { }, Custom { int: 1, float: 1.0 }, ); - let _ = (&array_u8).sub(3, 1); + let _ = (&array_u8).sub(3, 1).spawn(); array_u8.wait_all(); array_u8.barrier(); array_u8.print(); array_u8.barrier(); - let _ = (&array_i128).sub(3, -1); + let _ = (&array_i128).sub(3, -1).spawn(); array_i128.wait_all(); array_i128.barrier(); array_i128.print(); array_i128.barrier(); - let _ = (&array_f64).sub(3, 1.0); + let _ = (&array_f64).sub(3, 1.0).spawn(); array_f64.wait_all(); array_f64.barrier(); array_f64.print(); array_f64.barrier(); - let _ = (&array_custom).sub(3, Custom { int: 1, float: 1.0 }); + let _ = (&array_custom) + .sub(3, Custom { int: 1, float: 1.0 }) + .spawn(); array_custom.wait_all(); array_custom.barrier(); array_custom.print(); @@ -545,25 +549,27 @@ fn main() { Custom { int: 1, float: 1.0 }, Custom { int: 2, float: 2.5 }, ); - let _ = (&array_u8).mul(3, 2); + let _ = (&array_u8).mul(3, 2).spawn(); array_u8.wait_all(); array_u8.barrier(); array_u8.print(); array_u8.barrier(); - let _ = (&array_i128).mul(3, -2); + let _ = (&array_i128).mul(3, -2).spawn(); array_i128.wait_all(); array_i128.barrier(); array_i128.print(); array_i128.barrier(); - let _ = (&array_f64).mul(3, 2.5); + let _ = (&array_f64).mul(3, 2.5).spawn(); array_f64.wait_all(); array_f64.barrier(); array_f64.print(); array_f64.barrier(); - let _ = (&array_custom).mul(3, Custom { int: 1, float: 2.5 }); + let _ = 
(&array_custom) + .mul(3, Custom { int: 1, float: 2.5 }) + .spawn(); array_custom.wait_all(); array_custom.barrier(); array_custom.print(); @@ -582,25 +588,27 @@ fn main() { }, Custom { int: 2, float: 2.5 }, ); - let _ = (&array_u8).div(3, 2); + let _ = (&array_u8).div(3, 2).spawn(); array_u8.wait_all(); array_u8.barrier(); array_u8.print(); array_u8.barrier(); - let _ = (&array_i128).div(3, 2); + let _ = (&array_i128).div(3, 2).spawn(); array_i128.wait_all(); array_i128.barrier(); array_i128.print(); array_i128.barrier(); - let _ = (&array_f64).div(3, 2.5); + let _ = (&array_f64).div(3, 2.5).spawn(); array_f64.wait_all(); array_f64.barrier(); array_f64.print(); array_f64.barrier(); - let _ = (&array_custom).div(3, Custom { int: 1, float: 2.5 }); + let _ = (&array_custom) + .div(3, Custom { int: 1, float: 2.5 }) + .spawn(); array_custom.wait_all(); array_custom.barrier(); array_custom.print(); @@ -619,25 +627,27 @@ fn main() { }, Custom { int: 2, float: 2.5 }, ); - let _ = (&array_u8).rem(3, 2); + let _ = (&array_u8).rem(3, 2).spawn(); array_u8.wait_all(); array_u8.barrier(); array_u8.print(); array_u8.barrier(); - let _ = (&array_i128).rem(3, 2); + let _ = (&array_i128).rem(3, 2).spawn(); array_i128.wait_all(); array_i128.barrier(); array_i128.print(); array_i128.barrier(); - let _ = (&array_f64).rem(3, 2.5); + let _ = (&array_f64).rem(3, 2.5).spawn(); array_f64.wait_all(); array_f64.barrier(); array_f64.print(); array_f64.barrier(); - let _ = (&array_custom).rem(3, Custom { int: 1, float: 2.5 }); + let _ = (&array_custom) + .rem(3, Custom { int: 1, float: 2.5 }) + .spawn(); array_custom.wait_all(); array_custom.barrier(); array_custom.print(); @@ -650,13 +660,13 @@ fn main() { test_and(array_u8.clone(), 255, and_val); test_and(array_i128.clone(), 1023, and_val.into()); - let _ = (&array_u8).bit_and(3, 1 << num_pes); + let _ = (&array_u8).bit_and(3, 1 << num_pes).spawn(); array_u8.wait_all(); array_u8.barrier(); array_u8.print(); array_u8.barrier(); - let _ = (&array_i128).bit_and(3, 1 << num_pes); + let _ = (&array_i128).bit_and(3, 1 << num_pes).spawn(); array_i128.wait_all(); array_i128.barrier(); array_i128.print(); @@ -667,12 +677,12 @@ fn main() { let or_val = 1 << my_pe; test_or(array_u8.clone(), 0, or_val); test_or(array_i128.clone(), 0, or_val.into()); - let _ = (&array_u8).bit_or(3, 1 << num_pes); + let _ = (&array_u8).bit_or(3, 1 << num_pes).spawn(); array_u8.wait_all(); array_u8.barrier(); array_u8.print(); array_u8.barrier(); - let _ = (&array_i128).bit_or(3, 1 << num_pes); + let _ = (&array_i128).bit_or(3, 1 << num_pes).spawn(); array_i128.wait_all(); array_i128.barrier(); array_i128.print(); @@ -683,12 +693,12 @@ fn main() { let xor_val = 1 << my_pe; test_xor(array_u8.clone(), 0, xor_val); test_xor(array_i128.clone(), 0, xor_val.into()); - let _ = (&array_u8).bit_xor(3, 1 << num_pes); + let _ = (&array_u8).bit_xor(3, 1 << num_pes).spawn(); array_u8.wait_all(); array_u8.barrier(); array_u8.print(); array_u8.barrier(); - let _ = (&array_i128).bit_xor(3, 1 << num_pes); + let _ = (&array_i128).bit_xor(3, 1 << num_pes).spawn(); array_i128.wait_all(); array_i128.barrier(); array_i128.print(); @@ -710,31 +720,33 @@ fn main() { my_pe, num_pes, ); - let _ = (&array_u8).store(3, num_pes as u8); + let _ = (&array_u8).store(3, num_pes as u8).spawn(); array_u8.wait_all(); array_u8.barrier(); array_u8.print(); array_u8.barrier(); - let _ = (&array_i128).store(3, num_pes as i128); + let _ = (&array_i128).store(3, num_pes as i128).spawn(); array_i128.wait_all(); array_i128.barrier(); 
array_i128.print(); array_i128.barrier(); - let _ = (&array_f64).store(3, num_pes as f64); + let _ = (&array_f64).store(3, num_pes as f64).spawn(); array_f64.wait_all(); array_f64.barrier(); array_f64.print(); array_f64.barrier(); - let _ = (&array_custom).store( - 3, - Custom { - int: num_pes as usize, - float: -(num_pes as f32), - }, - ); + let _ = (&array_custom) + .store( + 3, + Custom { + int: num_pes as usize, + float: -(num_pes as f32), + }, + ) + .spawn(); array_custom.wait_all(); array_custom.barrier(); array_custom.print(); @@ -755,25 +767,27 @@ fn main() { float: 0.0, }, ); - let _ = (&array_u8).shl(1, 3); + let _ = (&array_u8).shl(1, 3).spawn(); array_u8.wait_all(); array_u8.barrier(); array_u8.print(); array_u8.barrier(); - let _ = (&array_i128).shl(1, 63); + let _ = (&array_i128).shl(1, 63).spawn(); array_i128.wait_all(); array_i128.barrier(); array_i128.print(); array_i128.barrier(); - let _ = (&array_custom).shl( - 1, - Custom { - int: 15, - float: 0.0, - }, - ); + let _ = (&array_custom) + .shl( + 1, + Custom { + int: 15, + float: 0.0, + }, + ) + .spawn(); array_custom.wait_all(); array_custom.barrier(); array_custom.print(); @@ -793,25 +807,27 @@ fn main() { float: 0.0, }, ); - let _ = (&array_u8).shr(1, 3); + let _ = (&array_u8).shr(1, 3).spawn(); array_u8.wait_all(); array_u8.barrier(); array_u8.print(); array_u8.barrier(); - let _ = (&array_i128).shr(1, 63); + let _ = (&array_i128).shr(1, 63).spawn(); array_i128.wait_all(); array_i128.barrier(); array_i128.print(); array_i128.barrier(); - let _ = (&array_custom).shr( - 1, - Custom { - int: 15, - float: 0.0, - }, - ); + let _ = (&array_custom) + .shr( + 1, + Custom { + int: 15, + float: 0.0, + }, + ) + .spawn(); array_custom.wait_all(); array_custom.barrier(); array_custom.print(); diff --git a/examples/array_examples/atomic_compare_exchange.rs b/examples/array_examples/atomic_compare_exchange.rs index 2328347e..e9fa04dd 100644 --- a/examples/array_examples/atomic_compare_exchange.rs +++ b/examples/array_examples/atomic_compare_exchange.rs @@ -46,8 +46,7 @@ fn main() { array.print(); let array_2 = AtomicArray::::new(world.team(), num_pes * 100000, Distribution::Cyclic); - array_2.dist_iter_mut().for_each(|x| x.store(0.0)).spawn(); - array_2.wait_all(); + array_2.dist_iter_mut().for_each(|x| x.store(0.0)).block(); array_2.barrier(); let mut rng = rand::rngs::StdRng::seed_from_u64(my_pe as u64); diff --git a/examples/array_examples/dist_array_reduce.rs b/examples/array_examples/dist_array_reduce.rs index 18d900fe..bbcf5235 100644 --- a/examples/array_examples/dist_array_reduce.rs +++ b/examples/array_examples/dist_array_reduce.rs @@ -150,7 +150,7 @@ fn main() { .block() }; let block_array = block_array.into_read_only(); - let _ = block_array.blocking_sum(); + let _ = block_array.sum().block(); let one_elem_array = UnsafeArray::::new(world.team(), 1, Distribution::Block); let min = unsafe { one_elem_array.min() }; diff --git a/examples/bandwidths/am_bw.rs b/examples/bandwidths/am_bw.rs index c45296af..880a55a6 100644 --- a/examples/bandwidths/am_bw.rs +++ b/examples/bandwidths/am_bw.rs @@ -58,7 +58,7 @@ fn main() { let sub_timer = Instant::now(); let d = _data.clone(); sub_time += sub_timer.elapsed().as_secs_f64(); - let _ = world.exec_am_pe(num_pes - 1, DataAM { data: d }); //we explicity captured _data and transfer it even though we do nothing with it + let _ = world.exec_am_pe(num_pes - 1, DataAM { data: d }).spawn(); //we explicity captured _data and transfer it even though we do nothing with it sum += num_bytes * 1 as 
u64; cnt += 1; diff --git a/examples/bandwidths/am_bw_get.rs b/examples/bandwidths/am_bw_get.rs index b558c414..aa26857e 100644 --- a/examples/bandwidths/am_bw_get.rs +++ b/examples/bandwidths/am_bw_get.rs @@ -76,14 +76,12 @@ fn main() { if my_pe == num_pes - 1 { for _j in (0..(2_u64.pow(exp))).step_by(num_bytes as usize) { let sub_timer = Instant::now(); - let _ = world.exec_am_pe( - 0, + let _ = world.exec_am_pe(0, DataAM { array: array.clone(), index: 0 as usize, length: num_bytes as usize, - }, - ); + },).spawn(); sub_time += sub_timer.elapsed().as_secs_f64(); sum += num_bytes * 1 as u64; cnt += 1; diff --git a/examples/bandwidths/atomic_array_get_bw.rs b/examples/bandwidths/atomic_array_get_bw.rs index 2515ae78..3484f426 100644 --- a/examples/bandwidths/atomic_array_get_bw.rs +++ b/examples/bandwidths/atomic_array_get_bw.rs @@ -57,7 +57,7 @@ fn main() { let sub_timer = Instant::now(); let sub_reg = data.sub_region(j..(j + num_bytes as usize)); unsafe { - let _ = array.get(ARRAY_LEN * (num_pes - 1), &sub_reg); + let _ = array.get(ARRAY_LEN * (num_pes - 1), &sub_reg).spawn(); } // println!("j: {:?}",j); // unsafe { array.put_slice(num_pes - 1, j, &data[..num_bytes as usize]) }; diff --git a/examples/bandwidths/atomic_array_put_bw.rs b/examples/bandwidths/atomic_array_put_bw.rs index 26604649..f30a2f88 100644 --- a/examples/bandwidths/atomic_array_put_bw.rs +++ b/examples/bandwidths/atomic_array_put_bw.rs @@ -54,7 +54,7 @@ fn main() { for j in (0..2_u64.pow(exp) as usize).step_by(num_bytes as usize) { let sub_timer = Instant::now(); let sub_reg = data.sub_region(..num_bytes as usize); - let _ = unsafe { array.put(ARRAY_LEN * (num_pes - 1) + j, sub_reg) }; + let _ = unsafe { array.put(ARRAY_LEN * (num_pes - 1) + j, sub_reg).spawn() }; // println!("j: {:?}",j); // unsafe { array.put_slice(num_pes - 1, j, &data[..num_bytes as usize]) }; sub_time += sub_timer.elapsed().as_secs_f64(); diff --git a/examples/bandwidths/global_lock_atomic_array_get_bw.rs b/examples/bandwidths/global_lock_atomic_array_get_bw.rs index ca3110ee..029f9d97 100644 --- a/examples/bandwidths/global_lock_atomic_array_get_bw.rs +++ b/examples/bandwidths/global_lock_atomic_array_get_bw.rs @@ -59,7 +59,7 @@ fn main() { let sub_timer = Instant::now(); let sub_reg = data.sub_region(j..(j + num_bytes as usize)); unsafe { - let _ = array.get(ARRAY_LEN * (num_pes - 1), &sub_reg); + let _ = array.get(ARRAY_LEN * (num_pes - 1), &sub_reg).spawn(); } // println!("j: {:?}",j); // unsafe { array.put_slice(num_pes - 1, j, &data[..num_bytes as usize]) }; diff --git a/examples/bandwidths/global_lock_atomic_array_put_bw.rs b/examples/bandwidths/global_lock_atomic_array_put_bw.rs index f3ecf0d4..919521f2 100644 --- a/examples/bandwidths/global_lock_atomic_array_put_bw.rs +++ b/examples/bandwidths/global_lock_atomic_array_put_bw.rs @@ -55,7 +55,7 @@ fn main() { for j in (0..2_u64.pow(exp) as usize).step_by(num_bytes as usize) { let sub_timer = Instant::now(); let sub_reg = data.sub_region(..num_bytes as usize); - let _ = unsafe { array.put(j, sub_reg) }; + let _ = unsafe { array.put(j, sub_reg).spawn() }; sub_time += sub_timer.elapsed().as_secs_f64(); sum += num_bytes * 1 as u64; cnt += 1; diff --git a/examples/bandwidths/local_lock_atomic_array_get_bw.rs b/examples/bandwidths/local_lock_atomic_array_get_bw.rs index fae3fff5..d836f6a2 100644 --- a/examples/bandwidths/local_lock_atomic_array_get_bw.rs +++ b/examples/bandwidths/local_lock_atomic_array_get_bw.rs @@ -59,7 +59,7 @@ fn main() { let sub_timer = Instant::now(); let sub_reg = 
data.sub_region(j..(j + num_bytes as usize)); unsafe { - let _ = array.get(ARRAY_LEN * (num_pes - 1), &sub_reg); + let _ = array.get(ARRAY_LEN * (num_pes - 1), &sub_reg).spawn(); } // println!("j: {:?}",j); // unsafe { array.put_slice(num_pes - 1, j, &data[..num_bytes as usize]) }; diff --git a/examples/bandwidths/local_lock_atomic_array_put_bw.rs b/examples/bandwidths/local_lock_atomic_array_put_bw.rs index 69994ec9..673aa22a 100644 --- a/examples/bandwidths/local_lock_atomic_array_put_bw.rs +++ b/examples/bandwidths/local_lock_atomic_array_put_bw.rs @@ -55,7 +55,7 @@ fn main() { for j in (0..2_u64.pow(exp) as usize).step_by(num_bytes as usize) { let sub_timer = Instant::now(); let sub_reg = data.sub_region(..num_bytes as usize); - let _ = unsafe { array.put(ARRAY_LEN * (num_pes - 1) + j, sub_reg) }; + let _ = unsafe { array.put(ARRAY_LEN * (num_pes - 1) + j, sub_reg).spawn() }; sub_time += sub_timer.elapsed().as_secs_f64(); sum += num_bytes * 1 as u64; cnt += 1; diff --git a/examples/bandwidths/readonly_array_get_bw.rs b/examples/bandwidths/readonly_array_get_bw.rs index bc12bf1c..f918a37d 100644 --- a/examples/bandwidths/readonly_array_get_bw.rs +++ b/examples/bandwidths/readonly_array_get_bw.rs @@ -57,7 +57,7 @@ fn main() { let sub_timer = Instant::now(); let sub_reg = data.sub_region(j..(j + num_bytes as usize)); unsafe { - let _ = array.get(ARRAY_LEN * (num_pes - 1), &sub_reg); + let _ = array.get(ARRAY_LEN * (num_pes - 1), &sub_reg).spawn(); } sub_time += sub_timer.elapsed().as_secs_f64(); sum += num_bytes * 1 as u64; diff --git a/examples/bandwidths/task_group_am_bw.rs b/examples/bandwidths/task_group_am_bw.rs index 41158369..0d8d5454 100644 --- a/examples/bandwidths/task_group_am_bw.rs +++ b/examples/bandwidths/task_group_am_bw.rs @@ -57,7 +57,9 @@ fn main() { let sub_timer = Instant::now(); let d = _data.clone(); sub_time += sub_timer.elapsed().as_secs_f64(); - let _ = task_group.exec_am_pe(num_pes - 1, DataAM { data: d }); //we explicity captured _data and transfer it even though we do nothing with it + let _ = task_group + .exec_am_pe(num_pes - 1, DataAM { data: d }) + .spawn(); //we explicity captured _data and transfer it even though we do nothing with it sum += num_bytes * 1 as u64; cnt += 1; diff --git a/examples/bandwidths/unsafe_array_get_bw.rs b/examples/bandwidths/unsafe_array_get_bw.rs index a903191b..5f379915 100644 --- a/examples/bandwidths/unsafe_array_get_bw.rs +++ b/examples/bandwidths/unsafe_array_get_bw.rs @@ -57,7 +57,7 @@ fn main() { let sub_reg = data.sub_region(j..(j + num_bytes as usize)); unsafe { - let _ = array.get(ARRAY_LEN * (num_pes - 1), &sub_reg); + let _ = array.get(ARRAY_LEN * (num_pes - 1), &sub_reg).spawn(); } sub_time += sub_timer.elapsed().as_secs_f64(); sum += num_bytes * 1 as u64; diff --git a/examples/bandwidths/unsafe_array_put_bw.rs b/examples/bandwidths/unsafe_array_put_bw.rs index 3463ab4f..80354082 100644 --- a/examples/bandwidths/unsafe_array_put_bw.rs +++ b/examples/bandwidths/unsafe_array_put_bw.rs @@ -52,7 +52,7 @@ fn main() { for j in (0..2_u64.pow(exp) as usize).step_by(num_bytes as usize) { let sub_timer = Instant::now(); let sub_reg = data.sub_region(..num_bytes as usize); - let _ = unsafe { array.put(ARRAY_LEN * (num_pes - 1) + j, &sub_reg) }; + let _ = unsafe { array.put(ARRAY_LEN * (num_pes - 1) + j, &sub_reg).spawn() }; sub_time += sub_timer.elapsed().as_secs_f64(); sum += num_bytes * 1 as u64; cnt += 1; diff --git a/examples/bandwidths/unsafe_array_store_bw.rs b/examples/bandwidths/unsafe_array_store_bw.rs index 
9a08182f..0a8c007f 100644 --- a/examples/bandwidths/unsafe_array_store_bw.rs +++ b/examples/bandwidths/unsafe_array_store_bw.rs @@ -58,7 +58,7 @@ fn main() { // array.get(ARRAY_LEN * (num_pes - 1), &sub_reg); let _ = unsafe { - array.batch_store(ARRAY_LEN * (num_pes - 1), sub_reg.as_slice().unwrap()) + array.batch_store(ARRAY_LEN * (num_pes - 1), sub_reg.as_slice().unwrap()).spawn() }; sub_time += sub_timer.elapsed().as_secs_f64(); sum += num_bytes * 1 as u64; diff --git a/examples/darc_examples/darc.rs b/examples/darc_examples/darc.rs index 334f2c7a..725e1b8a 100644 --- a/examples/darc_examples/darc.rs +++ b/examples/darc_examples/darc.rs @@ -104,8 +104,8 @@ fn main() { my_arc: Darc::new(team.clone(), Arc::new(0)).unwrap(), }; println!("here 7"); - let _ = team.exec_am_pe(0, darc_am.clone()); - let _ = team.exec_am_all(darc_am.clone()); + let _ = team.exec_am_pe(0, darc_am.clone()).spawn(); + let _ = team.exec_am_all(darc_am.clone()).spawn(); tg.add_am_pe(0, darc_am.clone()); tg.add_am_all(darc_am); team.block_on(tg.exec()); diff --git a/examples/darc_examples/stress_test.rs b/examples/darc_examples/stress_test.rs index 1c2d153c..44c17bc5 100644 --- a/examples/darc_examples/stress_test.rs +++ b/examples/darc_examples/stress_test.rs @@ -27,15 +27,17 @@ impl LamellarAM for DataAM { for _i in 0..self.width { let pe = pes.sample(&mut rng); // println!("sending {:?} to {:?}",path,pe); - let _ = lamellar::team.exec_am_pe( - pe, - DataAM { - darc: self.darc.clone(), - depth: self.depth - 1, - width: self.width, - path: path.clone(), - }, - ); + let _ = lamellar::team + .exec_am_pe( + pe, + DataAM { + darc: self.darc.clone(), + depth: self.depth - 1, + width: self.width, + path: path.clone(), + }, + ) + .spawn(); } } } @@ -84,24 +86,28 @@ fn main() { let width = 5; for _i in 0..width { let pe = pes.sample(&mut rng) / 2; //since both teams consist of half the number of pes as the world - let _ = first_half_team.exec_am_pe( - pe, - DataAM { - darc: darc.clone(), - depth: 5, - width: width, - path: vec![my_pe], - }, - ); - let _ = odd_team.exec_am_pe( - pe, - DataAM { - darc: darc.clone(), - depth: 5, - width: width, - path: vec![my_pe], - }, - ); + let _ = first_half_team + .exec_am_pe( + pe, + DataAM { + darc: darc.clone(), + depth: 5, + width: width, + path: vec![my_pe], + }, + ) + .spawn(); + let _ = odd_team + .exec_am_pe( + pe, + DataAM { + darc: darc.clone(), + depth: 5, + width: width, + path: vec![my_pe], + }, + ) + .spawn(); } world.wait_all(); world.barrier(); diff --git a/examples/hello_world/hello_world_array.rs b/examples/hello_world/hello_world_array.rs index 98428761..f483fbd8 100644 --- a/examples/hello_world/hello_world_array.rs +++ b/examples/hello_world/hello_world_array.rs @@ -36,7 +36,7 @@ fn main() { let timer = std::time::Instant::now(); //add 1 to each element of array // for i in 0..global_length { - let _ = array.batch_add(0, &local_vec[0..100]); + let _ = array.batch_add(0, &local_vec[0..100]).spawn(); // } //wait for all the local add operations to finish array.wait_all(); diff --git a/examples/kernels/dft_proxy.rs b/examples/kernels/dft_proxy.rs index d766dc4c..e6b56040 100644 --- a/examples/kernels/dft_proxy.rs +++ b/examples/kernels/dft_proxy.rs @@ -148,24 +148,28 @@ fn dft_lamellar( let timer = Instant::now(); for pe in 0..num_pes { for k in 0..spectrum_slice.len() { - let _ = world.exec_am_local(LocalSumAM { - spectrum: add_spec.clone(), - signal: signal.clone(), - global_sig_len: global_sig_len, - k: k, - pe: pe, - }); + let _ = world + .exec_am_local(LocalSumAM { 
+ spectrum: add_spec.clone(), + signal: signal.clone(), + global_sig_len: global_sig_len, + k: k, + pe: pe, + }) + .spawn(); } let mut add_spec_vec = vec![0.0; spectrum_slice.len()]; world.wait_all(); add_spec_vec.copy_from_slice(unsafe { add_spec.as_slice().unwrap() }); - let _ = world.exec_am_pe( - pe, - RemoteSumAM { - spectrum: spectrum.clone(), - add_spec: add_spec_vec, - }, - ); + let _ = world + .exec_am_pe( + pe, + RemoteSumAM { + spectrum: spectrum.clone(), + add_spec: add_spec_vec, + }, + ) + .spawn(); world.wait_all(); } world.wait_all(); @@ -396,7 +400,7 @@ fn dft_lamellar_array_swapped(signal: UnsafeArray, spectrum: UnsafeArray>, label: &str 1 }; let timer = Instant::now(); - let _ = team.exec_am_all(TeamAM { secs }); //everynode that has a handle can launch on a given team; + let _ = team.exec_am_all(TeamAM { secs }).spawn(); //everynode that has a handle can launch on a given team; team.wait_all(); //wait until all requests return team.barrier(); // barriers only apply to team members, its a no op for non team members timer.elapsed().as_secs_f64() @@ -169,7 +169,7 @@ fn main() { } world.barrier(); let timer = Instant::now(); - let _ = world.exec_am_all(TeamAM { secs: 1 }); + let _ = world.exec_am_all(TeamAM { secs: 1 }).spawn(); world.wait_all(); world.barrier(); let elapsed = timer.elapsed().as_secs_f64(); diff --git a/examples/team_examples/random_team.rs b/examples/team_examples/random_team.rs index 645f11df..e8b8be93 100644 --- a/examples/team_examples/random_team.rs +++ b/examples/team_examples/random_team.rs @@ -177,7 +177,7 @@ fn main() { team_pe: t, }; println!("launching {:?} to pe {:?}", d, i); - let _ = team.exec_am_pe(i, d); + let _ = team.exec_am_pe(i, d).spawn(); } let p = rand_arch.team_id(my_pe); diff --git a/examples/team_examples/team_am.rs b/examples/team_examples/team_am.rs index 5bd3bde5..ff5ea72e 100644 --- a/examples/team_examples/team_am.rs +++ b/examples/team_examples/team_am.rs @@ -38,10 +38,12 @@ fn test_team(world: &LamellarWorld, team: Option>, label: &str 1 }; let timer = Instant::now(); - let _ = team.exec_am_all(TeamAM { - secs: secs, - orig_pe: my_pe, - }); //everynode that has a handle can launch on a given team; + let _ = team + .exec_am_all(TeamAM { + secs: secs, + orig_pe: my_pe, + }) + .spawn(); //everynode that has a handle can launch on a given team; world.wait_all(); println!("after world wait_all {:?}", timer.elapsed().as_secs_f64()); team.wait_all(); //wait until all requests return @@ -76,10 +78,12 @@ fn main() { } world.barrier(); let timer = Instant::now(); - let _ = world.exec_am_all(TeamAM { - secs: 1, - orig_pe: my_pe, - }); + let _ = world + .exec_am_all(TeamAM { + secs: 1, + orig_pe: my_pe, + }) + .spawn(); world.wait_all(); world.barrier(); let elapsed = timer.elapsed().as_secs_f64(); diff --git a/impl/src/gen_am.rs b/impl/src/gen_am.rs index 088aef1a..25baf8aa 100644 --- a/impl/src/gen_am.rs +++ b/impl/src/gen_am.rs @@ -285,22 +285,25 @@ fn gen_return_stmt( AmType::ReturnData(ref ret) => { let last_expr = get_expr(&last_stmt) .expect("failed to get exec return value (try removing the last \";\")"); + let last_expr = + quote_spanned! {last_stmt.span()=> let __lamellar_last_expr: #ret = #last_expr; }; let remote_last_expr = match ret { syn::Type::Array(a) => match &*a.elem { syn::Type::Path(type_path) if type_path.clone().into_token_stream().to_string() == "u8" => { byte_buf = true; - quote_spanned! {last_stmt.span()=> ByteBuf::from(#last_expr)} + quote_spanned! 
{last_stmt.span()=> ByteBuf::from(__lamellar_last_expr)} } - _ => quote_spanned! {last_stmt.span()=> #last_expr}, + _ => quote_spanned! {last_stmt.span()=> __lamellar_last_expr}, }, - _ => quote_spanned! {last_stmt.span()=> #last_expr}, + _ => quote_spanned! {last_stmt.span()=> __lamellar_last_expr}, }; if !local { quote_spanned! {last_stmt.span()=> + #last_expr let ret = match __local{ //should probably just separate these into exec_local exec_remote to get rid of a conditional... - true => #lamellar::active_messaging::LamellarReturn::LocalData(Box::new(#last_expr)), + true => #lamellar::active_messaging::LamellarReturn::LocalData(Box::new(__lamellar_last_expr)), false => #lamellar::active_messaging::LamellarReturn::RemoteData(std::sync::Arc::new (#ret_struct_name{ val: #remote_last_expr, })), @@ -309,24 +312,29 @@ fn gen_return_stmt( } } else { quote_spanned! {last_stmt.span()=> - #lamellar::active_messaging::LamellarReturn::LocalData(Box::new(#last_expr)) + #last_expr + #lamellar::active_messaging::LamellarReturn::LocalData(Box::new(__lamellar_last_expr)) } } } - AmType::ReturnAm(_, _) => { + AmType::ReturnAm(ret, _) => { let last_expr = get_expr(&last_stmt) .expect("failed to get exec return value (try removing the last \";\")"); + let last_expr = + quote_spanned! {last_stmt.span()=> let __lamellar_last_expr: #ret = #last_expr; }; if !local { quote_spanned! {last_stmt.span()=> + #last_expr let ret = match __local{ - true => #lamellar::active_messaging::LamellarReturn::LocalAm(std::sync::Arc::new(#last_expr)), - false => #lamellar::active_messaging::LamellarReturn::RemoteAm(std::sync::Arc::new(#last_expr)), + true => #lamellar::active_messaging::LamellarReturn::LocalAm(std::sync::Arc::new(__lamellar_last_expr)), + false => #lamellar::active_messaging::LamellarReturn::RemoteAm(std::sync::Arc::new(__lamellar_last_expr)), }; ret } } else { quote_spanned! {last_stmt.span()=> - #lamellar::active_messaging::LamellarReturn::LocalAm(std::sync::Arc::new(#last_expr)) + #last_expr + #lamellar::active_messaging::LamellarReturn::LocalAm(std::sync::Arc::new(__lamellar_last_expr)) } } } diff --git a/run_examples.sh b/run_examples.sh index 881b440f..95268d48 100755 --- a/run_examples.sh +++ b/run_examples.sh @@ -18,7 +18,7 @@ mkdir -p ${results_dir} ln -s ${output_dir}/rofiverbs_lamellae rofiverbs_lamellae -cargo build --release --features enable-rofi --features tokio-executor --examples -j 20 +# cargo build --release --features enable-rofi --features tokio-executor --examples -j 20 cd rofiverbs_lamellae/${local_results_dir} diff --git a/src/active_messaging.rs b/src/active_messaging.rs index 6f012acc..3fcb12fe 100644 --- a/src/active_messaging.rs +++ b/src/active_messaging.rs @@ -961,8 +961,8 @@ pub trait ActiveMessaging { /// Returns a future allow the user to poll for complete and retrive the result of the Active Message stored within a vector, /// each index in the vector corresponds to the data returned by the corresponding PE /// - /// NOTE: lamellar active messages are not lazy, i.e. you do not need to drive the returned future to launch the computation, - /// the future is only used to check for completion and/or retrieving any returned data + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][MultiAmHandle::spawn] or [blocked on][MultiAmHandle::block] /// /// # One-sided Operation /// The calling PE manages creating and transfering the active message to the remote PEs (without user intervention on the remote PEs). 
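The hunks above flip the active-messaging API from eager to lazy: `exec_am_all`, `exec_am_pe`, and `exec_am_local` now return handles that do nothing until they are driven. Below is a minimal sketch of the three ways those handles are driven in this patch; it reuses the field-less `AmEmpty` active message from one of the active_message examples touched earlier in the diff, and everything else is assumed from the signatures visible in these hunks rather than from released documentation.

```rust
use lamellar::active_messaging::prelude::*;

// Sketch only: `AmEmpty` is the empty active message already used in the
// exec_am_pe(pe, AmEmpty {}) calls updated earlier in this patch.
fn drive_lazy_handles(world: &lamellar::LamellarWorld) {
    // 1. Spawn the AM onto the work queue and keep the returned task handle.
    let task = world.exec_am_pe(0, AmEmpty {}).spawn();

    // 2. Block the calling thread until the AM completes and return its result.
    let _unit = world.exec_am_pe(0, AmEmpty {}).block();

    // 3. Await the handle from an async context driven by block_on.
    world.block_on(async { world.exec_am_all(AmEmpty {}).await });

    // A spawned-then-dropped handle is only guaranteed to finish after wait_all().
    drop(task);
    world.wait_all();
}
```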
@@ -995,6 +995,7 @@ pub trait ActiveMessaging { /// assert_eq!(i,results[i]); /// } ///``` + #[must_use = "this function is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] fn exec_am_all(&self, am: F) -> Self::MultiAmHandle where F: RemoteActiveMessage + LamellarAM + Serde + AmDist; @@ -1007,8 +1008,8 @@ pub trait ActiveMessaging { /// Returns a future allow the user to poll for complete and retrive the result of the Active Message /// /// - /// NOTE: lamellar active messages are not lazy, i.e. you do not need to drive the returned future to launch the computation, - /// the future is only used to check for completeion and/or retrieving any returned data + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][AmHandle::spawn] or [blocked on][AmHandle::block] /// /// # One-sided Operation /// The calling PE manages creating and transfering the active message to the remote PE (without user intervention on the remote PE). @@ -1039,6 +1040,7 @@ pub trait ActiveMessaging { /// let result = world.block_on(request); //block until am has executed /// assert_eq!(world.num_pes()-1,result); ///``` + #[must_use = "this function is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] fn exec_am_pe(&self, pe: usize, am: F) -> Self::SinglePeAmHandle where F: RemoteActiveMessage + LamellarAM + Serde + AmDist; @@ -1051,8 +1053,8 @@ pub trait ActiveMessaging { /// Returns a future allow the user to poll for complete and retrive the result of the Active Message. /// /// - /// NOTE: lamellar active messages are not lazy, i.e. you do not need to drive the returned future to launch the computation, - /// the future is only used to check for completeion and/or retrieving any returned data. + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][LocalAmHandle::spawn] or [blocked on][LocalAmHandle::block] /// /// # One-sided Operation /// The calling PE manages creating and executing the active message local (remote PEs are not involved). @@ -1086,6 +1088,7 @@ pub trait ActiveMessaging { /// let result = world.block_on(request); //block until am has executed /// assert_eq!(world.my_pe(),result); ///``` + #[must_use = "this function is lazy and does nothing unless awaited. 
Either await the returned future, or call 'spawn()' or 'block()' on it "] fn exec_am_local(&self, am: F) -> Self::LocalAmHandle where F: LamellarActiveMessage + LocalAM + 'static; @@ -1158,6 +1161,7 @@ pub trait ActiveMessaging { /// world_clone.await_all().await; //block until the previous am has finished /// }); ///``` + #[must_use = "this function is lazy and does nothing unless awaited"] fn await_all(&self) -> impl Future + Send; #[doc(alias = "Collective")] @@ -1196,6 +1200,7 @@ pub trait ActiveMessaging { /// world_clone.async_barrier().await; //block until all PEs have entered the barrier /// }); ///``` + #[must_use = "this function is lazy and does nothing unless awaited."] fn async_barrier(&self) -> BarrierHandle; #[doc(alias("One-sided", "onesided"))] diff --git a/src/active_messaging/handle.rs b/src/active_messaging/handle.rs index 55991064..fafbf33e 100644 --- a/src/active_messaging/handle.rs +++ b/src/active_messaging/handle.rs @@ -17,7 +17,7 @@ use crate::{ lamellae::Des, lamellar_request::{InternalResult, LamellarRequest, LamellarRequestAddResult}, memregion::one_sided::MemRegionHandleInner, - scheduler::Scheduler, + scheduler::{LamellarTask, Scheduler}, Darc, LamellarArchRT, }; @@ -122,6 +122,19 @@ impl AmHandle { } } } + + /// This method will spawn the associated Active Message on the work queue, + /// initiating the remote operation. + /// + /// This function returns a handle that can be used to wait for the operation to complete + #[must_use = "this function returns a future used to poll for completion. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] + pub fn spawn(self) -> LamellarTask { + self.inner.scheduler.clone().spawn_task(self) + } + /// This method will block the calling thread until the associated Array Operation completes + pub fn block(self) -> T { + self.inner.scheduler.clone().block_on(self) + } } impl LamellarRequest for AmHandle { @@ -211,6 +224,21 @@ impl LocalAmHandle { } } +impl LocalAmHandle { + /// This method will spawn the associated Active Message on the work queue, + /// initiating the remote operation. + /// + /// This function returns a handle that can be used to wait for the operation to complete + #[must_use = "this function returns a future used to poll for completion. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] + pub fn spawn(self) -> LamellarTask { + self.inner.scheduler.clone().spawn_task(self) + } + /// This method will block the calling thread until the associated Array Operation completes + pub fn block(self) -> T { + self.inner.scheduler.clone().block_on(self) + } +} + impl From> for AmHandle { fn from(x: LocalAmHandle) -> Self { x.inner.user_handle.fetch_add(1, Ordering::SeqCst); @@ -364,6 +392,18 @@ impl MultiAmHandle { } } } + /// This method will spawn the associated Active Message on the work queue, + /// initiating the remote operation. + /// + /// This function returns a handle that can be used to wait for the operation to complete + #[must_use = "this function returns a future used to poll for completion. 
Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] + pub fn spawn(self) -> LamellarTask> { + self.inner.scheduler.clone().spawn_task(self) + } + /// This method will block the calling thread until the associated Array Operation completes + pub fn block(self) -> Vec { + self.inner.scheduler.clone().block_on(self) + } } impl LamellarRequest for MultiAmHandle { diff --git a/src/array.rs b/src/array.rs index e6eba6de..35b19562 100644 --- a/src/array.rs +++ b/src/array.rs @@ -641,6 +641,18 @@ impl LamellarByteArray { LamellarByteArray::GlobalLockArray(_) => std::any::TypeId::of::(), } } + + pub(crate) fn team(&self) -> Pin> { + match self { + LamellarByteArray::UnsafeArray(array) => array.inner.data.team(), + LamellarByteArray::ReadOnlyArray(array) => array.array.inner.data.team(), + LamellarByteArray::AtomicArray(array) => array.team(), + LamellarByteArray::NativeAtomicArray(array) => array.array.inner.data.team(), + LamellarByteArray::GenericAtomicArray(array) => array.array.inner.data.team(), + LamellarByteArray::LocalLockArray(array) => array.array.inner.data.team(), + LamellarByteArray::GlobalLockArray(array) => array.array.inner.data.team(), + } + } } impl crate::active_messaging::DarcSerde for LamellarReadArray { @@ -922,116 +934,6 @@ impl ActiveMessaging for LamellarWriteArray { } } -// impl LamellarArrayReduce for LamellarReadArray { -// fn reduce(&self, reduction: &str) -> AmHandle { -// match self { -// LamellarReadArray::UnsafeArray(array) => unsafe { array.reduce(reduction) }, -// LamellarReadArray::AtomicArray(array) => array.reduce(reduction), -// LamellarReadArray::LocalLockArray(array) => array.blocking_reduce(reduction), -// LamellarReadArray::GlobalLockArray(array) => array.reduce(reduction), -// LamellarReadArray::ReadOnlyArray(array) => array.reduce(reduction), -// } -// } -// } - -// impl LamellarArrayArithmeticReduce -// for LamellarReadArray -// { -// fn sum(&self) -> AmHandle { -// match self { -// LamellarReadArray::UnsafeArray(array) => unsafe { array.sum() }, -// LamellarReadArray::AtomicArray(array) => array.sum(), -// LamellarReadArray::LocalLockArray(array) => array.sum(), -// LamellarReadArray::GlobalLockArray(array) => array.sum(), -// LamellarReadArray::ReadOnlyArray(array) => array.sum(), -// } -// } -// fn prod(&self) -> AmHandle { -// match self { -// LamellarReadArray::UnsafeArray(array) => unsafe { array.prod() }, -// LamellarReadArray::AtomicArray(array) => array.prod(), -// LamellarReadArray::LocalLockArray(array) => array.prod(), -// LamellarReadArray::GlobalLockArray(array) => array.prod(), -// LamellarReadArray::ReadOnlyArray(array) => array.prod(), -// } -// } -// } - -// impl LamellarArrayCompareReduce -// for LamellarReadArray -// { -// fn max(&self) -> AmHandle { -// match self { -// LamellarReadArray::UnsafeArray(array) => unsafe { array.max() }, -// LamellarReadArray::AtomicArray(array) => array.max(), -// LamellarReadArray::LocalLockArray(array) => array.max(), -// LamellarReadArray::GlobalLockArray(array) => array.max(), -// LamellarReadArray::ReadOnlyArray(array) => array.max(), -// } -// } -// fn min(&self) -> AmHandle { -// match self { -// LamellarReadArray::UnsafeArray(array) => unsafe { array.min() }, -// LamellarReadArray::AtomicArray(array) => array.min(), -// LamellarReadArray::LocalLockArray(array) => array.min(), -// 
LamellarReadArray::GlobalLockArray(array) => array.min(), -// LamellarReadArray::ReadOnlyArray(array) => array.min(), -// } -// } -// } - -// impl LamellarArrayReduce for LamellarWriteArray { -// fn reduce(&self, reduction: &str) -> AmHandle { -// match self { -// LamellarWriteArray::UnsafeArray(array) => unsafe { array.reduce(reduction) }, -// LamellarWriteArray::AtomicArray(array) => array.reduce(reduction), -// LamellarWriteArray::LocalLockArray(array) => array.reduce(reduction), -// LamellarWriteArray::GlobalLockArray(array) => array.reduce(reduction), -// } -// } -// } -// impl LamellarArrayArithmeticReduce -// for LamellarWriteArray -// { -// fn sum(&self) -> AmHandle { -// match self { -// LamellarWriteArray::UnsafeArray(array) => unsafe { array.sum() }, -// LamellarWriteArray::AtomicArray(array) => array.sum(), -// LamellarWriteArray::LocalLockArray(array) => array.sum(), -// LamellarWriteArray::GlobalLockArray(array) => array.sum(), -// } -// } -// fn prod(&self) -> AmHandle { -// match self { -// LamellarWriteArray::UnsafeArray(array) => unsafe { array.prod() }, -// LamellarWriteArray::AtomicArray(array) => array.prod(), -// LamellarWriteArray::LocalLockArray(array) => array.prod(), -// LamellarWriteArray::GlobalLockArray(array) => array.prod(), -// } -// } -// } - -// impl LamellarArrayCompareReduce -// for LamellarWriteArray -// { -// fn max(&self) -> AmHandle { -// match self { -// LamellarWriteArray::UnsafeArray(array) => unsafe { array.max() }, -// LamellarWriteArray::AtomicArray(array) => array.max(), -// LamellarWriteArray::LocalLockArray(array) => array.max(), -// LamellarWriteArray::GlobalLockArray(array) => array.max(), -// } -// } -// fn min(&self) -> AmHandle { -// match self { -// LamellarWriteArray::UnsafeArray(array) => unsafe { array.min() }, -// LamellarWriteArray::AtomicArray(array) => array.min(), -// LamellarWriteArray::LocalLockArray(array) => array.min(), -// LamellarWriteArray::GlobalLockArray(array) => array.min(), -// } -// } -// } - // private sealed trait #[doc(hidden)] pub trait InnerArray: Sized { @@ -1478,7 +1380,8 @@ pub trait LamellarArrayGet: LamellarArrayInternalGet { /// /// # One-sided Operation /// the remote transfer is initiated by the calling PE - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][AmHandle::spawn] or [blocked on][AmHandle::block] /// # Examples ///``` /// use lamellar::array::prelude::*; @@ -1517,6 +1420,7 @@ pub trait LamellarArrayGet: LamellarArrayInternalGet { /// PE3: buf data [12,12,12,12,12,12,12,12,12,12,12,12] /// PE0: buf data [0,1,2,3,4,5,6,7,8,9,10,11] //we only did the "get" on PE0, also likely to be printed last since the other PEs do not wait for PE0 in this example ///``` + #[must_use = "this function is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] unsafe fn get> + LamellarWrite>( &self, index: usize, @@ -1538,7 +1442,8 @@ pub trait LamellarArrayGet: LamellarArrayInternalGet { /// /// # One-sided Operation /// the remote transfer is initiated by the calling PE - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][ArrayRdmaHandle::spawn] or [blocked on][ArrayRdmaHandle::block] /// # Examples ///``` /// use lamellar::array::prelude::*; @@ -1569,6 +1474,7 @@ pub trait LamellarArrayGet: LamellarArrayInternalGet { /// PE2: array[9] = 3 /// PE3: array[0] = 0 ///``` + #[must_use = "this function is lazy and does nothing unless awaited. 
Either await the returned future, or call 'spawn()' or 'block()' on it "] fn at(&self, index: usize) -> ArrayRdmaAtHandle; } @@ -1614,7 +1520,8 @@ pub trait LamellarArrayPut: LamellarArrayInternalPut { /// /// # One-sided Operation /// the remote transfer is initiated by the calling PE - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][ArrayRdmaHandle::spawn] or [blocked on][ArrayRdmaHandle::block] /// # Examples ///``` /// use lamellar::array::prelude::*; @@ -1661,6 +1568,7 @@ pub trait LamellarArrayPut: LamellarArrayInternalPut { /// PE2: array data [6,7,8] /// PE3: array data [9,10,11] ///``` + #[must_use = "this function is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] unsafe fn put> + LamellarRead>( &self, index: usize, diff --git a/src/array/atomic.rs b/src/array/atomic.rs index ddd4f92b..f2a8fa4b 100644 --- a/src/array/atomic.rs +++ b/src/array/atomic.rs @@ -9,7 +9,6 @@ use crate::array::iterator::local_iterator::LocalIteratorLauncher; use crate::array::native_atomic::NativeAtomicElement; use crate::array::private::LamellarArrayPrivate; use crate::array::*; -use crate::config; // use crate::darc::{Darc, DarcMode}; use crate::barrier::BarrierHandle; use crate::lamellar_team::IntoLamellarTeam; @@ -670,6 +669,12 @@ impl AtomicByteArray { } } } + pub(crate) fn team(&self) -> Pin> { + match self { + AtomicByteArray::NativeAtomicByteArray(array) => array.array.inner.data.team(), + AtomicByteArray::GenericAtomicByteArray(array) => array.array.inner.data.team(), + } + } } impl crate::active_messaging::DarcSerde for AtomicByteArray { @@ -1285,7 +1290,8 @@ impl AtomicArray { /// but performing a reduction could result in safe but non deterministic results. /// /// In Lamellar converting to a [ReadOnlyArray] before the reduction is a straightforward workaround to enusre the data is not changing during the reduction. - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][AmHandle::spawn] or [blocked on][AmHandle::block] /// # Examples /// ``` /// use lamellar::array::prelude::*; @@ -1302,67 +1308,13 @@ impl AtomicArray { /// let sum = array.block_on(array.reduce("sum")); // equivalent to calling array.sum() /// assert_eq!(array.len()*num_pes,sum); ///``` + #[must_use = "this function is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] pub fn reduce(&self, reduction: &str) -> AmHandle> { match self { AtomicArray::NativeAtomicArray(array) => array.reduce(reduction), AtomicArray::GenericAtomicArray(array) => array.reduce(reduction), } } - - #[doc(alias("One-sided", "onesided"))] - /// Perform a reduction on the entire distributed array, returning the value to the calling PE. - /// - /// Please see the documentation for the [register_reduction] procedural macro for - /// more details and examples on how to create your own reductions. - /// - /// # One-sided Operation - /// The calling PE is responsible for launching `Reduce` active messages on the other PEs associated with the array. - /// the returned reduction result is only available on the calling PE - /// - /// # Safety - /// One thing to consider is that due to being a one sided reduction, safety is only gauranteed with respect to Atomicity of individual elements, - /// not with respect to the entire global array. 
This means that while one PE is performing a reduction, other PEs can atomically update their local - /// elements. While this is technically safe with respect to the integrity of an indivdual element (and with respect to the compiler), - /// it may not be your desired behavior. - /// - /// To be clear this behavior is not an artifact of lamellar, but rather the language itself, - /// for example if you have an `Arc>` shared on multiple threads, you could safely update the elements from each thread, - /// but performing a reduction could result in safe but non deterministic results. - /// - /// In Lamellar converting to a [ReadOnlyArray] before the reduction is a straightforward workaround to enusre the data is not changing during the reduction. - /// - /// # Examples - /// ``` - /// use lamellar::array::prelude::*; - /// use rand::Rng; - /// - /// let world = LamellarWorldBuilder::new().build(); - /// let num_pes = world.num_pes(); - /// let array = AtomicArray::::new(&world,1000000,Distribution::Block); - /// let array_clone = array.clone(); - /// let req = array.local_iter().for_each(move |_| { - /// let index = rand::thread_rng().gen_range(0..array_clone.len()); - /// array_clone.add(index,1); //randomly at one to an element in the array. - /// }); - /// let sum = array.blocking_reduce("sum"); // equivalent to calling array.sum() - /// assert_eq!(array.len()*num_pes,sum); - ///``` - pub fn blocking_reduce(&self, reduction: &str) -> Option { - if std::thread::current().id() != *crate::MAIN_THREAD { - let msg = format!(" - [LAMELLAR WARNING] You are calling `AtomicArray::blocking_reduce` from within an async context which may lead to deadlock, it is recommended that you use `reduce(...).await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() - ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), - } - } - self.block_on(match self { - AtomicArray::NativeAtomicArray(array) => array.reduce(reduction), - AtomicArray::GenericAtomicArray(array) => array.reduce(reduction), - }) - } } impl AtomicArray { @@ -1386,7 +1338,8 @@ impl AtomicArray { /// but performing a reduction could result in safe but non deterministic results. /// /// In Lamellar converting to a [ReadOnlyArray] before the reduction is a straightforward workaround to enusre the data is not changing during the reduction. - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][AmHandle::spawn] or [blocked on][AmHandle::block] /// # Examples /// ``` /// use lamellar::array::prelude::*; @@ -1402,6 +1355,7 @@ impl AtomicArray { /// let sum = array.block_on(array.sum()); /// assert_eq!(array.len()*num_pes,sum); /// ``` + #[must_use = "this function is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] pub fn sum(&self) -> AmHandle> { match self { AtomicArray::NativeAtomicArray(array) => array.sum(), @@ -1409,59 +1363,6 @@ impl AtomicArray { } } - #[doc(alias("One-sided", "onesided"))] - /// Perform a sum reduction on the entire distributed array, returning the value to the calling PE. - /// - /// This equivalent to `reduce("sum")`. - /// - /// # One-sided Operation - /// The calling PE is responsible for launching `Sum` active messages on the other PEs associated with the array. 
- /// the returned sum reduction result is only available on the calling PE - /// - /// # Safety - /// One thing to consider is that due to being a one sided reduction, safety is only gauranteed with respect to Atomicity of individual elements, - /// not with respect to the entire global array. This means that while one PE is performing a reduction, other PEs can atomically update their local - /// elements. While this is technically safe with respect to the integrity of an indivdual element (and with respect to the compiler), - /// it may not be your desired behavior. - /// - /// To be clear this behavior is not an artifact of lamellar, but rather the language itself, - /// for example if you have an `Arc>` shared on multiple threads, you could safely update the elements from each thread, - /// but performing a reduction could result in safe but non deterministic results. - /// - /// In Lamellar converting to a [ReadOnlyArray] before the reduction is a straightforward workaround to enusre the data is not changing during the reduction. - /// - /// # Examples - /// ``` - /// use lamellar::array::prelude::*; - /// use rand::Rng; - /// let world = LamellarWorldBuilder::new().build(); - /// let num_pes = world.num_pes(); - /// let array = AtomicArray::::new(&world,1000000,Distribution::Block); - /// let array_clone = array.clone(); - /// let req = array.local_iter().for_each(move |_| { - /// let index = rand::thread_rng().gen_range(0..array_clone.len()); - /// array_clone.add(index,1); //randomly at one to an element in the array. - /// }); - /// let sum = array.blocking_sum(); - /// assert_eq!(array.len()*num_pes,sum); - /// ``` - pub fn blocking_sum(&self) -> Option { - if std::thread::current().id() != *crate::MAIN_THREAD { - let msg = format!(" - [LAMELLAR WARNING] You are calling `AtomicArray::blocking_sum` from within an async context which may lead to deadlock, it is recommended that you use `sum().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() - ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), - } - } - self.block_on(match self { - AtomicArray::NativeAtomicArray(array) => array.sum(), - AtomicArray::GenericAtomicArray(array) => array.sum(), - }) - } - #[doc(alias("One-sided", "onesided"))] /// Perform a production reduction on the entire distributed array, returning the value to the calling PE. /// @@ -1482,7 +1383,8 @@ impl AtomicArray { /// but performing a reduction could result in safe but non deterministic results. /// /// In Lamellar converting to a [ReadOnlyArray] before the reduction is a straightforward workaround to enusre the data is not changing during the reduction. - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][AmHandle::spawn] or [blocked on][AmHandle::block] /// # Examples /// ``` /// use lamellar::array::prelude::*; @@ -1497,64 +1399,13 @@ impl AtomicArray { /// let prod = array.block_on(array.prod()); /// assert_eq!((1..=array.len()).product::(),prod); ///``` + #[must_use = "this function is lazy and does nothing unless awaited. 
Either await the returned future, or call 'spawn()' or 'block()' on it "] pub fn prod(&self) -> AmHandle> { match self { AtomicArray::NativeAtomicArray(array) => array.prod(), AtomicArray::GenericAtomicArray(array) => array.prod(), } } - - #[doc(alias("One-sided", "onesided"))] - /// Perform a production reduction on the entire distributed array, returning the value to the calling PE. - /// - /// This equivalent to `reduce("prod")`. - /// - /// # One-sided Operation - /// The calling PE is responsible for launching `Prod` active messages on the other PEs associated with the array. - /// the returned prod reduction result is only available on the calling PE - /// - /// # Safety - /// One thing to consider is that due to being a one sided reduction, safety is only gauranteed with respect to Atomicity of individual elements, - /// not with respect to the entire global array. This means that while one PE is performing a reduction, other PEs can atomically update their local - /// elements. While this is technically safe with respect to the integrity of an indivdual element (and with respect to the compiler), - /// it may not be your desired behavior. - /// - /// To be clear this behavior is not an artifact of lamellar, but rather the language itself, - /// for example if you have an `Arc>` shared on multiple threads, you could safely update the elements from each thread, - /// but performing a reduction could result in safe but non deterministic results. - /// - /// In Lamellar converting to a [ReadOnlyArray] before the reduction is a straightforward workaround to enusre the data is not changing during the reduction. - /// - /// # Examples - /// ``` - /// use lamellar::array::prelude::*; - /// let world = LamellarWorldBuilder::new().build(); - /// let num_pes = world.num_pes(); - /// let array = AtomicArray::::new(&world,10,Distribution::Block); - /// let req = array.dist_iter().enumerate().for_each(move |(i,elem)| { - /// elem.store(i+1); - /// }); - /// array.wait_all(); - /// array.barrier(); - /// let prod = array.blocking_prod(); - /// assert_eq!((1..=array.len()).product::(),prod); - ///``` - pub fn blocking_prod(&self) -> Option { - if std::thread::current().id() != *crate::MAIN_THREAD { - let msg = format!(" - [LAMELLAR WARNING] You are calling `AtomicArray::blocking_prod` from within an async context which may lead to deadlock, it is recommended that you use `prod().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() - ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), - } - } - self.block_on(match self { - AtomicArray::NativeAtomicArray(array) => array.prod(), - AtomicArray::GenericAtomicArray(array) => array.prod(), - }) - } } impl AtomicArray { #[doc(alias("One-sided", "onesided"))] @@ -1577,7 +1428,8 @@ impl AtomicArray { /// but performing a reduction could result in safe but non deterministic results. /// /// In Lamellar converting to a [ReadOnlyArray] before the reduction is a straightforward workaround to enusre the data is not changing during the reduction. 
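With the `blocking_*` reduction helpers removed in these hunks, the remaining `reduce`/`sum`/`prod`/`max`/`min` methods all hand back lazy handles. The sketch below drives them with the `block()` and `block_on` calls shown in this patch; the array length and element type are arbitrary placeholders, not values taken from the examples.

```rust
use lamellar::array::prelude::*;

fn drive_lazy_reductions(world: &lamellar::LamellarWorld) {
    let array = AtomicArray::<usize>::new(world, 1_000, Distribution::Block);

    // Nothing executes until each reduction handle is driven.
    let sum = array.sum().block();          // block the calling thread
    let max = world.block_on(array.max());  // or await it on the scheduler
    println!("sum = {:?}, max = {:?}", sum, max);
}
```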
- /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][AmHandle::spawn] or [blocked on][AmHandle::block] /// # Examples /// ``` /// use lamellar::array::prelude::*; @@ -1588,6 +1440,7 @@ impl AtomicArray { /// let max = array.block_on(array.max()); /// assert_eq!((array.len()-1)*2,max); ///``` + #[must_use = "this function is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] pub fn max(&self) -> AmHandle> { match self { AtomicArray::NativeAtomicArray(array) => array.max(), @@ -1595,54 +1448,6 @@ impl AtomicArray { } } - #[doc(alias("One-sided", "onesided"))] - /// Find the max element in the entire destributed array, returning to the calling PE - /// - /// This equivalent to `reduce("max")`. - /// - /// # One-sided Operation - /// The calling PE is responsible for launching `Max` active messages on the other PEs associated with the array. - /// the returned max reduction result is only available on the calling PE - /// - /// # Safety - /// One thing to consider is that due to being a one sided reduction, safety is only gauranteed with respect to Atomicity of individual elements, - /// not with respect to the entire global array. This means that while one PE is performing a reduction, other PEs can atomically update their local - /// elements. While this is technically safe with respect to the integrity of an indivdual element (and with respect to the compiler), - /// it may not be your desired behavior. - /// - /// To be clear this behavior is not an artifact of lamellar, but rather the language itself, - /// for example if you have an `Arc>` shared on multiple threads, you could safely update the elements from each thread, - /// but performing a reduction could result in safe but non deterministic results. - /// - /// In Lamellar converting to a [ReadOnlyArray] before the reduction is a straightforward workaround to enusre the data is not changing during the reduction. - /// - /// # Examples - /// ``` - /// use lamellar::array::prelude::*; - /// let world = LamellarWorldBuilder::new().build(); - /// let num_pes = world.num_pes(); - /// let array = AtomicArray::::new(&world,10,Distribution::Block); - /// let req = array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2)); - /// let max = array.blocking_max(); - /// assert_eq!((array.len()-1)*2,max); - ///``` - pub fn blocking_max(&self) -> Option { - if std::thread::current().id() != *crate::MAIN_THREAD { - let msg = format!(" - [LAMELLAR WARNING] You are calling `AtomicArray::blocking_max` from within an async context which may lead to deadlock, it is recommended that you use `max().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() - ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), - } - } - self.block_on(match self { - AtomicArray::NativeAtomicArray(array) => array.max(), - AtomicArray::GenericAtomicArray(array) => array.max(), - }) - } - #[doc(alias("One-sided", "onesided"))] /// Find the min element in the entire destributed array, returning to the calling PE /// @@ -1664,6 +1469,9 @@ impl AtomicArray { /// /// In Lamellar converting to a [ReadOnlyArray] before the reduction is a straightforward workaround to enusre the data is not changing during the reduction. 
/// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][AmHandle::spawn] or [blocked on][AmHandle::block] + /// /// # Examples /// ``` /// use lamellar::array::prelude::*; @@ -1674,60 +1482,13 @@ impl AtomicArray { /// let min = array.block_on(array.min()); /// assert_eq!(0,min); ///``` + #[must_use = "this function is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] pub fn min(&self) -> AmHandle> { match self { AtomicArray::NativeAtomicArray(array) => array.min(), AtomicArray::GenericAtomicArray(array) => array.min(), } } - - #[doc(alias("One-sided", "onesided"))] - /// Find the min element in the entire destributed array, returning to the calling PE - /// - /// This equivalent to `reduce("min")`. - /// - /// # One-sided Operation - /// The calling PE is responsible for launching `Min` active messages on the other PEs associated with the array. - /// the returned min reduction result is only available on the calling PE - /// - /// # Safety - /// One thing to consider is that due to being a one sided reduction, safety is only gauranteed with respect to Atomicity of individual elements, - /// not with respect to the entire global array. This means that while one PE is performing a reduction, other PEs can atomically update their local - /// elements. While this is technically safe with respect to the integrity of an indivdual element (and with respect to the compiler), - /// it may not be your desired behavior. - /// - /// To be clear this behavior is not an artifact of lamellar, but rather the language itself, - /// for example if you have an `Arc>` shared on multiple threads, you could safely update the elements from each thread, - /// but performing a reduction could result in safe but non deterministic results. - /// - /// In Lamellar converting to a [ReadOnlyArray] before the reduction is a straightforward workaround to enusre the data is not changing during the reduction. - /// - /// # Examples - /// ``` - /// use lamellar::array::prelude::*; - /// let world = LamellarWorldBuilder::new().build(); - /// let num_pes = world.num_pes(); - /// let array = AtomicArray::::new(&world,10,Distribution::Block); - /// let req = array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2)); - /// let min = array.blocking_min(); - /// assert_eq!(0,min); - ///``` - pub fn blocking_min(&self) -> Option { - if std::thread::current().id() != *crate::MAIN_THREAD { - let msg = format!(" - [LAMELLAR WARNING] You are calling `AtomicArray::blocking_min` from within an async context which may lead to deadlock, it is recommended that you use `min().await;` instead! 
- Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() - ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), - } - } - self.block_on(match self { - AtomicArray::NativeAtomicArray(array) => array.min(), - AtomicArray::GenericAtomicArray(array) => array.min(), - }) - } } impl LamellarWrite for AtomicArray {} diff --git a/src/array/generic_atomic/rdma.rs b/src/array/generic_atomic/rdma.rs index 0d182608..60d3ea08 100644 --- a/src/array/generic_atomic/rdma.rs +++ b/src/array/generic_atomic/rdma.rs @@ -18,7 +18,7 @@ impl LamellarArrayInternalGet for GenericAtomicArray { buf: buf.into(), }); ArrayRdmaHandle { - _array: self.as_lamellar_byte_array(), + array: self.as_lamellar_byte_array(), reqs: VecDeque::from([req.into()]), } } @@ -30,7 +30,7 @@ impl LamellarArrayInternalGet for GenericAtomicArray { buf: buf.clone().into(), }); ArrayRdmaAtHandle { - _array: self.as_lamellar_byte_array(), + array: self.as_lamellar_byte_array(), req: Some(req), buf: buf, } @@ -46,7 +46,7 @@ impl LamellarArrayGet for GenericAtomicArray { match buf.team_try_into(&self.array.team_rt()) { Ok(buf) => self.internal_get(index, buf), Err(_) => ArrayRdmaHandle { - _array: self.as_lamellar_byte_array(), + array: self.as_lamellar_byte_array(), reqs: VecDeque::new(), }, } @@ -68,7 +68,7 @@ impl LamellarArrayInternalPut for GenericAtomicArray { buf: buf.into(), }); ArrayRdmaHandle { - _array: self.as_lamellar_byte_array(), + array: self.as_lamellar_byte_array(), reqs: VecDeque::from([req.into()]), } } @@ -83,7 +83,7 @@ impl LamellarArrayPut for GenericAtomicArray { match buf.team_try_into(&self.array.team_rt()) { Ok(buf) => self.internal_put(index, buf), Err(_) => ArrayRdmaHandle { - _array: self.as_lamellar_byte_array(), + array: self.as_lamellar_byte_array(), reqs: VecDeque::new(), }, } diff --git a/src/array/global_lock_atomic.rs b/src/array/global_lock_atomic.rs index 33f464e6..35fbdb44 100644 --- a/src/array/global_lock_atomic.rs +++ b/src/array/global_lock_atomic.rs @@ -1225,7 +1225,23 @@ impl ArrayPrint for GlobalLockArray { #[pin_project] pub struct GlobalLockArrayReduceHandle { req: AmHandle>, - lock_guard: GlobalRwDarcReadGuard<()>, + lock_guard: GlobalLockReadGuard, +} + +impl GlobalLockArrayReduceHandle { + /// This method will spawn the associated Array Reduce Operation on the work queue, + /// initiating the remote operation. + /// + /// This function returns a handle that can be used to wait for the operation to complete + #[must_use = "this function returns a future used to poll for completion and retrieve the result. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] + pub fn spawn(self) -> LamellarTask> { + self.lock_guard.array.clone().spawn(self) + } + + /// This method will block the caller until the associated Array Reduce Operation completes + pub fn block(self) -> Option { + self.lock_guard.array.clone().block_on(self) + } } impl LamellarRequest for GlobalLockArrayReduceHandle { @@ -1264,7 +1280,8 @@ impl GlobalLockReadGuard { /// /// # Safety /// the global read lock ensures atomicity of the entire array, i.e. 
individual elements can not being modified before the call completes - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][GlobalLockArrayReduceHandle::spawn] or [blocked on][GlobalLockArrayReduceHandle::block] /// # Examples /// ``` /// use lamellar::array::prelude::*; @@ -1277,53 +1294,12 @@ impl GlobalLockReadGuard { /// let read_guard = array.blocking_read_lock(); /// let prod = array.block_on(read_guard.reduce("prod")); ///``` + #[must_use = "this function is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] pub fn reduce(self, op: &str) -> GlobalLockArrayReduceHandle { GlobalLockArrayReduceHandle { req: self.array.array.reduce_data(op, self.array.clone().into()), - lock_guard: self.lock_guard.clone(), - } - } - - #[doc(alias("One-sided", "onesided"))] - /// Perform a reduction on the entire distributed array, returning the value to the calling PE. - /// - /// Please see the documentation for the [register_reduction] procedural macro for - /// more details and examples on how to create your own reductions. - /// - /// # One-sided Operation - /// The calling PE is responsible for launching `Reduce` active messages on the other PEs associated with the array. - /// the returned reduction result is only available on the calling PE - /// - /// # Safety - /// the global read lock ensures atomicity of the entire array, i.e. individual elements can not being modified before the call completes - /// - /// # Examples - /// ``` - /// use lamellar::array::prelude::*; - /// use rand::Rng; - /// - /// let world = LamellarWorldBuilder::new().build(); - /// let num_pes = world.num_pes(); - /// let array = GlobalLockArray::::new(&world,10,Distribution::Block); - /// array.block_on(array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2))); - /// let read_guard = array.blocking_read_lock(); - /// let prod = read_guard.blocking_reduce("prod"); - ///``` - pub fn blocking_reduce(self, op: &str) -> Option { - if std::thread::current().id() != *crate::MAIN_THREAD { - let msg = format!(" - [LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_reduce` from within an async context which may lead to deadlock, it is recommended that you use `reduce(...).await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() - ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), - } + lock_guard: self, } - self.array.block_on(GlobalLockArrayReduceHandle { - req: self.array.array.reduce_data(op, self.array.clone().into()), - lock_guard: self.lock_guard.clone(), - }) } } impl GlobalLockReadGuard { @@ -1338,7 +1314,8 @@ impl GlobalLockReadGuard { /// /// # Safety /// the global read lock ensures atomicity of the entire array, i.e. individual elements can not being modified before the call completes - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][GlobalLockArrayReduceHandle::spawn] or [blocked on][GlobalLockArrayReduceHandle::block] /// # Examples /// ``` /// use lamellar::array::prelude::*; @@ -1350,37 +1327,11 @@ impl GlobalLockReadGuard { /// let read_guard = array.blocking_read_lock(); /// let sum = array.block_on(read_guard.sum()); /// ``` + #[must_use = "this function is lazy and does nothing unless awaited. 
Either await the returned future, or call 'spawn()' or 'block()' on it "] pub fn sum(self) -> GlobalLockArrayReduceHandle { self.reduce("sum") } - #[doc(alias("One-sided", "onesided"))] - /// Perform a sum reduction on the entire distributed array, returning the value to the calling PE. - /// - /// This equivalent to `reduce("sum")`. - /// - /// # One-sided Operation - /// The calling PE is responsible for launching `Sum` active messages on the other PEs associated with the array. - /// the returned sum reduction result is only available on the calling PE - /// - /// # Safety - /// the global read lock ensures atomicity of the entire array, i.e. individual elements can not being modified before the call completes - /// - /// # Examples - /// ``` - /// use lamellar::array::prelude::*; - /// use rand::Rng; - /// let world = LamellarWorldBuilder::new().build(); - /// let num_pes = world.num_pes(); - /// let array = GlobalLockArray::::new(&world,10,Distribution::Block); - /// array.block_on(array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2))); - /// let read_guard = array.blocking_read_lock(); - /// let sum = read_guard.blocking_sum(); - /// ``` - pub fn blocking_sum(self) -> Option { - self.blocking_reduce("sum") - } - #[doc(alias("One-sided", "onesided"))] /// Perform a production reduction on the entire distributed array, returning the value to the calling PE. /// @@ -1392,7 +1343,8 @@ impl GlobalLockReadGuard { /// /// # Safety /// the global read lock ensures atomicity of the entire array, i.e. individual elements can not being modified before the call completes - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][GlobalLockArrayReduceHandle::spawn] or [blocked on][GlobalLockArrayReduceHandle::block] /// # Examples /// ``` /// use lamellar::array::prelude::*; @@ -1404,36 +1356,10 @@ impl GlobalLockReadGuard { /// let prod = array.block_on(read_guard.prod()); /// assert_eq!((1..=array.len()).product::(),prod); ///``` + #[must_use = "this function is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] pub fn prod(self) -> GlobalLockArrayReduceHandle { self.reduce("prod") } - - #[doc(alias("One-sided", "onesided"))] - /// Perform a production reduction on the entire distributed array, returning the value to the calling PE. - /// - /// This equivalent to `reduce("prod")`. - /// - /// # One-sided Operation - /// The calling PE is responsible for launching `Prod` active messages on the other PEs associated with the array. - /// the returned prod reduction result is only available on the calling PE - /// - /// # Safety - /// the global read lock ensures atomicity of the entire array, i.e. 
individual elements can not being modified before the call completes - /// - /// # Examples - /// ``` - /// use lamellar::array::prelude::*; - /// let world = LamellarWorldBuilder::new().build(); - /// let num_pes = world.num_pes(); - /// let array = GlobalLockArray::::new(&world,10,Distribution::Block); - /// array.block_on(array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2))); - /// let read_guard = array.blocking_read_lock(); - /// let prod = read_guard.blocking_prod(); - /// assert_eq!((1..=array.len()).product::(),prod); - ///``` - pub fn blocking_prod(self) -> Option { - self.blocking_reduce("prod") - } } impl GlobalLockReadGuard { #[doc(alias("One-sided", "onesided"))] @@ -1447,7 +1373,8 @@ impl GlobalLockReadGuar /// /// # Safety /// the global read lock ensures atomicity of the entire array, i.e. individual elements can not being modified before the call completes - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][GlobalLockArrayReduceHandle::spawn] or [blocked on][GlobalLockArrayReduceHandle::block] /// # Examples /// ``` /// use lamellar::array::prelude::*; @@ -1459,37 +1386,11 @@ impl GlobalLockReadGuar /// let max = array.block_on(read_guard.max()); /// assert_eq!((array.len()-1)*2,max); ///``` + #[must_use = "this function is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] pub fn max(self) -> GlobalLockArrayReduceHandle { self.reduce("max") } - #[doc(alias("One-sided", "onesided"))] - /// Find the max element in the entire destributed array, returning to the calling PE - /// - /// This equivalent to `reduce("max")`. - /// - /// # One-sided Operation - /// The calling PE is responsible for launching `Max` active messages on the other PEs associated with the array. - /// the returned max reduction result is only available on the calling PE - /// - /// # Safety - /// the global read lock ensures atomicity of the entire array, i.e. individual elements can not being modified before the call completes - /// - /// # Examples - /// ``` - /// use lamellar::array::prelude::*; - /// let world = LamellarWorldBuilder::new().build(); - /// let num_pes = world.num_pes(); - /// let array = GlobalLockArray::::new(&world,10,Distribution::Block); - /// array.block_on(array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2))); - /// let read_guard = array.blocking_read_lock(); - /// let max = read_guard.blocking_max(); - /// assert_eq!((array.len()-1)*2,max); - ///``` - pub fn blocking_max(self) -> Option { - self.blocking_reduce("max") - } - #[doc(alias("One-sided", "onesided"))] /// Find the min element in the entire destributed array, returning to the calling PE /// @@ -1501,7 +1402,8 @@ impl GlobalLockReadGuar /// /// # Safety /// the global read lock ensures atomicity of the entire array, i.e. individual elements can not being modified before the call completes - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][GlobalLockArrayReduceHandle::spawn] or [blocked on][GlobalLockArrayReduceHandle::block] /// # Examples /// ``` /// use lamellar::array::prelude::*; @@ -1513,54 +1415,8 @@ impl GlobalLockReadGuar /// let min = array.block_on(read_guard.min()); /// assert_eq!(0,min); ///``` + #[must_use = "this function is lazy and does nothing unless awaited. 
Either await the returned future, or call 'spawn()' or 'block()' on it "] pub fn min(self) -> GlobalLockArrayReduceHandle { self.reduce("min") } - - #[doc(alias("One-sided", "onesided"))] - /// Find the min element in the entire destributed array, returning to the calling PE - /// - /// This equivalent to `reduce("min")`. - /// - /// # One-sided Operation - /// The calling PE is responsible for launching `Min` active messages on the other PEs associated with the array. - /// the returned min reduction result is only available on the calling PE - /// - /// # Safety - /// the global read lock ensures atomicity of the entire array, i.e. individual elements can not being modified before the call completes - /// - /// # Examples - /// ``` - /// use lamellar::array::prelude::*; - /// let world = LamellarWorldBuilder::new().build(); - /// let num_pes = world.num_pes(); - /// let array = GlobalLockArray::::new(&world,10,Distribution::Block); - /// array.block_on(array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2))); - /// let read_guard = array.blocking_read_lock(); - /// let min = read_guard.blocking_min(); - /// assert_eq!(0,min); - ///``` - pub fn blocking_min(self) -> Option { - self.blocking_reduce("min") - } } - -// impl LamellarArrayReduce -// for GlobalLockArray -// { -// fn get_reduction_op(&self, op: String) -> LamellarArcAm { -// self.array.get_reduction_op(op) -// } -// fn reduce(&self, op: &str) -> Box > { -// self.reduce(op) -// } -// fn sum(&self) -> Box > { -// self.sum() -// } -// fn max(&self) -> Box > { -// self.max() -// } -// fn prod(&self) -> Box > { -// self.prod() -// } -// } diff --git a/src/array/global_lock_atomic/rdma.rs b/src/array/global_lock_atomic/rdma.rs index 7a19c9d4..dfa53282 100644 --- a/src/array/global_lock_atomic/rdma.rs +++ b/src/array/global_lock_atomic/rdma.rs @@ -28,7 +28,7 @@ impl LamellarArrayInternalGet for GlobalLockArray { buf: buf.into(), }); ArrayRdmaHandle { - _array: self.as_lamellar_byte_array(), + array: self.as_lamellar_byte_array(), reqs: VecDeque::from([req.into()]), } } @@ -40,7 +40,7 @@ impl LamellarArrayInternalGet for GlobalLockArray { buf: buf.clone().into(), }); ArrayRdmaAtHandle { - _array: self.as_lamellar_byte_array(), + array: self.as_lamellar_byte_array(), req: Some(req), buf: buf, } @@ -56,7 +56,7 @@ impl LamellarArrayGet for GlobalLockArray { match buf.team_try_into(&self.array.team_rt()) { Ok(buf) => self.internal_get(index, buf), Err(_) => ArrayRdmaHandle { - _array: self.as_lamellar_byte_array(), + array: self.as_lamellar_byte_array(), reqs: VecDeque::new(), }, } @@ -78,7 +78,7 @@ impl LamellarArrayInternalPut for GlobalLockArray { buf: buf.into(), }); ArrayRdmaHandle { - _array: self.as_lamellar_byte_array(), + array: self.as_lamellar_byte_array(), reqs: VecDeque::from([req.into()]), } } @@ -93,7 +93,7 @@ impl LamellarArrayPut for GlobalLockArray { match buf.team_try_into(&self.array.team_rt()) { Ok(buf) => self.internal_put(index, buf), Err(_) => ArrayRdmaHandle { - _array: self.as_lamellar_byte_array(), + array: self.as_lamellar_byte_array(), reqs: VecDeque::new(), }, } diff --git a/src/array/handle.rs b/src/array/handle.rs index 210058d1..9dc48795 100644 --- a/src/array/handle.rs +++ b/src/array/handle.rs @@ -11,15 +11,32 @@ use crate::{ active_messaging::{AmHandle, LocalAmHandle}, array::LamellarByteArray, lamellar_request::LamellarRequest, + scheduler::LamellarTask, Dist, OneSidedMemoryRegion, RegisteredMemoryRegion, }; /// a task handle for an array rdma (put/get) operation pub struct 
ArrayRdmaHandle { - pub(crate) _array: LamellarByteArray, //prevents prematurely performing a local drop + pub(crate) array: LamellarByteArray, //prevents prematurely performing a local drop pub(crate) reqs: VecDeque>, } +impl ArrayRdmaHandle { + /// This method will spawn the associated Array RDMA Operation on the work queue, + /// initiating the remote operation. + /// + /// This function returns a handle that can be used to wait for the operation to complete + #[must_use = "this function returns a future used to poll for completion. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] + pub fn spawn(self) -> LamellarTask<()> { + self.array.team().spawn(self) + } + + /// This method will block the calling thread until the associated Array RDMA Operation completes + pub fn block(self) -> () { + self.array.team().block_on(self) + } +} + impl LamellarRequest for ArrayRdmaHandle { fn blocking_wait(mut self) -> Self::Output { for req in self.reqs.drain(0..) { @@ -56,11 +73,27 @@ impl Future for ArrayRdmaHandle { /// a task handle for an array rdma 'at' operation #[pin_project] pub struct ArrayRdmaAtHandle { - pub(crate) _array: LamellarByteArray, //prevents prematurely performing a local drop + pub(crate) array: LamellarByteArray, //prevents prematurely performing a local drop pub(crate) req: Option>, pub(crate) buf: OneSidedMemoryRegion, } +impl ArrayRdmaAtHandle { + /// This method will spawn the associated Array RDMA at Operation on the work queue, + /// initiating the remote operation. + /// + /// This function returns a handle that can be used to wait for the operation to complete + #[must_use = "this function returns a future used to poll for completion and retrieve the result. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] + pub fn spawn(self) -> LamellarTask { + self.array.team().spawn(self) + } + + /// This method will block the calling thread until the associated Array RDMA at Operation completes + pub fn block(self) -> T { + self.array.team().block_on(self) + } +} + impl LamellarRequest for ArrayRdmaAtHandle { fn blocking_wait(self) -> Self::Output { match self.req { diff --git a/src/array/iterator/distributed_iterator.rs b/src/array/iterator/distributed_iterator.rs index 5d64b18c..41cbb23f 100644 --- a/src/array/iterator/distributed_iterator.rs +++ b/src/array/iterator/distributed_iterator.rs @@ -49,7 +49,7 @@ use std::pin::Pin; use std::sync::Arc; macro_rules! consumer_impl { - ($name:ident<$($generics:ident),*>($($arg:ident : $arg_ty:ty),*); [$($return_type: tt)*]; [$($bounds:tt)+] ; [$($blocking_ret:tt)*]) => { + ($name:ident<$($generics:ident),*>($($arg:ident : $arg_ty:ty),*); [$($return_type: tt)*]; [$($bounds:tt)+] ) => { fn $name<$($generics),*>(&self, $($arg : $arg_ty),*) -> $($return_type)* where $($bounds)+ @@ -68,48 +68,6 @@ macro_rules! 
consumer_impl { { self.as_inner().[<$name _with_schedule>](sched, $($arg),*) } - - // fn []<$($generics),*>( - // &self, - // $($arg : $arg_ty),* - // ) -> LamellarTask<$($blocking_ret)*> - // where - // $($bounds)+ - // { - // self.as_inner().[]($($arg),*) - // } - - // fn []<$($generics),*>( - // &self, - // sched: Schedule, - // $($arg : $arg_ty),* - // ) -> LamellarTask<$($blocking_ret)*> - // where - // $($bounds)+ - // { - // self.as_inner().[](sched, $($arg),*) - // } - - // fn []<$($generics),*>( - // &self, - // $($arg : $arg_ty),* - // ) -> $($blocking_ret)* - // where - // $($bounds)+ - // { - // self.as_inner().[]($($arg),*) - // } - - // fn []<$($generics),*>( - // &self, - // sched: Schedule, - // $($arg : $arg_ty),* - // ) -> $($blocking_ret)* - // where - // $($bounds)+ - // { - // self.as_inner().[](sched, $($arg),*) - // } } }; } @@ -119,44 +77,39 @@ pub trait DistIteratorLauncher: InnerArray { consumer_impl!( for_each(iter: &I, op: F); [DistIterForEachHandle]; - [I: DistributedIterator + 'static, F: Fn(I::Item) + SyncSend + Clone + 'static]; - [()] + [I: DistributedIterator + 'static, F: Fn(I::Item) + SyncSend + Clone + 'static] + ); consumer_impl!( - for_each_async(iter: &I, op: F); - [DistIterForEachHandle]; - [I: DistributedIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, Fut: Future + Send + 'static]; - [()]); + for_each_async(iter: &I, op: F); + [DistIterForEachHandle]; + [I: DistributedIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, Fut: Future + Send + 'static] + ); consumer_impl!( reduce(iter: &I, op: F); [DistIterReduceHandle]; - [I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static]; - [Option]); + [I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static]); consumer_impl!( collect(iter: &I, d: Distribution); [DistIterCollectHandle]; - [I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static]; - [A]); + [I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static]); consumer_impl!( collect_async(iter: &I, d: Distribution); [DistIterCollectHandle]; - [I: DistributedIterator + 'static, I::Item: Future + Send + 'static,B: Dist + ArrayOps,A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static,]; - [A]); + [I: DistributedIterator + 'static, I::Item: Future + Send + 'static,B: Dist + ArrayOps,A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static,]); consumer_impl!( count(iter: &I); [DistIterCountHandle]; - [I: DistributedIterator + 'static ]; - [usize]); + [I: DistributedIterator + 'static ]); consumer_impl!( sum(iter: &I); [DistIterSumHandle]; - [I: DistributedIterator + 'static, I::Item: Dist + ArrayOps + std::iter::Sum, ]; - [I::Item]); + [I: DistributedIterator + 'static, I::Item: Dist + ArrayOps + std::iter::Sum, ]); //#[doc(hidden)] fn global_index_from_local(&self, index: usize, chunk_size: usize) -> Option { @@ -389,66 +342,6 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { self.array().for_each(self, op) } - // /// Calls a closure on each element of a Distributed Iterator in parallel and distributed on each PE (which owns data of the iterated array). 
- // /// - // /// Calling this function invokes an implicit barrier across all PEs in the Array - // /// - // /// This call utilizes the [Schedule::Static][crate::array::iterator::Schedule] policy. - // /// - // /// This function returns a future which can be used to poll for completion of the iteration. - // /// # Note - // /// Calling this function launches the iteration regardless of if the returned future is used or not. - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// - // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - // /// - // /// let _ = array - // /// .dist_iter() - // /// .for_each(move |elem| println!("{:?} {elem}",std::thread::current().id())); - // /// array.wait_all(); //wait for the iteration to complete - // /// - // ///``` - // #[must_use = "The iteration has already been launched. Await this future to wait for completion and retrieve the result. - // You can use 'let _ = spawn_[iterator]` to supress the warning, but likely will want to also call '.wait_all()' at - // somepoint to ensure the iteration has completed"] - // fn spawn_for_each(&self, op: F) -> LamellarTask<()> - // where - // F: Fn(Self::Item) + SyncSend + Clone + 'static, - // { - // self.array().spawn_for_each(self, op) - // } - - // /// Calls a closure on each element of a Distributed Iterator in parallel and distributed on each PE (which owns data of the iterated array). - // /// - // /// Calling this function invokes an implicit barrier across all PEs in the Array - // /// - // /// This call utilizes the [Schedule::Static][crate::array::iterator::Schedule] policy. - // /// - // /// The iteration will have been completed by the time this function returns - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// - // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - // /// - // /// array - // /// .dist_iter() - // /// .blocking_for_each(move |elem| println!("{:?} {elem}",std::thread::current().id())) - // /// ); - // ///``` - // fn blocking_for_each(&self, op: F) - // where - // F: Fn(Self::Item) + SyncSend + Clone + 'static, - // { - // self.array().blocking_for_each(self, op) - // } - /// Calls a closure and immediately awaits the result on each element of a Distributed Iterator in parallel and distributed on each PE (which owns data of the iterated array). /// /// Calling this function invokes an implicit barrier across all PEs in the Array @@ -488,84 +381,6 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { self.array().for_each_async(self, op) } - // /// Calls a closure and immediately awaits the result on each element of a Distributed Iterator in parallel and distributed on each PE (which owns data of the iterated array). - // /// - // /// Calling this function invokes an implicit barrier across all PEs in the Array - // /// - // /// The supplied closure must return a future. - // /// - // /// Each thread will only drive a single future at a time. - // /// - // /// This function returns a future which can be used to poll for completion of the iteration. - // /// # Note - // /// Calling this function launches the iteration regardless of if the returned future is used or not. 
- // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// - // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - // /// - // /// let iter = array.dist_iter().spawn_for_each_async(|elem| async move { - // /// async_std::task::yield_now().await; - // /// println!("{:?} {elem}",std::thread::current().id()) - // /// }); - // /// world.block_on(iter); - // /// ``` - // /// essentially the for_each_async call gets converted into (on each thread) - // ///```ignore - // /// for fut in array.iter(){ - // /// fut.await; - // /// } - // ///``` - // #[must_use = "The iteration has already been launched. Await this future to wait for completion and retrieve the result. - // You can use 'let _ = spawn_[iterator]` to supress the warning, but likely will want to also call '.wait_all()' at - // somepoint to ensure the iteration has completed"] - // fn spawn_for_each_async(&self, op: F) -> LamellarTask<()> - // where - // F: Fn(Self::Item) -> Fut + SyncSend + Clone + 'static, - // Fut: Future + Send + 'static, - // { - // self.array().spawn_for_each_async(self, op) - // } - - // /// Calls a closure and immediately awaits the result on each element of a Distributed Iterator in parallel and distributed on each PE (which owns data of the iterated array). - // /// - // /// Calling this function invokes an implicit barrier across all PEs in the Array - // /// - // /// The supplied closure must return a future. - // /// - // /// Each thread will only drive a single future at a time. - // /// - // /// Iteration is completed by the time this function returns - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// - // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - // /// - // /// array.dist_iter().blocking_for_each_async(|elem| async move { - // /// async_std::task::yield_now().await; - // /// println!("{:?} {elem}",std::thread::current().id()) - // /// }); - // /// ``` - // /// essentially the for_each_async call gets converted into (on each thread) - // ///```ignore - // /// for fut in array.iter(){ - // /// fut.await; - // /// } - // ///``` - // fn blocking_for_each_async(&self, op: F) - // where - // F: Fn(Self::Item) -> Fut + SyncSend + Clone + 'static, - // Fut: Future + Send + 'static, - // { - // self.array().blocking_for_each_async(self, op) - // } - /// Calls a closure on each element of a Distributed Iterator in parallel and distributed on each PE (which owns data of the iterated array) using the specififed [Schedule][crate::array::iterator::Schedule] policy. /// /// Calling this function invokes an implicit barrier across all PEs in the Array @@ -590,56 +405,6 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { self.array().for_each_with_schedule(sched, self, op) } - // /// Calls a closure on each element of a Distributed Iterator in parallel and distributed on each PE (which owns data of the iterated array) using the specififed [Schedule][crate::array::iterator::Schedule] policy. - // /// - // /// Calling this function invokes an implicit barrier across all PEs in the Array - // /// - // /// This function returns a future which can be used to poll for completion of the iteration. - // /// # Note - // /// Calling this function launches the iteration regardless of if the returned future is used or not. 
- // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// - // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - // /// - // /// array.dist_iter().for_each_with_schedule(Schedule::WorkStealing, |elem| println!("{:?} {elem}",std::thread::current().id())); - // /// array.wait_all(); - // ///``` - // #[must_use = "The iteration has already been launched. Await this future to wait for completion and retrieve the result. - // You can use 'let _ = spawn_[iterator]` to supress the warning, but likely will want to also call '.wait_all()' at - // somepoint to ensure the iteration has completed"] - // fn spawn_for_each_with_schedule(&self, sched: Schedule, op: F) -> LamellarTask<()> - // where - // F: Fn(Self::Item) + SyncSend + Clone + 'static, - // { - // self.array().spawn_for_each_with_schedule(sched, self, op) - // } - - // /// Calls a closure on each element of a Distributed Iterator in parallel and distributed on each PE (which owns data of the iterated array) using the specififed [Schedule][crate::array::iterator::Schedule] policy. - // /// - // /// Calling this function invokes an implicit barrier across all PEs in the Array - // /// - // /// Iteration is completed by the time this function returns - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// - // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - // /// - // /// array.dist_iter().blocking_for_each_with_schedule(Schedule::WorkStealing, |elem| println!("{:?} {elem}",std::thread::current().id())); - // ///``` - // fn blocking_for_each_with_schedule(&self, sched: Schedule, op: F) - // where - // F: Fn(Self::Item) + SyncSend + Clone + 'static, - // { - // self.array().blocking_for_each_with_schedule(sched, self, op) - // } - /// Calls a closure and immediately awaits the result on each element of a Distributed Iterator in parallel and distributed on each PE (which owns data of the iterated array) using the specififed [Schedule][crate::array::iterator::Schedule] policy. /// /// Calling this function invokes an implicit barrier across all PEs in the Array, after this barrier no further communication is performed @@ -675,74 +440,6 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { self.array().for_each_async_with_schedule(sched, self, op) } - // /// Calls a closure and immediately awaits the result on each element of a Distributed Iterator in parallel and distributed on each PE (which owns data of the iterated array) using the specififed [Schedule][crate::array::iterator::Schedule] policy. - // /// - // /// Calling this function invokes an implicit barrier across all PEs in the Array, after this barrier no further communication is performed - // /// as each PE will only process elements local to itself - // /// - // /// The supplied closure must return a future. - // /// - // /// Each thread will only drive a single future at a time. - // /// - // /// This function returns a future which can be used to poll for completion of the iteration. - // /// # Note - // /// Calling this function launches the iteration regardless of if the returned future is used or not. 
- // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// - // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - // /// - // /// array.dist_iter().spawn_for_each_async_with_schedule(Schedule::Chunk(10),|elem| async move { - // /// async_std::task::yield_now().await; - // /// println!("{:?} {elem}",std::thread::current().id()) - // /// }); - // /// array.wait_all(); - // ///``` - // #[must_use = "The iteration has already been launched. Await this future to wait for completion and retrieve the result. - // You can use 'let _ = spawn_[iterator]` to supress the warning, but likely will want to also call '.wait_all()' at - // somepoint to ensure the iteration has completed"] - // fn spawn_for_each_async_with_schedule(&self, sched: Schedule, op: F) -> LamellarTask<()> - // where - // F: Fn(Self::Item) -> Fut + SyncSend + Clone + 'static, - // Fut: Future + Send + 'static, - // { - // self.array().spawn_for_each_async_with_schedule(sched, self, op) - // } - - // /// Calls a closure and immediately awaits the result on each element of a Distributed Iterator in parallel and distributed on each PE (which owns data of the iterated array) using the specififed [Schedule][crate::array::iterator::Schedule] policy. - // /// - // /// Calling this function invokes an implicit barrier across all PEs in the Array, after this barrier no further communication is performed - // /// as each PE will only process elements local to itself - // /// - // /// The supplied closure must return a future. - // /// - // /// Each thread will only drive a single future at a time. - // /// - // /// Iteration is completed by the time this function returns - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// - // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - // /// - // /// array.dist_iter().blocking_for_each_async_with_schedule(Schedule::Chunk(10),|elem| async move { - // /// async_std::task::yield_now().await; - // /// println!("{:?} {elem}",std::thread::current().id()) - // /// }); - // ///``` - // fn blocking_for_each_async_with_schedule(&self, sched: Schedule, op: F) - // where - // F: Fn(Self::Item) -> Fut + SyncSend + Clone + 'static, - // Fut: Future + Send + 'static, - // { - // self.array().blocking_for_each_async_with_schedule(sched, self, op) - // } - /// Reduces the elements of the dist iterator using the provided closure /// /// This function returns a future which needs to be driven to completion to retrieve the reduced value. @@ -770,59 +467,6 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { self.array().reduce(self, op) } - // /// Reduces the elements of the dist iterator using the provided closure - // /// - // /// This function returns a future which needs to be driven to completion to retrieve the reduced value. - // /// - // /// This call utilizes the [Schedule::Static][crate::array::iterator::Schedule] policy. - // /// # Note - // /// Calling this function launches the iteration regardless of if the returned future is used or not. 
- // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// - // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - // /// - // /// let req = array.dist_iter().spawn_reduce(|acc,elem| acc+elem); - // /// let sum = array.block_on(req); //wait on the collect request to get the new array - // ///``` - // #[must_use = "The iteration has already been launched. Await this future to wait for completion and retrieve the result. - // You can use 'let _ = spawn_[iterator]` to supress the warning, but likely will want to also call '.wait_all()' at - // somepoint to ensure the iteration has completed"] - // fn spawn_reduce(&self, op: F) -> LamellarTask> - // where - // // &'static Self: LocalIterator + 'static, - // Self::Item: Dist + ArrayOps, - // F: Fn(Self::Item, Self::Item) -> Self::Item + SyncSend + Clone + 'static, - // { - // self.array().spawn_reduce(self, op) - // } - - // /// Reduces the elements of the dist iterator using the provided closure - // /// - // /// The function returns the reduced value - // /// - // /// This call utilizes the [Schedule::Static][crate::array::iterator::Schedule] policy. - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// - // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - // /// - // /// let req = array.dist_iter().blocking_reduce(|acc,elem| acc+elem); - // ///``` - // fn blocking_reduce(&self, op: F) -> Option - // where - // // &'static Self: LocalIterator + 'static, - // Self::Item: Dist + ArrayOps, - // F: Fn(Self::Item, Self::Item) -> Self::Item + SyncSend + Clone + 'static, - // { - // self.array().blocking_reduce(self, op) - // } - /// Reduces the elements of the dist iterator using the provided closure and [Schedule][crate::array::iterator::Schedule] policy /// /// This function returns a future which needs to be driven to completion to retrieve the reduced value. @@ -849,52 +493,6 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { self.array().reduce_with_schedule(sched, self, op) } - // /// Reduces the elements of the dist iterator using the provided closure and [Schedule][crate::array::iterator::Schedule] policy - // /// - // /// This function returns a future which needs to be driven to completion to retrieve the reduced value. - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// - // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - // /// - // /// let req = array.dist_iter().reduce_with_schedule(Schedule::Static,|acc,elem| acc+elem); - // /// let sum = array.block_on(req); //wait on the collect request to get the new array - // ///``` - // #[must_use = "this iteration adapter is lazy and does nothing unless awaited. 
Either await the returned future, or call 'spawn()' or 'block()' on it "] - // fn spawn_reduce_with_schedule(&self, sched: Schedule, op: F) -> DistIterReduceHandle - // where - // // &'static Self: LocalIterator + 'static, - // Self::Item: Dist + ArrayOps, - // F: Fn(Self::Item, Self::Item) -> Self::Item + SyncSend + Clone + 'static, - // { - // self.array().reduce_with_schedule(sched, self, op) - // } - - // /// Reduces the elements of the dist iterator using the provided closure and [Schedule][crate::array::iterator::Schedule] policy - // /// - // /// This function returns the reduced value. - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// - // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - // /// - // /// let req = array.dist_iter().blocking_reduce_with_schedule(Schedule::Static,|acc,elem| acc+elem);//wait on the collect request to get the new array - // ///``` - // fn blocking_reduce_with_schedule(&self, sched: Schedule, op: F) -> Option - // where - // // &'static Self: LocalIterator + 'static, - // Self::Item: Dist + ArrayOps, - // F: Fn(Self::Item, Self::Item) -> Self::Item + SyncSend + Clone + 'static, - // { - // self.array().blocking_reduce_with_schedule(sched, self, op) - // } - /// Collects the elements of the distributed iterator into a new LamellarArray /// /// Calling this function invokes an implicit barrier across all PEs in the Array. @@ -932,39 +530,6 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { self.array().collect(self, d) } - // /// Collects the elements of the distributed iterator into a new LamellarArray - // /// - // /// Calling this function invokes an implicit barrier across all PEs in the Array. - // /// - // /// This function returns the new LamellarArray upon completion. - // /// - // /// Creating the new array potentially results in data transfers depending on the distribution mode and the fact there is no gaurantee - // /// that each PE will contribute an equal number of elements to the new array, and currently LamellarArrays - // /// distribute data across the PEs as evenly as possible. - // /// - // /// This call utilizes the [Schedule::Static][crate::array::iterator::Schedule] policy. - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// - // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - // /// - // /// let new_array = array.dist_iter() - // /// .map(|elem| *elem) //because of constraints of collect we need to convert from &usize to usize - // /// .filter(|elem| *elem < 10) // (if we didnt do the previous map we would have needed to do **elem) - // /// .blocking_collect::>(Distribution::Block); - // ///``` - // fn blocking_collect(&self, d: Distribution) -> A - // where - // // &'static Self: DistributedIterator + 'static, - // Self::Item: Dist + ArrayOps, - // A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - // { - // self.array().blocking_collect(self, d) - // } - /// Collects the elements of the distributed iterator into a new LamellarArray, using the provided [Schedule][crate::array::iterator::Schedule] policy /// /// Calling this function invokes an implicit barrier across all PEs in the Array. 
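With the commented-out `blocking_collect` variants deleted, the pattern from the surviving doc examples is to build the iterator chain and then drive the returned collect handle explicitly. A minimal sketch, reusing the `map`/`filter` chain from the deleted comment block and assuming `block_on` drives the handle to completion:

```
use lamellar::array::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();
    let array: ReadOnlyArray<usize> = ReadOnlyArray::new(&world, 100, Distribution::Block);

    // collect requires owned items, so convert &usize to usize before filtering
    let new_array = world.block_on(
        array
            .dist_iter()
            .map(|elem| *elem)
            .filter(|elem| *elem < 10)
            .collect::<ReadOnlyArray<usize>>(Distribution::Block),
    );
    println!("collected {} elements", new_array.len());
}
```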
@@ -1004,38 +569,6 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { self.array().collect_with_schedule(sched, self, d) } - // /// Collects the elements of the distributed iterator into a new LamellarArray, using the provided [Schedule][crate::array::iterator::Schedule] policy - // /// - // /// Calling this function invokes an implicit barrier across all PEs in the Array. - // /// - // /// This function returns the new LamellarArray upon completion. - // /// - // /// Creating the new array potentially results in data transfers depending on the distribution mode and the fact there is no gaurantee - // /// that each PE will contribute an equal number of elements to the new array, and currently LamellarArrays - // /// distribute data across the PEs as evenly as possible. - // /// - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// - // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - // /// - // /// let new_array = array.dist_iter() - // /// .map(|elem| *elem) //because of constraints of collect we need to convert from &usize to usize - // /// .filter(|elem| *elem < 10) // (if we didnt do the previous map we would have needed to do **elem) - // /// .blocking_collect_with_scheduler::>(Schedule::Dynamic, Distribution::Block); - // ///``` - // fn blocking_collect_with_schedule(&self,sched: Schedule, d: Distribution) -> A - // where - // // &'static Self: DistributedIterator + 'static, - // Self::Item: Dist + ArrayOps, - // A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - // { - // self.array().blocking_collect_with_schedule(sched,self, d) - // } - /// Collects the awaited elements of the distributed iterator into a new LamellarArray /// /// Calling this function invokes an implicit barrier across all PEs in the Array. @@ -1084,50 +617,6 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { self.array().collect_async(self, d) } - // /// Collects the awaited elements of the distributed iterator into a new LamellarArray - // /// - // /// Calling this function invokes an implicit barrier across all PEs in the Array. - // /// - // /// Each element from the iterator must return a Future - // /// - // /// Each thread will only drive a single future at a time. - // /// - // /// The function returns the new LamellarArray upon completion. - // /// - // /// Creating the new array potentially results in data transfers depending on the distribution mode and the fact there is no gaurantee - // /// that each PE will contribute an equal number of elements to the new array, and currently LamellarArrays - // /// distribute data across the PEs as evenly as possible. 
- // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// // initialize a world and an atomic array - // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); - // /// - // /// // clone the array; this doesn't duplicate the underlying - // /// // data but it does create a second pointer that we can - // /// // discard when necessary - // /// let array_clone = array.clone(); - // /// - // /// // run collect - // /// let _new_array - // /// = array_clone.dist_iter().map( - // /// move |elem| - // /// array_clone - // /// .fetch_add(elem.load(),1000)) - // /// .blocking_collect_async::,_>(Distribution::Cyclic); - // ///``` - // fn blocking_collect_async(&self, d: Distribution) -> A - // where - // // &'static Self: DistributedIterator + 'static, - // T: Dist + ArrayOps, - // Self::Item: Future + Send + 'static, - // A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - // { - // self.array().blocking_collect_async(self, d) - // } - /// Collects the awaited elements of the distributed iterator into a new LamellarArray, using the provided [Schedule][crate::array::iterator::Schedule] policy /// /// Calling this function invokes an implicit barrier across all PEs in the Array. @@ -1180,50 +669,6 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { self.array().collect_async_with_schedule(sched, self, d) } - // /// Collects the awaited elements of the distributed iterator into a new LamellarArray,using the provided [Schedule][crate::array::iterator::Schedule] policy - // /// - // /// Calling this function invokes an implicit barrier across all PEs in the Array. - // /// - // /// Each element from the iterator must return a Future - // /// - // /// Each thread will only drive a single future at a time. - // /// - // /// The function returns the new LamellarArray upon completion. - // /// - // /// Creating the new array potentially results in data transfers depending on the distribution mode and the fact there is no gaurantee - // /// that each PE will contribute an equal number of elements to the new array, and currently LamellarArrays - // /// distribute data across the PEs as evenly as possible. 
- // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// // initialize a world and an atomic array - // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); - // /// - // /// // clone the array; this doesn't duplicate the underlying - // /// // data but it does create a second pointer that we can - // /// // discard when necessary - // /// let array_clone = array.clone(); - // /// - // /// // run collect - // /// let _new_array - // /// = array_clone.dist_iter().map( - // /// move |elem| - // /// array_clone - // /// .fetch_add(elem.load(),1000)) - // /// .blocking_collect_async::,_>(Distribution::Cyclic); - // ///``` - // fn blocking_collect_async_with_schedule(&self, sched: Schedule, d: Distribution) -> A - // where - // // &'static Self: DistributedIterator + 'static, - // T: Dist + ArrayOps, - // Self::Item: Future + Send + 'static, - // A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - // { - // self.array().blocking_collect_async_with_schedule(sched,self, d) - // } - /// Counts the number of the elements of the distriubted iterator /// /// Calling this function invokes an implicit barrier and distributed reduction across all PEs in the Array. @@ -1246,25 +691,6 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { self.array().count(self) } - // /// Counts the number of the elements of the distributed iterator - // /// - // /// Calling this function invokes an implicit barrier and distributed reduction across all PEs in the Array. - // /// - // /// This function returns the count upon completion. - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// - // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - // /// - // /// let cnt = array.dist_iter().filter(|elem| elem < 10).blocking_count(); - // ///``` - // fn blocking_count(&self) -> usize { - // self.array().blocking_count(self) - // } - /// Counts the number of the elements of the distriubted iterator, using the provided [Schedule][crate::array::iterator::Schedule] policy /// /// Calling this function invokes an implicit barrier and distributed reduction across all PEs in the Array. @@ -1286,25 +712,6 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { self.array().count_with_schedule(sched, self) } - // /// Counts the number of the elements of the distributed iterator, using the provided [Schedule][crate::array::iterator::Schedule] policy - // /// - // /// Calling this function invokes an implicit barrier and distributed reduction across all PEs in the Array. - // /// - // /// This function returns the count upon completion. - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// - // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - // /// - // /// let cnt = array.dist_iter().filter(|elem| elem < 10).blocking_count_with_schedule(Schedule::Dynamic); - // ///``` - // fn blocking_count_with_schedule(&self, sched: Schedule) -> usize { - // self.array().blocking_count_with_schedule(sched, self) - // } - /// Sums the elements of the distributed iterator. /// /// Takes each element, adds them together, and returns the result. 
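The scalar consumers follow suit: `count()` and `sum()` return lazy handles in place of the removed `blocking_count`/`blocking_sum` helpers. A sketch under those signatures; the `map` before `sum` reflects the `Item: std::iter::Sum` bound noted in the doc comments above:

```
use lamellar::array::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();
    let array: ReadOnlyArray<usize> = ReadOnlyArray::new(&world, 100, Distribution::Block);

    // count the elements below 10 across all PEs
    let cnt = world.block_on(array.dist_iter().filter(|elem| **elem < 10).count());
    // sum requires owned items that implement std::iter::Sum
    let sum = world.block_on(array.dist_iter().map(|elem| *elem).sum());
    println!("count: {cnt}, sum: {sum}");
}
```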
@@ -1334,32 +741,6 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { self.array().sum(self) } - // /// Sums the elements of the distributed iterator. - // /// - // /// Takes each element, adds them together, and returns the result. - // /// - // /// Calling this function invokes an implicit barrier and distributed reduction across all PEs in the Array. - // /// - // /// An empty iterator returns the zero value of the type. - // /// - // /// This function returns the sum upon completion. - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// - // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - // /// - // /// let sum = array.dist_iter().blocking_sum(); - // ///``` - // fn blocking_sum(&self) -> Self::Item - // where - // Self::Item: Dist + ArrayOps + std::iter::Sum, - // { - // self.array().blocking_sum(self) - // } - /// Sums the elements of the distributed iterator, using the specified [Schedule][crate::array::iterator::Schedule] policy /// /// Takes each element, adds them together, and returns the result. @@ -1388,32 +769,6 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { { self.array().sum_with_schedule(sched, self) } - - // /// Sums the elements of the distributed iterator, using the specified [Schedule][crate::array::iterator::Schedule] policy - // /// - // /// Takes each element, adds them together, and returns the result. - // /// - // /// Calling this function invokes an implicit barrier and distributed reduction across all PEs in the Array. - // /// - // /// An empty iterator returns the zero value of the type. - // /// - // /// This function returns the sum upon completion. - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// - // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - // /// - // /// let sum = array.dist_iter().blocking_sum_with_schedule(Schedule::Guided); - // ///``` - // fn blocking_sum_with_schedule(&self, sched: Schedule) -> Self::Item - // where - // Self::Item: Dist + ArrayOps + std::iter::Sum, - // { - // self.array().blocking_sum_with_schedule(sched, self) - // } } /// An interface for dealing with distributed iterators which are indexable, meaning it returns an iterator of known length diff --git a/src/array/iterator/distributed_iterator/consumer/collect.rs b/src/array/iterator/distributed_iterator/consumer/collect.rs index 549b7c23..01a32f2d 100644 --- a/src/array/iterator/distributed_iterator/consumer/collect.rs +++ b/src/array/iterator/distributed_iterator/consumer/collect.rs @@ -291,9 +291,17 @@ where state: State::Barrier(barrier_handle, inner), } } + + /// This method will block until the associated Collect operation completes and returns the result pub fn block(self) -> A { self.team.clone().block_on(self) } + + /// This method will spawn the associated Collect Operation on the work queue, + /// initiating the remote operation. + /// + /// This function returns a handle that can be used to wait for the operation to complete + #[must_use = "this function returns a future used to poll for completion and retrieve the result. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. 
Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] pub fn spawn(self) -> LamellarTask { self.team.clone().scheduler.spawn_task(self) } @@ -385,7 +393,7 @@ where I::Item: Dist + ArrayOps, A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { - async fn exec(&self) -> Vec { + async fn exec(&self) -> Vec<(usize, I::Item)> { let iter = self.schedule.init_iter(self.iter.iter_clone(Sealed)); iter.collect::>() } diff --git a/src/array/iterator/distributed_iterator/consumer/count.rs b/src/array/iterator/distributed_iterator/consumer/count.rs index a383d6fa..196cc810 100644 --- a/src/array/iterator/distributed_iterator/consumer/count.rs +++ b/src/array/iterator/distributed_iterator/consumer/count.rs @@ -212,9 +212,17 @@ impl DistIterCountHandle { state: State::Barrier(barrier_handle, inner), } } + + /// This method will block until the associated Count operation completes and returns the result pub fn block(self) -> usize { self.team.clone().block_on(self) } + + /// This method will spawn the associated Count Operation on the work queue, + /// initiating the remote operation. + /// + /// This function returns a handle that can be used to wait for the operation to complete + #[must_use = "this function returns a future used to poll for completion and retrieve the result. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] pub fn spawn(self) -> LamellarTask { self.team.clone().scheduler.spawn_task(self) } diff --git a/src/array/iterator/distributed_iterator/consumer/for_each.rs b/src/array/iterator/distributed_iterator/consumer/for_each.rs index 9c3b9e6e..b893e82a 100644 --- a/src/array/iterator/distributed_iterator/consumer/for_each.rs +++ b/src/array/iterator/distributed_iterator/consumer/for_each.rs @@ -215,9 +215,16 @@ impl DistIterForEachHandle { state: State::Barrier(barrier, reqs), } } + + /// This method will block until the associated For Each operation completes and returns the result pub fn block(self) { self.team.clone().block_on(self); } + /// This method will spawn the associated For Each Operation on the work queue, + /// initiating the remote operation. + /// + /// This function returns a handle that can be used to wait for the operation to complete + #[must_use = "this function returns a future used to poll for completion and retrieve the result. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] pub fn spawn(self) -> LamellarTask<()> { self.team.clone().scheduler.spawn_task(self) } diff --git a/src/array/iterator/distributed_iterator/consumer/reduce.rs b/src/array/iterator/distributed_iterator/consumer/reduce.rs index 7065481f..c0359b8f 100644 --- a/src/array/iterator/distributed_iterator/consumer/reduce.rs +++ b/src/array/iterator/distributed_iterator/consumer/reduce.rs @@ -337,9 +337,16 @@ where } } + /// This method will block until the associated Reduce operation completes and returns the result pub fn block(self) -> Option { self.team.clone().block_on(self) } + + /// This method will spawn the associated Reduce Operation on the work queue, + /// initiating the remote operation. 
+ /// + /// This function returns a handle that can be used to wait for the operation to complete + #[must_use = "this function returns a future used to poll for completion and retrieve the result. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] pub fn spawn(self) -> LamellarTask> { self.team.clone().scheduler.spawn_task(self) } diff --git a/src/array/iterator/distributed_iterator/consumer/sum.rs b/src/array/iterator/distributed_iterator/consumer/sum.rs index d235c5f1..75670599 100644 --- a/src/array/iterator/distributed_iterator/consumer/sum.rs +++ b/src/array/iterator/distributed_iterator/consumer/sum.rs @@ -216,9 +216,16 @@ where } } + /// This method will block until the associated Sum operation completes and returns the result pub fn block(self) -> T { self.team.clone().block_on(self) } + + /// This method will spawn the associated Sum Operation on the work queue, + /// initiating the remote operation. + /// + /// This function returns a handle that can be used to wait for the operation to complete + #[must_use = "this function returns a future used to poll for completion and retrieve the result. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] pub fn spawn(self) -> LamellarTask { self.team.clone().scheduler.spawn_task(self) } diff --git a/src/array/iterator/local_iterator.rs b/src/array/iterator/local_iterator.rs index 889f5a4d..7c30d0d7 100644 --- a/src/array/iterator/local_iterator.rs +++ b/src/array/iterator/local_iterator.rs @@ -49,7 +49,7 @@ use std::pin::Pin; use std::sync::Arc; macro_rules! consumer_impl { - ($name:ident<$($generics:ident),*>($($arg:ident : $arg_ty:ty),*); [$($return_type: tt)*]; [$($bounds:tt)+] ; [$(-> $($blocking_ret:tt)*)? ]) => { + ($name:ident<$($generics:ident),*>($($arg:ident : $arg_ty:ty),*); [$($return_type: tt)*]; [$($bounds:tt)+] ) => { fn $name<$($generics),*>(&self, $($arg : $arg_ty),*) -> $($return_type)* where $($bounds)+ @@ -68,27 +68,6 @@ macro_rules! consumer_impl { { self.as_inner().[<$name _with_schedule>](sched, $($arg),*) } - - // fn []<$($generics),*>( - // &self, - // $($arg : $arg_ty),* - // ) $(-> $($blocking_ret)*)? - // where - // $($bounds)+ - // { - // self.as_inner().[]($($arg),*) - // } - - // fn []<$($generics),*>( - // &self, - // sched: Schedule, - // $($arg : $arg_ty),* - // ) $(-> $($blocking_ret)*)? 
- // where - // $($bounds)+ - // { - // self.as_inner().[](sched, $($arg),*) - // } } }; } @@ -99,44 +78,37 @@ pub trait LocalIteratorLauncher: InnerArray { consumer_impl!( for_each(iter: &I, op: F); [LocalIterForEachHandle]; - [I: LocalIterator + 'static, F: Fn(I::Item) + SyncSend + Clone + 'static]; - [] + [I: LocalIterator + 'static, F: Fn(I::Item) + SyncSend + Clone + 'static] ); consumer_impl!( for_each_async(iter: &I, op: F); [LocalIterForEachHandle]; - [I: LocalIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, Fut: Future + Send + 'static]; - []); + [I: LocalIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, Fut: Future + Send + 'static]); consumer_impl!( reduce(iter: &I, op: F); [LocalIterReduceHandle]; - [I: LocalIterator + 'static, I::Item: SyncSend + Copy, F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static]; - [-> Option]); + [I: LocalIterator + 'static, I::Item: SyncSend + Copy, F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static]); consumer_impl!( collect(iter: &I, d: Distribution); [LocalIterCollectHandle]; - [I: LocalIterator + 'static, I::Item: Dist + ArrayOps, A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static]; - [-> A]); + [I: LocalIterator + 'static, I::Item: Dist + ArrayOps, A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static]); consumer_impl!( collect_async(iter: &I, d: Distribution); [LocalIterCollectHandle]; - [I: LocalIterator + 'static, I::Item: Future + Send + 'static,B: Dist + ArrayOps,A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static,]; - [-> A]); + [I: LocalIterator + 'static, I::Item: Future + Send + 'static,B: Dist + ArrayOps,A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static,]); consumer_impl!( count(iter: &I); [LocalIterCountHandle]; - [I: LocalIterator + 'static ]; - [-> usize]); + [I: LocalIterator + 'static ]); consumer_impl!( sum(iter: &I); [LocalIterSumHandle]; - [I: LocalIterator + 'static, I::Item: SyncSend + std::iter::Sum + for<'a> std::iter::Sum<&'a I::Item> , ]; - [-> I::Item]); + [I: LocalIterator + 'static, I::Item: SyncSend + std::iter::Sum + for<'a> std::iter::Sum<&'a I::Item> , ]); //#[doc(hidden)] fn local_global_index_from_local(&self, index: usize, chunk_size: usize) -> Option { @@ -363,32 +335,6 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { self.array().for_each(self, op) } - // /// Calls a closure on each element of a Local Iterator in parallel on the calling PE (the PE must have some local data of the array). - // /// - // /// This call utilizes the [Schedule::Static][crate::array::iterator::Schedule] policy. - // /// - // /// The iteration will be complete upon return from this function - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// - // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - // /// - // /// - // /// array - // /// .local_iter() - // /// .blocking_for_each(move |elem| println!("{:?} {elem}",std::thread::current().id())); - // /// - // ///``` - // fn blocking_for_each(&self, op: F) - // where - // F: Fn(Self::Item) + SyncSend + Clone + 'static, - // { - // self.array().blocking_for_each(self, op) - // } - /// Calls a closure on each element of a Local Iterator in parallel on the calling PE (the PE must have some local data of the array) using the specififed [Scehedule][crate::array::iterator::Schedule] policy. 
/// /// This function returns a future which can be used to poll for completion of the iteration. @@ -413,26 +359,6 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { self.array().for_each_with_schedule(sched, self, op) } - // /// Calls a closure on each element of a Local Iterator in parallel on the calling PE (the PE must have some local data of the array) using the specififed [Scehedule][crate::array::iterator::Schedule] policy. - // /// - // /// The iteration will be complete upon return from this function - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// - // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - // /// - // /// array.local_iter().blocking_for_each_with_schedule(Schedule::WorkStealing, |elem| println!("{:?} {elem}",std::thread::current().id())); - // ///``` - // fn blocking_for_each_with_schedule(&self, sched: Schedule, op: F) - // where - // F: Fn(Self::Item) + SyncSend + Clone + 'static, - // { - // self.array().blocking_for_each_with_schedule(sched, self, op) - // } - /// Calls a closure and immediately awaits the result on each element of a Local Iterator in parallel on the calling PE (the PE must have some local data of the array). /// /// This call utilizes the [Schedule::Static][crate::array::iterator::Schedule] policy. @@ -473,42 +399,6 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { self.array().for_each_async(self, op) } - // /// Calls a closure and immediately awaits the result on each element of a Local Iterator in parallel on the calling PE (the PE must have some local data of the array). - // /// - // /// This call utilizes the [Schedule::Static][crate::array::iterator::Schedule] policy. - // /// - // /// The supplied closure must return a future. - // /// - // /// Each thread will only drive a single future at a time. - // /// - // /// The iteration will have been completed by the time this function returns - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// - // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - // /// - // /// array.local_iter().blocking_for_each_async(|elem| async move { - // /// async_std::task::yield_now().await; - // /// println!("{:?} {elem}",std::thread::current().id()) - // /// }); - // /// ``` - // /// essentially the for_each_async call gets converted into (on each thread) - // ///```ignore - // /// for fut in array.iter(){ - // /// fut.await; - // /// } - // ///``` - // fn blocking_for_each_async(&self, op: F) - // where - // F: Fn(Self::Item) -> Fut + SyncSend + Clone + 'static, - // Fut: Future + Send + 'static, - // { - // self.array().blocking_for_each_async(self, op) - // } - /// Calls a closure on each element of a Local Iterator in parallel on the calling PE (the PE must have some local data of the array) using the specififed [Schedule][crate::array::iterator::Schedule] policy. /// /// The supplied closure must return a future. @@ -541,34 +431,6 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { self.array().for_each_async_with_schedule(sched, self, op) } - // /// Calls a closure on each element of a Local Iterator in parallel on the calling PE (the PE must have some local data of the array) using the specififed [Schedule][crate::array::iterator::Schedule] policy. - // /// - // /// The supplied closure must return a future. 
- // /// - // /// Each thread will only drive a single future at a time. - // /// - // /// The iteration will have been completed by the time this function returns - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// - // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - // /// - // /// array.local_iter().blocking_for_each_async_with_schedule(Schedule::Chunk(10),|elem| async move { - // /// async_std::task::yield_now().await; - // /// println!("{:?} {elem}",std::thread::current().id()) - // /// }); - // ///``` - // fn blocking_for_each_async_with_schedule(&self, sched: Schedule, op: F) - // where - // F: Fn(Self::Item) -> Fut + SyncSend + Clone + 'static, - // Fut: Future + Send + 'static, - // { - // self.array().blocking_for_each_async_with_schedule(sched, self, op) - // } - /// Reduces the elements of the local iterator using the provided closure /// /// This function returns a future which needs to be driven to completion to retrieve the reduced value. @@ -594,28 +456,6 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { self.array().reduce(self, op) } - // /// Reduces the elements of the local iterator using the provided closure - // /// - // /// This function returns the reduced value - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// - // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - // /// - // /// let sum = array.blocking_local_iter().reduce(|acc,elem| acc+elem); - // ///``` - // fn blocking_reduce(&self, op: F) -> Option - // where - // // &'static Self: LocalIterator + 'static, - // Self::Item: SyncSend + Copy, - // F: Fn(Self::Item, Self::Item) -> Self::Item + SyncSend + Clone + 'static, - // { - // self.array().blocking_reduce(self, op) - // } - /// Reduces the elements of the local iterator using the provided closure and specififed [Schedule][crate::array::iterator::Schedule] policy /// /// This function returns a future which needs to be driven to completion to retrieve the reduced value. @@ -645,32 +485,6 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { self.array().reduce_with_schedule(sched, self, op) } - // /// Reduces the elements of the local iterator using the provided closure and specififed [Schedule][crate::array::iterator::Schedule] policy - // /// - // /// This function returns the reduced value - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// - // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - // /// - // /// let sum = array.local_iter().blocking_reduce_with_schedule(Schedule::Chunk(10),|acc,elem| acc+elem); - // ///``` - // fn blocking_reduce_with_schedule( - // &self, - // sched: Schedule, - // op: F, - // ) -> Option - // where - // // &'static Self: LocalIterator + 'static, - // Self::Item: SyncSend + Copy, - // F: Fn(Self::Item, Self::Item) -> Self::Item + SyncSend + Clone + 'static, - // { - // self.array().blocking_reduce_with_schedule(sched, self, op) - // } - /// Collects the elements of the local iterator into the specified container type /// /// This function returns a future which needs to be driven to completion to retrieve the new container. 
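For context, a minimal sketch (not part of this patch; the array setup and closures are borrowed from the doc examples above, and the completion patterns follow the new `#[must_use]` messages) of the three ways an iteration handle can be driven: block on it, spawn it and later wait on all outstanding work, or await it from an async context:
```
use lamellar::array::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();
    let array: ReadOnlyArray<usize> = ReadOnlyArray::new(&world, 100, Distribution::Block);

    // 1) block the calling thread until the iteration completes
    array
        .local_iter()
        .for_each(|elem| println!("{:?} {elem}", std::thread::current().id()))
        .block();

    // 2) spawn the iteration onto the work queue; completion is then ensured
    //    by waiting on all outstanding work
    let _task = array
        .local_iter()
        .for_each(|elem| println!("{elem}"))
        .spawn();
    world.wait_all();

    // 3) await the handle from within an async context
    world.block_on(async move {
        array
            .local_iter()
            .for_each(|elem| println!("{elem}"))
            .await;
    });
}
```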
@@ -697,29 +511,6 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { self.array().collect(self, d) } - // /// Collects the elements of the local iterator into the specified container type - // /// - // /// This function returns the new container - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// - // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); - // /// - // /// let array_clone = array.clone(); - // /// let new_array = array.local_iter().map(elem.load()).filter(|elem| elem % 2 == 0).blocking_collect::>(Distribution::Cyclic); - // ///``` - // fn blocking_collect(&self, d: Distribution) ->A - // where - // // &'static Self: LocalIterator + 'static, - // Self::Item: Dist + ArrayOps, - // A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - // { - // self.array().blocking_collect(self, d) - // } - /// Collects the elements of the local iterator into the specified container type using the specified [Schedule][crate::array::iterator::Schedule] policy /// /// This function returns a future which needs to be driven to completion to retrieve the new container. @@ -750,34 +541,6 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { self.array().collect_with_schedule(sched, self, d) } - // /// Collects the elements of the local iterator into the specified container type using the specified [Schedule][crate::array::iterator::Schedule] policy - // /// - // /// This function returns the new container - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// - // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); - // /// - // /// let array_clone = array.clone(); - // /// let new_array = array.local_iter().map(elem.load()).filter(|elem| elem % 2 == 0).blocking_collect_with_schedule::>(Scheduler::WorkStealing,Distribution::Cyclic); - // /// - // ///`` - // fn blocking_collect_with_schedule( - // &self, - // sched: Schedule, - // d: Distribution, - // ) -> A - // where - // // &'static Self: LocalIterator + 'static, - // Self::Item: Dist + ArrayOps, - // A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - // { - // self.array().blocking_collect_with_schedule(sched, self, d) - // } - /// Collects the awaited elements of the local iterator into a new LamellarArray /// /// Calling this function invokes an implicit barrier across all PEs in the Array. @@ -826,50 +589,6 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { self.array().collect_async(self, d) } - // /// Collects the awaited elements of the local iterator into a new LamellarArray - // /// - // /// Calling this function invokes an implicit barrier across all PEs in the Array. - // /// - // /// Each element from the iterator must return a Future - // /// - // /// Each thread will only drive a single future at a time. - // /// - // /// The function returns the new LamellarArray upon completion. - // /// - // /// Creating the new array potentially results in data transfers depending on the distribution mode and the fact there is no gaurantee - // /// that each PE will contribute an equal number of elements to the new array, and currently LamellarArrays - // /// distribute data across the PEs as evenly as possible. 
- // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// // initialize a world and an atomic array - // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); - // /// - // /// // clone the array; this doesn't duplicate the underlying - // /// // data but it does create a second pointer that we can - // /// // discard when necessary - // /// let array_clone = array.clone(); - // /// - // /// // run collect - // /// let _new_array - // /// = array_clone.local_iter().map( - // /// move |elem| - // /// array_clone - // /// .fetch_add(elem.load(),1000)) - // /// .blocking_collect_async::,_>(Distribution::Cyclic); - // ///``` - // fn blocking_collect_async(&self, d: Distribution) -> A - // where - // // &'static Self: DistributedIterator + 'static, - // T: Dist + ArrayOps, - // Self::Item: Future + Send + 'static, - // A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - // { - // self.array().blocking_collect_async(self, d) - // } - /// Collects the awaited elements of the local iterator into a new LamellarArray, using the provided [Schedule][crate::array::iterator::Schedule] policy /// /// Calling this function invokes an implicit barrier across all PEs in the Array. @@ -922,50 +641,6 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { self.array().collect_async_with_schedule(sched, self, d) } - // /// Collects the awaited elements of the local iterator into a new LamellarArray,using the provided [Schedule][crate::array::iterator::Schedule] policy - // /// - // /// Calling this function invokes an implicit barrier across all PEs in the Array. - // /// - // /// Each element from the iterator must return a Future - // /// - // /// Each thread will only drive a single future at a time. - // /// - // /// The function returns the new LamellarArray upon completion. - // /// - // /// Creating the new array potentially results in data transfers depending on the distribution mode and the fact there is no gaurantee - // /// that each PE will contribute an equal number of elements to the new array, and currently LamellarArrays - // /// distribute data across the PEs as evenly as possible. 
- // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// // initialize a world and an atomic array - // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); - // /// - // /// // clone the array; this doesn't duplicate the underlying - // /// // data but it does create a second pointer that we can - // /// // discard when necessary - // /// let array_clone = array.clone(); - // /// - // /// // run collect - // /// let _new_array - // /// = array_clone.local_iter().map( - // /// move |elem| - // /// array_clone - // /// .fetch_add(elem.load(),1000)) - // /// .blocking_collect_async::,_>(Distribution::Cyclic); - // ///``` - // fn blocking_collect_async_with_schedule(&self, sched: Schedule, d: Distribution) -> A - // where - // // &'static Self: DistributedIterator + 'static, - // T: Dist + ArrayOps, - // Self::Item: Future + Send + 'static, - // A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, - // { - // self.array().blocking_collect_async_with_schedule(sched,self, d) - // } - /// Counts the number of the elements of the local iterator /// /// This function returns a future which needs to be driven to completion to retrieve the number of elements in the local iterator @@ -986,23 +661,6 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { self.array().count(self) } - // /// Counts the number of the elements of the local iterator - // /// - // /// This returns the number of elements in the local iterator - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// - // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - // /// - // /// let cnt = array.local_iter().blocking_count(); - // ///``` - // fn blocking_count(&self) -> usize { - // self.array().blocking_count(self) - // } - /// Counts the number of the elements of the local iterator using the provided [Schedule][crate::array::iterator::Schedule] policy /// /// This function returns a future which needs to be driven to completion to retrieve the number of elements in the local iterator @@ -1023,23 +681,6 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { self.array().count_with_schedule(sched, self) } - // /// Counts the number of the elements of the local iterator using the provided [Schedule][crate::array::iterator::Schedule] policy - // /// - // /// This returns the number of elements in the local iterator - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// - // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - // /// - // /// let cnt = array.local_iter().blocking_count_with_schedule(Schedule::Dynamic); - // ///``` - // fn blocking_count_with_schedule(&self, sched: Schedule) -> usize { - // self.array().blocking_count_with_schedule(sched, self) - // } - /// Sums the elements of the local iterator. /// /// Takes each element, adds them together, and returns the result. @@ -1067,30 +708,6 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { self.array().sum(self) } - // /// Sums the elements of the local iterator. - // /// - // /// Takes each element, adds them together, and returns the result. - // /// - // /// An empty iterator returns the zero value of the type. - // /// - // /// This function the sum upon completion. 
- // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// - // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - // /// - // /// let req = array.local_iter().blocking_sum(); - // ///``` - // fn blocking_sum(&self) -> Self::Item - // where - // Self::Item: SyncSend + std::iter::Sum + for<'a> std::iter::Sum<&'a Self::Item>, - // { - // self.array().blocking_sum(self) - // } - /// Sums the elements of the local iterator, using the specified [Schedule][crate::array::iterator::Schedule] policy /// /// Takes each element, adds them together, and returns the result. @@ -1117,30 +734,6 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { { self.array().sum_with_schedule(sched, self) } - - // /// Sums the elements of the local iterator, using the specified [Schedule][crate::array::iterator::Schedule] policy - // /// - // /// Takes each element, adds them together, and returns the result. - // /// - // /// An empty iterator returns the zero value of the type. - // /// - // /// This function returns the sum upon completion. - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// - // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); - // /// - // /// let sum = array.local_iter().blocking_sum_with_schedule(Schedule::Guided); - // ///``` - // fn blocking_sum_with_schedule(&self, sched: Schedule) -> Self::Item - // where - // Self::Item: SyncSend + std::iter::Sum + for<'a> std::iter::Sum<&'a Self::Item>, - // { - // self.array().blocking_sum_with_schedule(sched, self) - // } } /// An interface for dealing with local iterators which are indexable, meaning it returns an iterator of known length diff --git a/src/array/iterator/local_iterator/consumer/collect.rs b/src/array/iterator/local_iterator/consumer/collect.rs index 6ca03f95..3772b1c5 100644 --- a/src/array/iterator/local_iterator/consumer/collect.rs +++ b/src/array/iterator/local_iterator/consumer/collect.rs @@ -286,9 +286,16 @@ where } } + /// This method will block until the associated Collect operation completes and returns the result pub fn block(self) -> A { self.team.clone().block_on(self) } + + /// This method will spawn the associated Collect Operation on the work queue, + /// initiating the remote operation. + /// + /// This function returns a handle that can be used to wait for the operation to complete + #[must_use = "this function returns a future used to poll for completion and retrieve the result. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. 
Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] pub fn spawn(self) -> LamellarTask { self.team.clone().scheduler.spawn_task(self) } @@ -379,7 +386,7 @@ where I::Item: Dist + ArrayOps, A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static, { - async fn exec(&self) -> Vec { + async fn exec(&self) -> Vec<(usize, I::Item)> { let iter = self.schedule.init_iter(self.iter.iter_clone(Sealed)); iter.collect::>() } diff --git a/src/array/iterator/local_iterator/consumer/count.rs b/src/array/iterator/local_iterator/consumer/count.rs index 2fe94ca2..1b3c9092 100644 --- a/src/array/iterator/local_iterator/consumer/count.rs +++ b/src/array/iterator/local_iterator/consumer/count.rs @@ -139,9 +139,16 @@ impl LocalIterCountHandle { } } + /// This method will block until the associated Count operation completes and returns the result pub fn block(self) -> usize { self.team.clone().block_on(self) } + + /// This method will spawn the associated Count Operation on the work queue, + /// initiating the remote operation. + /// + /// This function returns a handle that can be used to wait for the operation to complete + #[must_use = "this function returns a future used to poll for completion and retrieve the result. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] pub fn spawn(self) -> LamellarTask { self.team.clone().scheduler.spawn_task(self) } diff --git a/src/array/iterator/local_iterator/consumer/for_each.rs b/src/array/iterator/local_iterator/consumer/for_each.rs index c99dc7a5..f18c2aa5 100644 --- a/src/array/iterator/local_iterator/consumer/for_each.rs +++ b/src/array/iterator/local_iterator/consumer/for_each.rs @@ -218,9 +218,16 @@ impl LocalIterForEachHandle { } } + /// This method will block until the associated For Each operation completes and returns the result pub fn block(self) { self.team.clone().block_on(self); } + + /// This method will spawn the associated For Each Operation on the work queue, + /// initiating the remote operation. + /// + /// This function returns a handle that can be used to wait for the operation to complete + #[must_use = "this function returns a future used to poll for completion and retrieve the result. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] pub fn spawn(self) -> LamellarTask<()> { self.team.clone().scheduler.spawn_task(self) } diff --git a/src/array/iterator/local_iterator/consumer/reduce.rs b/src/array/iterator/local_iterator/consumer/reduce.rs index dcc53bd2..ff635fa4 100644 --- a/src/array/iterator/local_iterator/consumer/reduce.rs +++ b/src/array/iterator/local_iterator/consumer/reduce.rs @@ -166,9 +166,17 @@ where } } + /// This method will block until the associated Reduce operation completes and returns the result pub fn block(self) -> Option { self.team.clone().block_on(self) } + + /// This method will spawn the associated Reduce Operation on the work queue, + /// initiating the remote operation. + /// + /// This function returns a handle that can be used to wait for the operation to complete + #[must_use = "this function returns a future used to poll for completion and retrieve the result. 
Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] + pub fn spawn(self) -> LamellarTask> { self.team.clone().scheduler.spawn_task(self) } diff --git a/src/array/iterator/local_iterator/consumer/sum.rs b/src/array/iterator/local_iterator/consumer/sum.rs index 2d7e0a76..e13747b1 100644 --- a/src/array/iterator/local_iterator/consumer/sum.rs +++ b/src/array/iterator/local_iterator/consumer/sum.rs @@ -155,9 +155,15 @@ where } } + /// This method will block until the associated Sum operation completes and returns the result pub fn block(self) -> T { self.team.clone().block_on(self) } + /// This method will spawn the associated Sum Operation on the work queue, + /// initiating the remote operation. + /// + /// This function returns a handle that can be used to wait for the operation to complete + #[must_use = "this function returns a future used to poll for completion and retrieve the result. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] pub fn spawn(self) -> LamellarTask { self.team.clone().scheduler.spawn_task(self) } @@ -245,7 +251,7 @@ where I: LocalIterator + 'static, I::Item: SyncSend + std::iter::Sum, { - async fn exec(&self) -> Option { + async fn exec(&self) -> I::Item { let iter = self.schedule.init_iter(self.iter.iter_clone(Sealed)); iter.sum::() } diff --git a/src/array/local_lock_atomic.rs b/src/array/local_lock_atomic.rs index 5e12f193..1c3389f2 100644 --- a/src/array/local_lock_atomic.rs +++ b/src/array/local_lock_atomic.rs @@ -1109,7 +1109,23 @@ impl ArrayPrint for LocalLockArray { #[pin_project] pub struct LocalLockArrayReduceHandle { req: AmHandle>, - lock_guard: Arc>, + lock_guard: LocalLockReadGuard, } + +impl LocalLockArrayReduceHandle { + /// This method will spawn the associated Array Reduce Operation on the work queue, + /// initiating the remote operation. + /// + /// This function returns a handle that can be used to wait for the operation to complete + #[must_use = "this function returns a future used to poll for completion and retrieve the result. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] + pub fn spawn(self) -> LamellarTask> { + self.lock_guard.array.clone().spawn(self) + } + + /// This method will block the caller until the associated Array Reduce Operation completes + pub fn block(self) -> Option { + self.lock_guard.array.clone().block_on(self) + } } impl LamellarRequest for LocalLockArrayReduceHandle { @@ -1152,7 +1168,8 @@ impl LocalLockReadGuard { /// Remote data can change before and after the overall operation has completed. /// /// Lamellar converting to a [ReadOnlyArray] or [GlobalLockArray] before the reduction is a straightforward workaround to enusre the data is not changing during the reduction.
- /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][LocalLockArrayReduceHandle::spawn] or [blocked on][LocalLockArrayReduceHandle::block] /// # Examples /// ``` /// use lamellar::array::prelude::*; @@ -1165,57 +1182,12 @@ impl LocalLockReadGuard { /// let read_guard = array.blocking_read_lock(); /// let prod = array.block_on(read_guard.reduce("prod")); ///``` + #[must_use = "this function is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] pub fn reduce(self, op: &str) -> LocalLockArrayReduceHandle { LocalLockArrayReduceHandle { req: self.array.array.reduce_data(op, self.array.clone().into()), - lock_guard: self.lock_guard.clone(), - } - } - - #[doc(alias("One-sided", "onesided"))] - /// Perform a reduction on the entire distributed array, returning the value to the calling PE. - /// - /// Please see the documentation for the [register_reduction] procedural macro for - /// more details and examples on how to create your own reductions. - /// - /// # One-sided Operation - /// The calling PE is responsible for launching `Reduce` active messages on the other PEs associated with the array. - /// the returned reduction result is only available on the calling PE - /// - /// # Safety - /// the local read lock ensures atomicity of only the local portion of the array, I.e. elements on a PE wont change while the operation is being executed on that PE - /// Atomicity of data on remote PEs is only guaranteed while the remote operation is executing on the remote PE (once it has captured that PEs local lock). - /// Remote data can change before and after the overall operation has completed. - /// - /// Lamellar converting to a [ReadOnlyArray] or [GlobalLockArray] before the reduction is a straightforward workaround to enusre the data is not changing during the reduction. - /// - /// # Examples - /// ``` - /// use lamellar::array::prelude::*; - /// use rand::Rng; - /// - /// let world = LamellarWorldBuilder::new().build(); - /// let num_pes = world.num_pes(); - /// let array = LocalLockArray::::new(&world,10,Distribution::Block); - /// array.block_on(array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2))); - /// let read_guard = array.blocking_read_lock(); - /// let prod = read_guard.blocking_reduce("prod"); - ///``` - pub fn blocking_reduce(self, op: &str) -> Option { - if std::thread::current().id() != *crate::MAIN_THREAD { - let msg = format!(" - [LAMELLAR WARNING] You are calling `LocalLockArray::blocking_reduce` from within an async context which may lead to deadlock, it is recommended that you use `reduce(...).await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() - ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), - } + lock_guard: self, } - self.array.block_on(LocalLockArrayReduceHandle { - req: self.array.array.reduce_data(op, self.array.clone().into()), - lock_guard: self.lock_guard.clone(), - }) } } impl LocalLockReadGuard { @@ -1232,7 +1204,8 @@ impl LocalLockReadGuard { /// the local read lock ensures atomicity of only the local portion of the array, I.e. 
elements on a PE wont change while the operation is being executed on that PE /// Atomicity of data on remote PEs is only guaranteed while the remote operation is executing on the remote PE (once it has captured that PEs local lock). /// Remote data can change before and after the overall operation has completed. - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][LocalLockArrayReduceHandle::spawn] or [blocked on][LocalLockArrayReduceHandle::block] /// # Examples /// ``` /// use lamellar::array::prelude::*; @@ -1244,37 +1217,11 @@ impl LocalLockReadGuard { /// let read_guard = array.blocking_read_lock(); /// let sum = array.block_on(read_guard.sum()); /// ``` + #[must_use = "this function is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] pub fn sum(self) -> LocalLockArrayReduceHandle { self.reduce("sum") } - #[doc(alias("One-sided", "onesided"))] - /// Perform a sum reduction on the entire distributed array, returning the value to the calling PE. - /// - /// This equivalent to `reduce("sum")`. - /// - /// # One-sided Operation - /// The calling PE is responsible for launching `Sum` active messages on the other PEs associated with the array. - /// the returned sum reduction result is only available on the calling PE - /// - /// # Safety - /// the local read lock ensures atomicity of only the local portion of the array, I.e. elements on a PE wont change while the operation is being executed on that PE - /// Atomicity of data on remote PEs is only guaranteed while the remote operation is executing on the remote PE (once it has captured that PEs local lock). - /// Remote data can change before and after the overall operation has completed. - /// - /// # Examples - /// ``` - /// use lamellar::array::prelude::*; - /// use rand::Rng; - /// let world = LamellarWorldBuilder::new().build(); - /// let num_pes = world.num_pes(); - /// let array = LocalLockArray::::new(&world,10,Distribution::Block); - /// array.block_on(array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2))); - /// let read_guard = array.blocking_read_lock(); - /// let sum = read_guard.blocking_sum(); - /// ``` - pub fn blocking_sum(self) -> Option { - self.blocking_reduce("sum") - } + #[doc(alias("One-sided", "onesided"))] /// Perform a production reduction on the entire distributed array, returning the value to the calling PE. /// @@ -1288,7 +1235,8 @@ impl LocalLockReadGuard { /// the local read lock ensures atomicity of only the local portion of the array, I.e. elements on a PE wont change while the operation is being executed on that PE /// Atomicity of data on remote PEs is only guaranteed while the remote operation is executing on the remote PE (once it has captured that PEs local lock). /// Remote data can change before and after the overall operation has completed. - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][LocalLockArrayReduceHandle::spawn] or [blocked on][LocalLockArrayReduceHandle::block] /// # Examples /// ``` /// use lamellar::array::prelude::*; @@ -1300,38 +1248,10 @@ impl LocalLockReadGuard { /// let prod = array.block_on(read_guard.prod()); /// assert_eq!((1..=array.len()).product::(),prod); ///``` + #[must_use = "this function is lazy and does nothing unless awaited. 
Either await the returned future, or call 'spawn()' or 'block()' on it "] pub fn prod(self) -> LocalLockArrayReduceHandle { self.reduce("prod") } - - #[doc(alias("One-sided", "onesided"))] - /// Perform a production reduction on the entire distributed array, returning the value to the calling PE. - /// - /// This equivalent to `reduce("prod")`. - /// - /// # One-sided Operation - /// The calling PE is responsible for launching `Prod` active messages on the other PEs associated with the array. - /// the returned prod reduction result is only available on the calling PE - /// - /// # Safety - /// the local read lock ensures atomicity of only the local portion of the array, I.e. elements on a PE wont change while the operation is being executed on that PE - /// Atomicity of data on remote PEs is only guaranteed while the remote operation is executing on the remote PE (once it has captured that PEs local lock). - /// Remote data can change before and after the overall operation has completed. - /// - /// # Examples - /// ``` - /// use lamellar::array::prelude::*; - /// let world = LamellarWorldBuilder::new().build(); - /// let num_pes = world.num_pes(); - /// let array = LocalLockArray::::new(&world,10,Distribution::Block); - /// array.block_on(array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2))); - /// let read_guard = array.blocking_read_lock(); - /// let prod = read_guard.blocking_prod(); - /// assert_eq!((1..=array.len()).product::(),prod); - ///``` - pub fn blocking_prod(self) -> Option { - self.blocking_reduce("prod") - } } impl LocalLockReadGuard { #[doc(alias("One-sided", "onesided"))] @@ -1347,7 +1267,8 @@ impl LocalLockReadGuard /// the local read lock ensures atomicity of only the local portion of the array, I.e. elements on a PE wont change while the operation is being executed on that PE /// Atomicity of data on remote PEs is only guaranteed while the remote operation is executing on the remote PE (once it has captured that PEs local lock). /// Remote data can change before and after the overall operation has completed. - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][LocalLockArrayReduceHandle::spawn] or [blocked on][LocalLockArrayReduceHandle::block] /// # Examples /// ``` /// use lamellar::array::prelude::*; @@ -1359,38 +1280,11 @@ impl LocalLockReadGuard /// let max = array.block_on(read_guard.max()); /// assert_eq!((array.len()-1)*2,max); ///``` + #[must_use = "this function is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] pub fn max(self) -> LocalLockArrayReduceHandle { self.reduce("max") } - #[doc(alias("One-sided", "onesided"))] - /// Find the max element in the entire destributed array, returning to the calling PE - /// - /// This equivalent to `reduce("max")`. - /// - /// # One-sided Operation - /// The calling PE is responsible for launching `Max` active messages on the other PEs associated with the array. - /// the returned max reduction result is only available on the calling PE - /// - /// # Safety - /// the local read lock ensures atomicity of only the local portion of the array, I.e. elements on a PE wont change while the operation is being executed on that PE - /// Atomicity of data on remote PEs is only guaranteed while the remote operation is executing on the remote PE (once it has captured that PEs local lock). - /// Remote data can change before and after the overall operation has completed. 
- /// - /// # Examples - /// ``` - /// use lamellar::array::prelude::*; - /// let world = LamellarWorldBuilder::new().build(); - /// let num_pes = world.num_pes(); - /// let array = LocalLockArray::::new(&world,10,Distribution::Block); - /// array.block_on(array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2))); - /// let read_guard = array.blocking_read_lock(); - /// let max = read_guard.blocking_max(); - /// assert_eq!((array.len()-1)*2,max); - ///``` - pub fn blocking_max(self) -> Option { - self.blocking_reduce("max") - } #[doc(alias("One-sided", "onesided"))] /// Find the min element in the entire destributed array, returning to the calling PE /// @@ -1404,7 +1298,8 @@ impl LocalLockReadGuard /// the local read lock ensures atomicity of only the local portion of the array, I.e. elements on a PE wont change while the operation is being executed on that PE /// Atomicity of data on remote PEs is only guaranteed while the remote operation is executing on the remote PE (once it has captured that PEs local lock). /// Remote data can change before and after the overall operation has completed. - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][LocalLockArrayReduceHandle::spawn] or [blocked on][LocalLockArrayReduceHandle::block] /// # Examples /// ``` /// use lamellar::array::prelude::*; @@ -1416,38 +1311,10 @@ impl LocalLockReadGuard /// let min = array.block_on(read_guard.min()); /// assert_eq!(0,min); ///``` + #[must_use = "this function is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] pub fn min(self) -> LocalLockArrayReduceHandle { self.reduce("min") } - - #[doc(alias("One-sided", "onesided"))] - /// Find the min element in the entire destributed array, returning to the calling PE - /// - /// This equivalent to `reduce("min")`. - /// - /// # One-sided Operation - /// The calling PE is responsible for launching `Min` active messages on the other PEs associated with the array. - /// the returned min reduction result is only available on the calling PE - /// - /// # Safety - /// the local read lock ensures atomicity of only the local portion of the array, I.e. elements on a PE wont change while the operation is being executed on that PE - /// Atomicity of data on remote PEs is only guaranteed while the remote operation is executing on the remote PE (once it has captured that PEs local lock). - /// Remote data can change before and after the overall operation has completed. 
- /// - /// # Examples - /// ``` - /// use lamellar::array::prelude::*; - /// let world = LamellarWorldBuilder::new().build(); - /// let num_pes = world.num_pes(); - /// let array = LocalLockArray::::new(&world,10,Distribution::Block); - /// array.block_on(array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2))); - /// let read_guard = array.blocking_read_lock(); - /// let min = read_guard.blocking_min(); - /// assert_eq!(0,min); - ///``` - pub fn blocking_min(self) -> Option { - self.blocking_reduce("min") - } } // impl LamellarArrayReduce diff --git a/src/array/local_lock_atomic/rdma.rs b/src/array/local_lock_atomic/rdma.rs index 71958071..c6b44edf 100644 --- a/src/array/local_lock_atomic/rdma.rs +++ b/src/array/local_lock_atomic/rdma.rs @@ -21,7 +21,7 @@ impl LamellarArrayInternalGet for LocalLockArray { buf: buf.into(), }); ArrayRdmaHandle { - _array: self.as_lamellar_byte_array(), + array: self.as_lamellar_byte_array(), reqs: VecDeque::from([req.into()]), } } @@ -33,7 +33,7 @@ impl LamellarArrayInternalGet for LocalLockArray { buf: buf.clone().into(), }); ArrayRdmaAtHandle { - _array: self.as_lamellar_byte_array(), + array: self.as_lamellar_byte_array(), req: Some(req), buf: buf, } @@ -49,7 +49,7 @@ impl LamellarArrayGet for LocalLockArray { match buf.team_try_into(&self.array.team_rt()) { Ok(buf) => self.internal_get(index, buf), Err(_) => ArrayRdmaHandle { - _array: self.as_lamellar_byte_array(), + array: self.as_lamellar_byte_array(), reqs: VecDeque::new(), }, } @@ -71,7 +71,7 @@ impl LamellarArrayInternalPut for LocalLockArray { buf: buf.into(), }); ArrayRdmaHandle { - _array: self.as_lamellar_byte_array(), + array: self.as_lamellar_byte_array(), reqs: VecDeque::from([req.into()]), } } @@ -86,7 +86,7 @@ impl LamellarArrayPut for LocalLockArray { match buf.team_try_into(&self.array.team_rt()) { Ok(buf) => self.internal_put(index, buf), Err(_) => ArrayRdmaHandle { - _array: self.as_lamellar_byte_array(), + array: self.as_lamellar_byte_array(), reqs: VecDeque::new(), }, } diff --git a/src/array/native_atomic/rdma.rs b/src/array/native_atomic/rdma.rs index 76ab43f5..499ae56f 100644 --- a/src/array/native_atomic/rdma.rs +++ b/src/array/native_atomic/rdma.rs @@ -18,7 +18,7 @@ impl LamellarArrayInternalGet for NativeAtomicArray { buf: buf.into(), }); ArrayRdmaHandle { - _array: self.as_lamellar_byte_array(), + array: self.as_lamellar_byte_array(), reqs: VecDeque::from([req.into()]), } } @@ -30,7 +30,7 @@ impl LamellarArrayInternalGet for NativeAtomicArray { buf: buf.clone().into(), }); ArrayRdmaAtHandle { - _array: self.as_lamellar_byte_array(), + array: self.as_lamellar_byte_array(), req: Some(req), buf: buf, } @@ -45,7 +45,7 @@ impl LamellarArrayGet for NativeAtomicArray { match buf.team_try_into(&self.array.team_rt()) { Ok(buf) => self.internal_get(index, buf), Err(_) => ArrayRdmaHandle { - _array: self.as_lamellar_byte_array(), + array: self.as_lamellar_byte_array(), reqs: VecDeque::new(), }, } @@ -67,7 +67,7 @@ impl LamellarArrayInternalPut for NativeAtomicArray { buf: buf.into(), }); ArrayRdmaHandle { - _array: self.as_lamellar_byte_array(), + array: self.as_lamellar_byte_array(), reqs: VecDeque::from([req.into()]), } } @@ -82,7 +82,7 @@ impl LamellarArrayPut for NativeAtomicArray { match buf.team_try_into(&self.array.team_rt()) { Ok(buf) => self.internal_put(index, buf), Err(_) => ArrayRdmaHandle { - _array: self.as_lamellar_byte_array(), + array: self.as_lamellar_byte_array(), reqs: VecDeque::new(), }, } diff --git a/src/array/operations/handle.rs 
b/src/array/operations/handle.rs index 27f01e3d..97926f8b 100644 --- a/src/array/operations/handle.rs +++ b/src/array/operations/handle.rs @@ -1,6 +1,7 @@ use crate::{ array::{AmDist, LamellarByteArray}, lamellar_request::LamellarRequest, + scheduler::LamellarTask, AmHandle, }; @@ -15,13 +16,27 @@ use pin_project::pin_project; /// a task handle for a batched array operation that doesnt return any values pub struct ArrayBatchOpHandle { - pub(crate) _array: LamellarByteArray, //prevents prematurely performing a local drop + pub(crate) array: LamellarByteArray, //prevents prematurely performing a local drop pub(crate) reqs: VecDeque<(AmHandle<()>, Vec)>, } - /// a task handle for a single array operation that doesnt return any values pub type ArrayOpHandle = ArrayBatchOpHandle; +impl ArrayBatchOpHandle { + /// This method will spawn the associated Array Operation on the work queue, + /// initiating the remote operation. + /// + /// This function returns a handle that can be used to wait for the operation to complete + #[must_use = "this function returns a future used to poll for completion. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] + pub fn spawn(self) -> LamellarTask<()> { + self.array.team().spawn(self) + } + /// This method will block the calling thread until the associated Array Operation completes + pub fn block(self) -> () { + self.array.team().block_on(self) + } +} + impl LamellarRequest for ArrayBatchOpHandle { fn blocking_wait(mut self) -> Self::Output { for req in self.reqs.drain(0..) { @@ -57,10 +72,26 @@ impl Future for ArrayBatchOpHandle { /// a task handle for a single array operation that returns a value pub struct ArrayFetchOpHandle { - pub(crate) _array: LamellarByteArray, //prevents prematurely performing a local drop + pub(crate) array: LamellarByteArray, //prevents prematurely performing a local drop pub(crate) req: AmHandle>, } +impl ArrayFetchOpHandle { + /// This method will spawn the associated Array Operation on the work queue, + /// initiating the remote operation. + /// + /// This function returns a handle that can be used to wait for the operation to complete + #[must_use = "this function returns a future used to poll for completion. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. 
Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] + pub fn spawn(self) -> LamellarTask { + self.array.team().spawn(self) + } + + /// This method will block the calling thread until the associated Array Operation completes + pub fn block(self) -> R { + self.array.team().block_on(self) + } +} + impl LamellarRequest for ArrayFetchOpHandle { fn blocking_wait(self) -> Self::Output { self.req @@ -89,15 +120,31 @@ impl Future for ArrayFetchOpHandle { /// a task handle for a batched array operation that return values #[pin_project] pub struct ArrayFetchBatchOpHandle { - pub(crate) _array: LamellarByteArray, //prevents prematurely performing a local drop + pub(crate) array: LamellarByteArray, //prevents prematurely performing a local drop pub(crate) reqs: VecDeque<(AmHandle>, Vec)>, results: Vec, } +impl ArrayFetchBatchOpHandle { + /// This method will spawn the associated Array Operation on the work queue, + /// initiating the remote operation. + /// + /// This function returns a handle that can be used to wait for the operation to complete + #[must_use = "this function returns a future used to poll for completion. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] + pub fn spawn(self) -> LamellarTask> { + self.array.team().spawn(self) + } + + /// This method will block the calling thread until the associated Array Operation completes + pub fn block(self) -> Vec { + self.array.team().block_on(self) + } +} + impl From> for ArrayFetchOpHandle { fn from(mut req: ArrayFetchBatchOpHandle) -> Self { Self { - _array: req._array, + array: req.array, req: req.reqs.pop_front().unwrap().0, } } @@ -114,7 +161,7 @@ impl ArrayFetchBatchOpHandle { results.set_len(max_index); } Self { - _array: array, + array: array, reqs, results, } @@ -174,10 +221,26 @@ impl Future for ArrayFetchBatchOpHandle { /// a task handle for a single array operation that returns a result pub struct ArrayResultOpHandle { - pub(crate) _array: LamellarByteArray, //prevents prematurely performing a local drop + pub(crate) array: LamellarByteArray, //prevents prematurely performing a local drop pub(crate) req: AmHandle>>, } +impl ArrayResultOpHandle { + /// This method will spawn the associated Array Operation on the work queue, + /// initiating the remote operation. + /// + /// This function returns a handle that can be used to wait for the operation to complete + #[must_use = "this function returns a future used to poll for completion. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. 
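The `spawn`/`block` pair added to these handles follows one pattern used throughout this patch: an operation handle is inert until it is awaited, spawned onto the work queue, or blocked on. A minimal sketch of the three ways to drive an array operation handle, assuming an `AtomicArray<usize>` set up as in the doc examples elsewhere in this patch:
```
use lamellar::array::prelude::*;

let world = LamellarWorldBuilder::new().build();
let array = AtomicArray::<usize>::new(&world, 100, Distribution::Block);

// 1) spawn: enqueue the operation now and keep a LamellarTask handle
let _task = array.add(0, 1).spawn();

// 2) block: enqueue and wait on the calling thread (avoid inside async code)
array.add(1, 1).block();

// 3) await/block_on: the handle itself is a Future
world.block_on(array.add(2, 1));

// any spawned-but-unawaited operations are completed by wait_all()
array.wait_all();
array.barrier();
```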
Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] + pub fn spawn(self) -> LamellarTask> { + self.array.team().spawn(self) + } + + /// This method will block the calling thread until the associated Array Operation completes + pub fn block(self) -> Result { + self.array.team().block_on(self) + } +} + impl LamellarRequest for ArrayResultOpHandle { fn blocking_wait(self) -> Self::Output { self.req @@ -206,15 +269,31 @@ impl Future for ArrayResultOpHandle { /// a task handle for a batched array operation that returns results #[pin_project] pub struct ArrayResultBatchOpHandle { - pub(crate) _array: LamellarByteArray, //prevents prematurely performing a local drop + pub(crate) array: LamellarByteArray, //prevents prematurely performing a local drop pub(crate) reqs: VecDeque<(AmHandle>>, Vec)>, results: Vec>, } +impl ArrayResultBatchOpHandle { + /// This method will spawn the associated Array Operation on the work queue, + /// initiating the remote operation. + /// + /// This function returns a handle that can be used to wait for the operation to complete + #[must_use = "this function returns a future used to poll for completion. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] + pub fn spawn(self) -> LamellarTask>> { + self.array.team().spawn(self) + } + + /// This method will block the calling thread until the associated Array Operation completes + pub fn block(self) -> Vec> { + self.array.team().block_on(self) + } +} + impl From> for ArrayResultOpHandle { fn from(mut req: ArrayResultBatchOpHandle) -> Self { Self { - _array: req._array, + array: req.array, req: req.reqs.pop_front().unwrap().0, } } @@ -231,7 +310,7 @@ impl ArrayResultBatchOpHandle { results.set_len(max_index); } Self { - _array: array, + array: array, reqs, results, } diff --git a/src/array/read_only.rs b/src/array/read_only.rs index f79d22ba..be11bd94 100644 --- a/src/array/read_only.rs +++ b/src/array/read_only.rs @@ -6,7 +6,6 @@ use crate::array::private::ArrayExecAm; use crate::array::private::LamellarArrayPrivate; use crate::array::*; use crate::barrier::BarrierHandle; -use crate::config; use crate::darc::DarcMode; use crate::lamellar_team::{IntoLamellarTeam, LamellarTeamRT}; use crate::memregion::Dist; @@ -446,7 +445,8 @@ impl ReadOnlyArray { /// # One-sided Operation /// The calling PE is responsible for launching `Reduce` active messages on the other PEs associated with the array. /// the returned reduction result is only available on the calling PE - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][AmHandle::spawn] or [blocked on][AmHandle::block] /// # Examples /// ``` /// use lamellar::array::prelude::*; @@ -464,50 +464,10 @@ impl ReadOnlyArray { /// let sum = array.block_on(array.reduce("sum")); // equivalent to calling array.sum() /// assert_eq!(array.len()*num_pes,sum); ///``` + #[must_use = "this function is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] pub fn reduce(&self, op: &str) -> AmHandle> { self.array.reduce_data(op, self.clone().into()) } - - #[doc(alias("One-sided", "onesided"))] - /// Perform a reduction on the entire distributed array, returning the value to the calling PE. 
- /// - /// Please see the documentation for the [register_reduction] procedural macro for - /// more details and examples on how to create your own reductions. - /// - /// # One-sided Operation - /// The calling PE is responsible for launching `Reduce` active messages on the other PEs associated with the array. - /// the returned reduction result is only available on the calling PE - /// - /// # Examples - /// ``` - /// use lamellar::array::prelude::*; - /// use rand::Rng; - /// - /// let world = LamellarWorldBuilder::new().build(); - /// let num_pes = world.num_pes(); - /// let array = AtomicArray::::new(&world,1000000,Distribution::Block); - /// let array_clone = array.clone(); - /// let req = array.local_iter().for_each(move |_| { - /// let index = rand::thread_rng().gen_range(0..array_clone.len()); - /// array_clone.add(index,1); //randomly at one to an element in the array. - /// }); - /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE - /// let sum = array.blocking_reduce("sum"); // equivalent to calling array.sum() - /// assert_eq!(array.len()*num_pes,sum); - ///``` - pub fn blocking_reduce(&self, op: &str) -> Option { - if std::thread::current().id() != *crate::MAIN_THREAD { - let msg = format!(" - [LAMELLAR WARNING] You are calling `ReadOnlyArray::blocking_reduce` from within an async context which may lead to deadlock, it is recommended that you use `reduce(...).await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() - ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), - } - } - self.block_on(self.array.reduce_data(op, self.clone().into())) - } } impl ReadOnlyArray { #[doc(alias("One-sided", "onesided"))] @@ -518,7 +478,8 @@ impl ReadOnlyArray { /// # One-sided Operation /// The calling PE is responsible for launching `Sum` active messages on the other PEs associated with the array. /// the returned sum reduction result is only available on the calling PE - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][AmHandle::spawn] or [blocked on][AmHandle::block] /// # Examples /// ``` /// use lamellar::array::prelude::*; @@ -535,39 +496,11 @@ impl ReadOnlyArray { /// let sum = array.block_on(array.sum()); /// assert_eq!(array.len()*num_pes,sum); /// ``` + #[must_use = "this function is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] pub fn sum(&self) -> AmHandle> { self.reduce("sum") } - #[doc(alias("One-sided", "onesided"))] - /// Perform a sum reduction on the entire distributed array, returning the value to the calling PE. - /// - /// This equivalent to `reduce("sum")`. - /// - /// # One-sided Operation - /// The calling PE is responsible for launching `Sum` active messages on the other PEs associated with the array. 
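Because `reduce` and its convenience wrappers are now lazy, the handle must be driven explicitly. A minimal sketch for a `ReadOnlyArray`, assuming the reduction resolves to an `Option` as the return type of the removed `blocking_*` helpers suggests:
```
use lamellar::array::prelude::*;

let world = LamellarWorldBuilder::new().build();
let array = AtomicArray::<usize>::new(&world, 10, Distribution::Block);
array.block_on(array.dist_iter().for_each(|elem| elem.store(1)));
array.barrier();
let array = array.into_read_only(); // returns once a single reference remains on each PE

// nothing is launched until the handle is awaited, spawned, or blocked on
let sum_handle = array.sum();
let sum = array.block_on(sum_handle); // or `sum_handle.await` in async code
println!("sum: {:?}", sum);           // expected: Some(array.len())
```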
- /// the returned sum reduction result is only available on the calling PE - /// - /// # Examples - /// ``` - /// use lamellar::array::prelude::*; - /// use rand::Rng; - /// let world = LamellarWorldBuilder::new().build(); - /// let num_pes = world.num_pes(); - /// let array = AtomicArray::::new(&world,1000000,Distribution::Block); - /// let array_clone = array.clone(); - /// let req = array.local_iter().for_each(move |_| { - /// let index = rand::thread_rng().gen_range(0..array_clone.len()); - /// array_clone.add(index,1); //randomly at one to an element in the array. - /// }); - /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE - /// let sum = array.blocking_sum(); - /// assert_eq!(array.len()*num_pes,sum); - /// ``` - pub fn blocking_sum(&self) -> Option { - self.blocking_reduce("sum") - } - #[doc(alias("One-sided", "onesided"))] /// Perform a production reduction on the entire distributed array, returning the value to the calling PE. /// @@ -576,7 +509,8 @@ impl ReadOnlyArray { /// # One-sided Operation /// The calling PE is responsible for launching `Prod` active messages on the other PEs associated with the array. /// the returned prod reduction result is only available on the calling PE - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][AmHandle::spawn] or [blocked on][AmHandle::block] /// # Examples /// ``` /// use lamellar::array::prelude::*; @@ -591,36 +525,10 @@ impl ReadOnlyArray { /// let prod = array.block_on(array.prod()); /// assert_eq!((1..=array.len()).product::(),prod); ///``` + #[must_use = "this function is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] pub fn prod(&self) -> AmHandle> { self.reduce("prod") } - - #[doc(alias("One-sided", "onesided"))] - /// Perform a production reduction on the entire distributed array, returning the value to the calling PE. - /// - /// This equivalent to `reduce("prod")`. - /// - /// # One-sided Operation - /// The calling PE is responsible for launching `Prod` active messages on the other PEs associated with the array. - /// the returned prod reduction result is only available on the calling PE - /// - /// # Examples - /// ``` - /// use lamellar::array::prelude::*; - /// let world = LamellarWorldBuilder::new().build(); - /// let num_pes = world.num_pes(); - /// let array = AtomicArray::::new(&world,10,Distribution::Block); - /// let req = array.dist_iter().enumerate().for_each(move |(i,elem)| { - /// elem.store(i+1); - /// }); - /// array.wait_all(); - /// array.barrier(); - /// let prod = array.blocking_prod(); - /// assert_eq!((1..=array.len()).product::(),prod); - ///``` - pub fn blocking_prod(&self) -> Option { - self.blocking_reduce("prod") - } } impl ReadOnlyArray { #[doc(alias("One-sided", "onesided"))] @@ -631,7 +539,8 @@ impl ReadOnlyArray { /// # One-sided Operation /// The calling PE is responsible for launching `Max` active messages on the other PEs associated with the array. 
/// the returned max reduction result is only available on the calling PE - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][AmHandle::spawn] or [blocked on][AmHandle::block] /// # Examples /// ``` /// use lamellar::array::prelude::*; @@ -643,34 +552,11 @@ impl ReadOnlyArray { /// let max = array.block_on(array.max()); /// assert_eq!((array.len()-1)*2,max); ///``` + #[must_use = "this function is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] pub fn max(&self) -> AmHandle> { self.reduce("max") } - #[doc(alias("One-sided", "onesided"))] - /// Find the max element in the entire destributed array, returning to the calling PE - /// - /// This equivalent to `reduce("max")`. - /// - /// # One-sided Operation - /// The calling PE is responsible for launching `Max` active messages on the other PEs associated with the array. - /// the returned max reduction result is only available on the calling PE - /// - /// # Examples - /// ``` - /// use lamellar::array::prelude::*; - /// let world = LamellarWorldBuilder::new().build(); - /// let num_pes = world.num_pes(); - /// let array = AtomicArray::::new(&world,10,Distribution::Block); - /// let req = array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2)); - /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE - /// let max = array.blocking_max(); - /// assert_eq!((array.len()-1)*2,max); - ///``` - pub fn blocking_max(&self) -> Option { - self.blocking_reduce("max") - } - #[doc(alias("One-sided", "onesided"))] /// Find the min element in the entire destributed array, returning to the calling PE /// @@ -679,7 +565,8 @@ impl ReadOnlyArray { /// # One-sided Operation /// The calling PE is responsible for launching `Min` active messages on the other PEs associated with the array. /// the returned min reduction result is only available on the calling PE - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][AmHandle::spawn] or [blocked on][AmHandle::block] /// # Examples /// ``` /// use lamellar::array::prelude::*; @@ -691,33 +578,10 @@ impl ReadOnlyArray { /// let min = array.block_on(array.min()); /// assert_eq!(0,min); ///``` + #[must_use = "this function is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] pub fn min(&self) -> AmHandle> { self.reduce("min") } - - #[doc(alias("One-sided", "onesided"))] - /// Find the min element in the entire destributed array, returning to the calling PE - /// - /// This equivalent to `reduce("min")`. - /// - /// # One-sided Operation - /// The calling PE is responsible for launching `Min` active messages on the other PEs associated with the array. 
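With the `blocking_*` reduction helpers gone, call sites that used to block implicitly now drive the lazy handle themselves, via `.block()` on the handle (per the `AmHandle::block` link added in the doc notes), `block_on`, or `.await`. A rough migration sketch, treating the result as an `Option` to match the removed helpers' signatures:
```
use lamellar::array::prelude::*;

let world = LamellarWorldBuilder::new().build();
let array = AtomicArray::<usize>::new(&world, 10, Distribution::Block);
array.block_on(array.dist_iter().enumerate().for_each(|(i, elem)| elem.store(i * 2)));
array.barrier();
let array = array.into_read_only();

// previously: let max = array.blocking_max();
// now the handle is driven explicitly:
let max = array.max().block();
// equivalently: let max = array.block_on(array.max());
// or, in async code: let max = array.max().await;
println!("max: {:?}", max); // expected: Some((array.len() - 1) * 2)
```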
- /// the returned min reduction result is only available on the calling PE - /// - /// # Examples - /// ``` - /// use lamellar::array::prelude::*; - /// let world = LamellarWorldBuilder::new().build(); - /// let num_pes = world.num_pes(); - /// let array = AtomicArray::::new(&world,10,Distribution::Block); - /// let req = array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2)); - /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE - /// let min = array.blocking_min(); - /// assert_eq!(0,min); - ///``` - pub fn blocking_min(&self) -> Option { - self.blocking_reduce("min") - } } impl private::ArrayExecAm for ReadOnlyArray { diff --git a/src/array/unsafe.rs b/src/array/unsafe.rs index 663f8168..a87a407f 100644 --- a/src/array/unsafe.rs +++ b/src/array/unsafe.rs @@ -1324,7 +1324,8 @@ impl UnsafeArray { /// # One-sided Operation /// The calling PE is responsible for launching `Reduce` active messages on the other PEs associated with the array. /// the returned reduction result is only available on the calling PE - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][AmHandle::spawn] or [blocked on][AmHandle::block] /// # Examples /// ``` /// use lamellar::array::prelude::*; @@ -1344,57 +1345,11 @@ impl UnsafeArray { /// let sum = array.block_on(array.reduce("sum")); // equivalent to calling array.sum() /// //assert_eq!(array.len()*num_pes,sum); // may or may not fail ///``` + #[must_use = "this function is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] pub unsafe fn reduce(&self, op: &str) -> AmHandle> { self.reduce_data(op, self.clone().into()) } - #[doc(alias("One-sided", "onesided"))] - /// Perform a reduction on the entire distributed array, returning the value to the calling PE. - /// - /// Please see the documentation for the [register_reduction][lamellar_impl::register_reduction] procedural macro for - /// more details and examples on how to create your own reductions. - /// - /// # Safety - /// Data in UnsafeArrays are always unsafe as there are no protections on how remote PE's or local threads may access this PE's local data. - /// Any updates to local data are not guaranteed to be Atomic. - /// - /// # One-sided Operation - /// The calling PE is responsible for launching `Reduce` active messages on the other PEs associated with the array. - /// the returned reduction result is only available on the calling PE - /// - /// # Examples - /// ``` - /// use lamellar::array::prelude::*; - /// use rand::Rng; - /// let world = LamellarWorldBuilder::new().build(); - /// let num_pes = world.num_pes(); - /// let array = AtomicArray::::new(&world,1000000,Distribution::Block); - /// let array_clone = array.clone(); - /// unsafe { // THIS IS NOT SAFE -- we are randomly updating elements, no protections, updates may be lost... DONT DO THIS - /// let req = array.local_iter().for_each(move |_| { - /// let index = rand::thread_rng().gen_range(0..array_clone.len()); - /// array_clone.add(index,1); //randomly at one to an element in the array. 
- /// }); - /// } - /// array.wait_all(); - /// array.barrier(); - /// let sum = array.blocking_reduce("sum"); // equivalent to calling array.sum() - /// //assert_eq!(array.len()*num_pes,sum); // may or may not fail - ///``` - pub unsafe fn blocking_reduce(&self, op: &str) -> Option { - if std::thread::current().id() != *crate::MAIN_THREAD { - let msg = format!(" - [LAMELLAR WARNING] You are calling `UnsafeArray::blocking_reduce` from within an async context which may lead to deadlock, it is recommended that you use `reduce(...).await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() - ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), - } - } - self.block_on(self.reduce_data(op, self.clone().into())) - } - #[doc(alias("One-sided", "onesided"))] /// Perform a sum reduction on the entire distributed array, returning the value to the calling PE. /// @@ -1407,7 +1362,8 @@ impl UnsafeArray { /// # One-sided Operation /// The calling PE is responsible for launching `Sum` active messages on the other PEs associated with the array. /// the returned sum reduction result is only available on the calling PE - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][AmHandle::spawn] or [blocked on][AmHandle::block] /// # Examples /// ``` /// use lamellar::array::prelude::*; @@ -1427,45 +1383,11 @@ impl UnsafeArray { /// let sum = array.block_on(unsafe{array.sum()}); //Safe in this instance as we have ensured no updates are currently happening /// // assert_eq!(array.len()*num_pes,sum);//this may or may not fail ///``` + #[must_use = "this function is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] pub unsafe fn sum(&self) -> AmHandle> { self.reduce("sum") } - #[doc(alias("One-sided", "onesided"))] - /// Perform a sum reduction on the entire distributed array, returning the value to the calling PE. - /// - /// This equivalent to `reduce("sum")`. - /// - /// # Safety - /// Data in UnsafeArrays are always unsafe as there are no protections on how remote PE's or local threads may access this PE's local data. - /// Any updates to local data are not guaranteed to be Atomic. - /// - /// # One-sided Operation - /// The calling PE is responsible for launching `Sum` active messages on the other PEs associated with the array. - /// the returned sum reduction result is only available on the calling PE - /// - /// # Examples - /// ``` - /// use lamellar::array::prelude::*; - /// use rand::Rng; - /// let world = LamellarWorldBuilder::new().build(); - /// let num_pes = world.num_pes(); - /// let array = UnsafeArray::::new(&world,1000000,Distribution::Block); - /// let array_clone = array.clone(); - /// unsafe { // THIS IS NOT SAFE -- we are randomly updating elements, no protections, updates may be lost... DONT DO THIS - /// let req = array.local_iter().for_each(move |_| { - /// let index = rand::thread_rng().gen_range(0..array_clone.len()); - /// array_clone.add(index,1); //randomly at one to an element in the array. 
- /// }); - /// } - /// array.wait_all(); - /// array.barrier(); - /// let sum = unsafe{array.blocking_sum()}; - ///``` - pub unsafe fn blocking_sum(&self) -> Option { - self.blocking_reduce("sum") - } - #[doc(alias("One-sided", "onesided"))] /// Perform a production reduction on the entire distributed array, returning the value to the calling PE. /// @@ -1478,7 +1400,8 @@ impl UnsafeArray { /// # One-sided Operation /// The calling PE is responsible for launching `Prod` active messages on the other PEs associated with the array. /// the returned prod reduction result is only available on the calling PE - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][AmHandle::spawn] or [blocked on][AmHandle::block] /// # Examples /// ``` /// use lamellar::array::prelude::*; @@ -1497,47 +1420,11 @@ impl UnsafeArray { /// let prod = unsafe{ array.block_on(array.prod())}; /// assert_eq!((1..=array.len()).product::(),prod); ///``` + #[must_use = "this function is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] pub unsafe fn prod(&self) -> AmHandle> { self.reduce("prod") } - #[doc(alias("One-sided", "onesided"))] - /// Perform a production reduction on the entire distributed array, returning the value to the calling PE. - /// - /// This equivalent to `reduce("prod")`. - /// - /// # Safety - /// Data in UnsafeArrays are always unsafe as there are no protections on how remote PE's or local threads may access this PE's local data. - /// Any updates to local data are not guaranteed to be Atomic. - /// - /// # One-sided Operation - /// The calling PE is responsible for launching `Prod` active messages on the other PEs associated with the array. - /// the returned prod reduction result is only available on the calling PE - /// - /// # Examples - /// ``` - /// use lamellar::array::prelude::*; - /// use rand::Rng; - /// let world = LamellarWorldBuilder::new().build(); - /// let num_pes = world.num_pes(); - /// let array = UnsafeArray::::new(&world,10,Distribution::Block); - /// unsafe { - /// let req = array.dist_iter_mut().enumerate().for_each(move |(i,elem)| { - /// *elem = i+1; - /// }); - /// } - /// array.print(); - /// array.wait_all(); - /// array.print(); - /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE - /// array.print(); - /// let prod = unsafe{array.blocking_prod()}; - /// assert_eq!((1..=array.len()).product::(),prod); - ///``` - pub unsafe fn blocking_prod(&self) -> Option { - self.blocking_reduce("prod") - } - #[doc(alias("One-sided", "onesided"))] /// Find the max element in the entire destributed array, returning to the calling PE /// @@ -1550,7 +1437,8 @@ impl UnsafeArray { /// # One-sided Operation /// The calling PE is responsible for launching `Max` active messages on the other PEs associated with the array. /// the returned max reduction result is only available on the calling PE - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][AmHandle::spawn] or [blocked on][AmHandle::block] /// # Examples /// ``` /// use lamellar::array::prelude::*; @@ -1565,40 +1453,11 @@ impl UnsafeArray { /// let max = array.block_on(max_req); /// assert_eq!((array.len()-1)*2,max); ///``` + #[must_use = "this function is lazy and does nothing unless awaited. 
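For `UnsafeArray` the new lazy handles keep the original caveat: the reduction is `unsafe` because nothing prevents concurrent updates to the data being reduced. A minimal sketch, assuming all updates are quiesced before the handle is driven:
```
use lamellar::array::prelude::*;

let world = LamellarWorldBuilder::new().build();
let array = UnsafeArray::<usize>::new(&world, 10, Distribution::Block);
unsafe {
    // safe here: each PE only writes its own local elements, in a data-parallel fashion
    array.block_on(array.dist_iter_mut().enumerate().for_each(|(i, elem)| *elem = i + 1));
}
array.barrier();

// lazy handle; only meaningful once no further updates are in flight
let prod = unsafe { array.prod() };
let prod = array.block_on(prod); // or `.block()` / `.await`
println!("prod: {:?}", prod);    // expected: Some((1..=array.len()).product())
```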
Either await the returned future, or call 'spawn()' or 'block()' on it "] pub unsafe fn max(&self) -> AmHandle> { self.reduce("max") } - #[doc(alias("One-sided", "onesided"))] - /// Find the max element in the entire destributed array, returning to the calling PE - /// - /// This equivalent to `reduce("max")`. - /// - /// # Safety - /// Data in UnsafeArrays are always unsafe as there are no protections on how remote PE's or local threads may access this PE's local data. - /// Any updates to local data are not guaranteed to be Atomic. - /// - /// # One-sided Operation - /// The calling PE is responsible for launching `Max` active messages on the other PEs associated with the array. - /// the returned max reduction result is only available on the calling PE - /// - /// # Examples - /// ``` - /// use lamellar::array::prelude::*; - /// let world = LamellarWorldBuilder::new().build(); - /// let num_pes = world.num_pes(); - /// let array = UnsafeArray::::new(&world,10,Distribution::Block); - /// let array_clone = array.clone(); - /// unsafe{array.dist_iter_mut().enumerate().for_each(|(i,elem)| *elem = i*2)}; //safe as we are accessing in a data parallel fashion - /// array.wait_all(); - /// array.barrier(); - /// let max = unsafe{array.blocking_max()}; - /// assert_eq!((array.len()-1)*2,max); - ///``` - pub unsafe fn blocking_max(&self) -> Option { - self.blocking_reduce("max") - } - #[doc(alias("One-sided", "onesided"))] /// Find the min element in the entire destributed array, returning to the calling PE /// @@ -1611,7 +1470,8 @@ impl UnsafeArray { /// # One-sided Operation /// The calling PE is responsible for launching `Min` active messages on the other PEs associated with the array. /// the returned min reduction result is only available on the calling PE - /// + /// # Note + /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][AmHandle::spawn] or [blocked on][AmHandle::block] /// # Examples /// ``` /// use lamellar::array::prelude::*; @@ -1626,39 +1486,10 @@ impl UnsafeArray { /// let min = array.block_on(min_req); /// assert_eq!(0,min); ///``` + #[must_use = "this function is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] pub unsafe fn min(&self) -> AmHandle> { self.reduce("min") } - - #[doc(alias("One-sided", "onesided"))] - /// Find the min element in the entire destributed array, returning to the calling PE - /// - /// This equivalent to `reduce("min")`. - /// - /// # Safety - /// Data in UnsafeArrays are always unsafe as there are no protections on how remote PE's or local threads may access this PE's local data. - /// Any updates to local data are not guaranteed to be Atomic. - /// - /// # One-sided Operation - /// The calling PE is responsible for launching `Min` active messages on the other PEs associated with the array. 
- /// the returned min reduction result is only available on the calling PE - /// - /// # Examples - /// ``` - /// use lamellar::array::prelude::*; - /// let world = LamellarWorldBuilder::new().build(); - /// let num_pes = world.num_pes(); - /// let array = UnsafeArray::::new(&world,10,Distribution::Block); - /// let array_clone = array.clone(); - /// unsafe{array.dist_iter_mut().enumerate().for_each(|(i,elem)| *elem = i*2)}; //safe as we are accessing in a data parallel fashion - /// array.wait_all(); - /// array.barrier(); - /// let min = unsafe{array.blocking_min()}; - /// assert_eq!(0,min); - ///``` - pub unsafe fn blocking_min(&self) -> Option { - self.blocking_reduce("min") - } } impl UnsafeArrayInnerWeak { diff --git a/src/array/unsafe/iteration/distributed.rs b/src/array/unsafe/iteration/distributed.rs index ce281b56..6f527564 100644 --- a/src/array/unsafe/iteration/distributed.rs +++ b/src/array/unsafe/iteration/distributed.rs @@ -28,7 +28,7 @@ impl InnerArray for UnsafeArrayInner { impl DistIteratorLauncher for UnsafeArray {} macro_rules! consumer_impl { - ($name:ident<$($generics:ident),*>($($arg:ident : $arg_ty:ty),*); [$return_type:ident$(<$($ret_gen:ty),*>)?]; [$($bounds:tt)+]; [$($am:tt)*]; [ $($blocking_ret:tt)*] ) => { + ($name:ident<$($generics:ident),*>($($arg:ident : $arg_ty:ty),*); [$return_type:ident$(<$($ret_gen:ty),*>)?]; [$($bounds:tt)+]; [$($am:tt)*] ) => { paste! { fn $name<$($generics),*>(&self, $($arg : $arg_ty),*) -> $return_type$(<$($ret_gen),*>)? where @@ -60,69 +60,6 @@ macro_rules! consumer_impl { $return_type::new(barrier,reqs_future,self) } - // fn []<$($generics),*>(&self, $($arg : $arg_ty),*) -> LamellarTask<$($blocking_ret)*> - // where - // $($bounds)+ - // { - - // self.[](Schedule::Static, $($arg),*) - // } - - - // fn []<$($generics),*>( - // &self, - // sched: Schedule, - // $($arg : $arg_ty),* - // ) -> LamellarTask<$($blocking_ret)*> - // where - // $($bounds)+ - // { - // self.data.team.scheduler.spawn_task(self.[<$name _with_schedule>](sched, $($arg),*)) - // } - - // fn []<$($generics),*>(&self, $($arg : $arg_ty),*) -> $($blocking_ret)* - // where - // $($bounds)+ - // { - - // self.[](Schedule::Static, $($arg),*) - // } - - - // fn []<$($generics),*>( - // &self, - // sched: Schedule, - // $($arg : $arg_ty),* - // ) -> $($blocking_ret)* - // where - // $($bounds)+ - // { - // if std::thread::current().id() != *crate::MAIN_THREAD { - // let name = stringify!{$name}; - // let msg = format!(" - // [LAMELLAR WARNING] You are calling `blocking_{name}[_with_schedule]` from within an async context which may lead to deadlock, it is recommended that you use `{name}[_with_schedule]().await;` instead! 
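With the `blocking_*` arms stripped from `consumer_impl!`, every iterator consumer (`for_each`, `for_each_async`, `reduce`, `collect`, `collect_async`, `count`, `sum`) hands back one lazy handle, and the caller chooses how to drive it. A small sketch of that pattern; the `.count()` call mirrors the consumers listed in the macro and is meant as an illustration rather than an exact API reference:
```
use lamellar::array::prelude::*;

let world = LamellarWorldBuilder::new().build();
let array = AtomicArray::<usize>::new(&world, 10, Distribution::Block);

// each consumer returns a lazy handle; block_on (or `.await` / `.spawn()`) drives it
array.block_on(array.dist_iter().enumerate().for_each(|(i, elem)| elem.store(i)));
array.barrier();

let cnt = array.block_on(array.dist_iter().count());
println!("counted {cnt} elements");
```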
- // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() - // ); - // if let Some(val) = config().blocking_call_warning { - // if val { - // println!("{msg}"); - // } - // } else { - // println!("{msg}"); - // } - // } - // let am = $($am)*; - // self.data.team.barrier.tasking_barrier(); - // let inner = self.clone(); - // let reqs = match sched { - // Schedule::Static => inner.sched_static(am), - // Schedule::Dynamic => inner.sched_dynamic(am), - // Schedule::Chunk(size) => inner.sched_chunk(am,size), - // Schedule::Guided => inner.sched_guided(am), - // Schedule::WorkStealing => inner.sched_work_stealing(am), - // }; - // reqs.blocking_wait() - // } } }; } @@ -155,8 +92,7 @@ impl DistIteratorLauncher for UnsafeArrayInner { iter: iter.iter_clone(Sealed), op, } - ]; - [()]); + ]); consumer_impl!( for_each_async(iter: &I, op: F); @@ -167,68 +103,62 @@ impl DistIteratorLauncher for UnsafeArrayInner { iter: iter.iter_clone(Sealed), op, } - ]; - [()] + ] ); consumer_impl!( - reduce( iter: &I, op: F); - [DistIterReduceHandle]; - [I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static]; - [ - Reduce { - iter: iter.iter_clone(Sealed), - op, - } - ]; - [Option]); + reduce( iter: &I, op: F); + [DistIterReduceHandle]; + [I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static]; + [ + Reduce { + iter: iter.iter_clone(Sealed), + op, + } + ]); consumer_impl!( - collect( iter: &I, d: Distribution); - [DistIterCollectHandle]; - [I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static,]; - [ - Collect { - iter: iter.iter_clone(Sealed).monotonic(), - distribution: d, - _phantom: PhantomData, - } - ]; - [A]); + collect( iter: &I, d: Distribution); + [DistIterCollectHandle]; + [I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static,]; + [ + Collect { + iter: iter.iter_clone(Sealed).monotonic(), + distribution: d, + _phantom: PhantomData, + } + ]); consumer_impl!( - collect_async( iter: &I, d: Distribution); - [DistIterCollectHandle]; - [I: DistributedIterator + 'static, I::Item: Future + Send + 'static,B: Dist + ArrayOps,A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static,]; - [ - CollectAsync { - iter: iter.iter_clone(Sealed).monotonic(), - distribution: d, - _phantom: PhantomData, - } - ]; - [A]); + collect_async( iter: &I, d: Distribution); + [DistIterCollectHandle]; + [I: DistributedIterator + 'static, I::Item: Future + Send + 'static,B: Dist + ArrayOps,A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static,]; + [ + CollectAsync { + iter: iter.iter_clone(Sealed).monotonic(), + distribution: d, + _phantom: PhantomData, + } + ]); consumer_impl!( - count( iter: &I); - [DistIterCountHandle]; - [I: DistributedIterator + 'static ]; - [ - Count { - iter: iter.iter_clone(Sealed), - } - ]; - [usize]); + count( iter: &I); + [DistIterCountHandle]; + [I: DistributedIterator + 'static ]; + [ + Count { + iter: iter.iter_clone(Sealed), + } + ]); consumer_impl!( - sum(iter: &I); - [DistIterSumHandle]; - [I: DistributedIterator + 'static, I::Item: Dist + ArrayOps + std::iter::Sum, ]; - [ - Sum { - iter: iter.iter_clone(Sealed), - } - ]; - [I::Item]); + sum(iter: &I); + [DistIterSumHandle]; + [I: 
DistributedIterator + 'static, I::Item: Dist + ArrayOps + std::iter::Sum, ]; + [ + Sum { + iter: iter.iter_clone(Sealed), + } + ]); fn team(&self) -> Pin> { self.data.team.clone() diff --git a/src/array/unsafe/iteration/local.rs b/src/array/unsafe/iteration/local.rs index 53cc482f..ff52ac26 100644 --- a/src/array/unsafe/iteration/local.rs +++ b/src/array/unsafe/iteration/local.rs @@ -17,7 +17,7 @@ use std::sync::Arc; impl LocalIteratorLauncher for UnsafeArray {} macro_rules! consumer_impl { - ($name:ident<$($generics:ident),*>($($arg:ident : $arg_ty:ty),*); [$return_type:ident$(<$($ret_gen:ty),*>)?]; [$($bounds:tt)+]; [$($am:tt)*]; [$(-> $($blocking_ret:tt)*)?] ) => { + ($name:ident<$($generics:ident),*>($($arg:ident : $arg_ty:ty),*); [$return_type:ident$(<$($ret_gen:ty),*>)?]; [$($bounds:tt)+]; [$($am:tt)*] ) => { paste! { fn $name<$($generics),*>(&self, $($arg : $arg_ty),*) -> $return_type$(<$($ret_gen),*>)? where @@ -47,49 +47,6 @@ macro_rules! consumer_impl { }}); $return_type::new(reqs_future,self) } - - // fn []<$($generics),*>(&self, $($arg : $arg_ty),*) $(-> $($blocking_ret)*)? - // where - // $($bounds)+ - // { - - // self.[](Schedule::Static, $($arg),*) - // } - - - // fn []<$($generics),*>( - // &self, - // sched: Schedule, - // $($arg : $arg_ty),* - // ) $(-> $($blocking_ret)*)? - // where - // $($bounds)+ - // { - // if std::thread::current().id() != *crate::MAIN_THREAD { - // let name = stringify!{$name}; - // let msg = format!(" - // [LAMELLAR WARNING] You are calling `blocking_{name}[_with_schedule]` from within an async context which may lead to deadlock, it is recommended that you use `{name}[_with_schedule]().await;` instead! - // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() - // ); - // if let Some(val) = config().blocking_call_warning { - // if val { - // println!("{msg}"); - // } - // } else { - // println!("{msg}"); - // } - // } - // let am = $($am)*; - // let inner = self.clone(); - // let reqs = match sched { - // Schedule::Static => inner.sched_static(am), - // Schedule::Dynamic => inner.sched_dynamic(am), - // Schedule::Chunk(size) => inner.sched_chunk(am,size), - // Schedule::Guided => inner.sched_guided(am), - // Schedule::WorkStealing => inner.sched_work_stealing(am), - // }; - // reqs.blocking_wait() - // } } }; } @@ -121,8 +78,7 @@ impl LocalIteratorLauncher for UnsafeArrayInner { iter: iter.iter_clone(Sealed), op, } - ]; - [] + ] ); consumer_impl!( @@ -134,8 +90,7 @@ impl LocalIteratorLauncher for UnsafeArrayInner { iter: iter.iter_clone(Sealed), op, } - ]; - [] + ] ); consumer_impl!( @@ -147,8 +102,7 @@ impl LocalIteratorLauncher for UnsafeArrayInner { iter: iter.iter_clone(Sealed), op, } - ]; - [-> Option] + ] ); consumer_impl!( @@ -161,8 +115,7 @@ impl LocalIteratorLauncher for UnsafeArrayInner { distribution: d, _phantom: PhantomData, } - ]; - [-> A] + ] ); consumer_impl!( @@ -175,8 +128,7 @@ impl LocalIteratorLauncher for UnsafeArrayInner { distribution: d, _phantom: PhantomData, } - ]; - [-> A] + ] ); consumer_impl!( @@ -187,20 +139,18 @@ impl LocalIteratorLauncher for UnsafeArrayInner { Count { iter: iter.iter_clone(Sealed), } - ]; - [-> usize] + ] ); consumer_impl!( - sum(iter: &I); - [LocalIterSumHandle]; - [I: LocalIterator + 'static, I::Item: SyncSend + std::iter::Sum + for<'a> std::iter::Sum<&'a I::Item> , ]; - [ - Sum { - iter: iter.iter_clone(Sealed), - } - ]; - [-> I::Item]); + sum(iter: &I); + [LocalIterSumHandle]; + [I: 
LocalIterator + 'static, I::Item: SyncSend + std::iter::Sum + for<'a> std::iter::Sum<&'a I::Item> , ]; + [ + Sum { + iter: iter.iter_clone(Sealed), + } + ]); fn team(&self) -> Pin> { self.data.team.clone() diff --git a/src/array/unsafe/operations.rs b/src/array/unsafe/operations.rs index d43087f4..60cf85fe 100644 --- a/src/array/unsafe/operations.rs +++ b/src/array/unsafe/operations.rs @@ -347,7 +347,7 @@ impl UnsafeArray { VecDeque::new() }; ArrayBatchOpHandle { - _array: byte_array, + array: byte_array, reqs: res, } } diff --git a/src/array/unsafe/rdma.rs b/src/array/unsafe/rdma.rs index c290b62f..8279d46c 100644 --- a/src/array/unsafe/rdma.rs +++ b/src/array/unsafe/rdma.rs @@ -631,7 +631,7 @@ impl UnsafeArray { match buf.team_try_into(&self.team_rt()) { Ok(buf) => self.internal_get(index, buf), Err(_) => ArrayRdmaHandle { - _array: self.as_lamellar_byte_array(), + array: self.as_lamellar_byte_array(), reqs: VecDeque::new(), }, } @@ -641,7 +641,7 @@ impl UnsafeArray { let buf: OneSidedMemoryRegion = self.team_rt().alloc_one_sided_mem_region(1); self.blocking_get(index, &buf); ArrayRdmaAtHandle { - _array: self.as_lamellar_byte_array(), + array: self.as_lamellar_byte_array(), req: None, buf: buf, } @@ -734,7 +734,7 @@ impl LamellarArrayInternalGet for UnsafeArray { reqs }; ArrayRdmaHandle { - _array: self.as_lamellar_byte_array(), + array: self.as_lamellar_byte_array(), reqs: reqs, } } @@ -755,7 +755,7 @@ impl LamellarArrayInternalPut for UnsafeArray { Distribution::Cyclic => self.cyclic_op(ArrayRdmaCmd::PutAm, index, buf.into()), }; ArrayRdmaHandle { - _array: self.as_lamellar_byte_array(), + array: self.as_lamellar_byte_array(), reqs: reqs, } } @@ -770,7 +770,7 @@ impl LamellarArrayPut for UnsafeArray { match buf.team_try_into(&self.team_rt()) { Ok(buf) => self.internal_put(index, buf), Err(_) => ArrayRdmaHandle { - _array: self.as_lamellar_byte_array(), + array: self.as_lamellar_byte_array(), reqs: VecDeque::new(), }, } diff --git a/src/lamellar_task_group.rs b/src/lamellar_task_group.rs index 08552158..40590b49 100644 --- a/src/lamellar_task_group.rs +++ b/src/lamellar_task_group.rs @@ -116,6 +116,19 @@ impl TaskGroupAmHandle { } } } + + /// This method will spawn the associated Active Message on the work queue, + /// initiating the remote operation. + /// + /// This function returns a handle that can be used to wait for the operation to complete + #[must_use = "this function returns a future used to poll for completion. If ignored/dropped the only way to ensure completion is calling 'wait_all()' on the world or array"] + pub fn spawn(self) -> LamellarTask { + self.inner.scheduler.clone().spawn_task(self) + } + /// This method will block the calling thread until the associated Array Operation completes + pub fn block(self) -> T { + self.inner.scheduler.clone().block_on(self) + } } impl LamellarRequest for TaskGroupAmHandle { @@ -276,6 +289,19 @@ impl TaskGroupMultiAmHandle { } } } + + /// This method will spawn the associated Active Message on the work queue, + /// initiating the remote operation. + /// + /// This function returns a handle that can be used to wait for the operation to complete + #[must_use = "this function returns a future used to poll for completion. 
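Task-group handles get the same `spawn`/`block` pair. A rough sketch of driving one, where `task_group`, the active message type `MyAm`, and the `exec_am_all` call are assumptions used only for illustration (they are not part of this hunk):
```
// hypothetical: `task_group` is a LamellarTaskGroup, `MyAm` an active message type
let handle = task_group.exec_am_all(MyAm { val: 1 }); // assumed API, for illustration only
// enqueue now and keep a task handle...
let task = handle.spawn();
let results = world.block_on(task);
// ...or block the calling thread directly: `let results = handle.block();`
println!("received {} results", results.len());
```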
If ignored/dropped the only way to ensure completion is calling 'wait_all()' on the world or array"] + pub fn spawn(self) -> LamellarTask> { + self.inner.scheduler.clone().spawn_task(self) + } + /// This method will block the calling thread until the associated Array Operation completes + pub fn block(self) -> Vec { + self.inner.scheduler.clone().block_on(self) + } } impl LamellarRequest for TaskGroupMultiAmHandle { @@ -412,6 +438,21 @@ impl TaskGroupLocalAmHandle { } } +impl TaskGroupLocalAmHandle { + /// This method will spawn the associated Active Message on the work queue, + /// initiating the remote operation. + /// + /// This function returns a handle that can be used to wait for the operation to complete + #[must_use = "this function returns a future used to poll for completion. If ignored/dropped the only way to ensure completion is calling 'wait_all()' on the world or array"] + pub fn spawn(self) -> LamellarTask { + self.inner.scheduler.clone().spawn_task(self) + } + /// This method will block the calling thread until the associated Array Operation completes + pub fn block(self) -> T { + self.inner.scheduler.clone().block_on(self) + } +} + impl LamellarRequest for TaskGroupLocalAmHandle { fn blocking_wait(self) -> Self::Output { let mut res = self.inner.data.lock().remove(&self.sub_id); diff --git a/src/lamellar_team.rs b/src/lamellar_team.rs index 7ff125ad..e31e7225 100644 --- a/src/lamellar_team.rs +++ b/src/lamellar_team.rs @@ -1364,6 +1364,15 @@ impl LamellarTeamRT { .fetch_sub(cnt, Ordering::SeqCst); } + pub(crate) fn spawn(&self, task: F) -> LamellarTask + where + F: Future + Send + 'static, + F::Output: Send, + { + assert!(self.panic.load(Ordering::SeqCst) == 0); + self.scheduler.spawn_task(task) + } + //#[tracing::instrument(skip_all)] pub(crate) fn wait_all(&self) { let mut exec_task = true; diff --git a/src/scheduler.rs b/src/scheduler.rs index e9111120..155624ef 100755 --- a/src/scheduler.rs +++ b/src/scheduler.rs @@ -126,7 +126,7 @@ impl Drop for LamellarTaskInner { } LamellarTaskInner::AsyncStdTask(_task) => {} #[cfg(feature = "tokio-executor")] - LamellarTaskInner::TokioTask(task) => {} + LamellarTaskInner::TokioTask(_task) => {} } } } @@ -145,7 +145,10 @@ impl Future for LamellarTaskInner { } LamellarTaskInner::AsyncStdTask(task) => Pin::new_unchecked(task).poll(cx), #[cfg(feature = "tokio-executor")] - LamellarTaskInner::TokioTask(task) => Pin::new_unchecked(task).poll(cx), + LamellarTaskInner::TokioTask(task) => match Pin::new_unchecked(task).poll(cx) { + Poll::Pending => Poll::Pending, + Poll::Ready(res) => Poll::Ready(res.expect("tokio task failed")), + }, } } } diff --git a/tests/array/arithmetic_ops/add_test.rs b/tests/array/arithmetic_ops/add_test.rs index ab6f102d..64c016e2 100644 --- a/tests/array/arithmetic_ops/add_test.rs +++ b/tests/array/arithmetic_ops/add_test.rs @@ -105,7 +105,7 @@ macro_rules! add_test{ for idx in 0..array.len(){ for _i in 0..(pe_max_val as usize){ - let _ = array.add(idx,(10_usize.pow((my_pe*2)as u32)) as $t); + let _ = array.add(idx,(10_usize.pow((my_pe*2)as u32)) as $t).spawn(); } } array.wait_all(); @@ -134,7 +134,7 @@ macro_rules! add_test{ indices.shuffle(&mut rng); for idx in indices.iter() {//0..num_updates{ // let idx = rand_idx.sample(&mut rng); - let _ = array.add(*idx,(10_usize.pow((my_pe*2)as u32)) as $t); + let _ = array.add(*idx,(10_usize.pow((my_pe*2)as u32)) as $t).spawn(); } array.wait_all(); array.barrier(); @@ -169,7 +169,7 @@ macro_rules! 
add_test{ sub_array.barrier(); for idx in 0..sub_array.len(){ for _i in 0..(pe_max_val as usize){ - let _ = sub_array.add(idx,(10_usize.pow((my_pe*2)as u32)) as $t); + let _ = sub_array.add(idx,(10_usize.pow((my_pe*2)as u32)) as $t).spawn(); } } sub_array.wait_all(); @@ -195,7 +195,7 @@ macro_rules! add_test{ indices.shuffle(&mut rng); for idx in indices.iter(){ // in 0..num_updates{ // let idx = rand_idx.sample(&mut rng); - let _ = sub_array.add(*idx,(10_usize.pow((my_pe*2)as u32)) as $t); + let _ = sub_array.add(*idx,(10_usize.pow((my_pe*2)as u32)) as $t).spawn(); } sub_array.wait_all(); sub_array.barrier(); @@ -231,7 +231,7 @@ macro_rules! add_test{ sub_array.barrier(); for idx in 0..sub_array.len(){ for _i in 0..(pe_max_val as usize){ - let _ = sub_array.add(idx,(10_usize.pow((my_pe*2)as u32)) as $t); + let _ = sub_array.add(idx,(10_usize.pow((my_pe*2)as u32)) as $t).spawn(); } } sub_array.wait_all(); @@ -257,7 +257,7 @@ macro_rules! add_test{ indices.shuffle(&mut rng); for idx in indices.iter() {//0..num_updates{ // let idx = rand_idx.sample(&mut rng); - let _ = sub_array.add(*idx,(10_usize.pow((my_pe*2)as u32)) as $t); + let _ = sub_array.add(*idx,(10_usize.pow((my_pe*2)as u32)) as $t).spawn(); } sub_array.wait_all(); sub_array.barrier(); @@ -349,51 +349,51 @@ macro_rules! input_test{ input_array.print(); //individual T------------------------------ for i in 0..array.len(){ - let _ = array.batch_add(i,1); + let _ = array.batch_add(i,1).spawn(); } check_results!($array,array,num_pes,"T"); println!("passed T"); //individual T------------------------------ for i in 0..array.len(){ - let _ = array.batch_add(&i,1); + let _ = array.batch_add(&i,1).spawn(); } check_results!($array,array,num_pes,"&T"); println!("passed &T"); //&[T]------------------------------ let vec=(0..array.len()).collect::>(); let slice = &vec[..]; - let _ = array.batch_add(slice,1); + let _ = array.batch_add(slice,1).spawn(); check_results!($array,array,num_pes,"&[T]"); println!("passed &[T]"); //scoped &[T]------------------------------ { let vec=(0..array.len()).collect::>(); let slice = &vec[..]; - let _ = array.batch_add(slice,1); + let _ = array.batch_add(slice,1).spawn(); } check_results!($array,array,num_pes,"scoped &[T]"); println!("passed scoped &[T]"); // Vec------------------------------ let vec=(0..array.len()).collect::>(); - let _ = array.batch_add(vec,1); + let _ = array.batch_add(vec,1).spawn(); check_results!($array,array,num_pes,"Vec"); println!("passed Vec"); // &Vec------------------------------ let vec=(0..array.len()).collect::>(); - let _ = array.batch_add(&vec,1); + let _ = array.batch_add(&vec,1).spawn(); check_results!($array,array,num_pes,"&Vec"); println!("passed &Vec"); // Scoped Vec------------------------------ { let vec=(0..array.len()).collect::>(); - let _ = array.batch_add(vec,1); + let _ = array.batch_add(vec,1).spawn(); } check_results!($array,array,num_pes,"scoped Vec"); println!("passed scoped Vec"); // Scoped &Vec------------------------------ { let vec=(0..array.len()).collect::>(); - let _ = array.batch_add(&vec,1); + let _ = array.batch_add(&vec,1).spawn(); } check_results!($array,array,num_pes,"scoped &Vec"); println!("passed scoped &Vec"); @@ -406,7 +406,7 @@ macro_rules! input_test{ for i in 0..array.len(){ slice[i]=i; } - let _ = array.batch_add(slice,1); + let _ = array.batch_add(slice,1).spawn(); check_results!($array,array,num_pes,"LMR"); println!("passed LMR"); } @@ -421,7 +421,7 @@ macro_rules! 
input_test{ slice[i]=i; } - let _ = array.batch_add(slice,1); + let _ = array.batch_add(slice,1).spawn(); check_results!($array,array,num_pes,"SMR"); println!("passed SMR"); } @@ -430,7 +430,7 @@ macro_rules! input_test{ // array.add(input_array.clone(),1); // check_results!($array,array,num_pes,"UnsafeArray"); // UnsafeArray------------------------------ - let _ = array.batch_add(unsafe{input_array.local_data()},1); + let _ = array.batch_add(unsafe{input_array.local_data()},1).spawn(); check_results!($array,array,num_pes,"&UnsafeArray"); println!("passed &UnsafeArray"); @@ -439,7 +439,7 @@ macro_rules! input_test{ // array.add(input_array.clone(),1); // check_results!($array,array,num_pes,"ReadOnlyArray"); // ReadOnlyArray------------------------------ - let _ = array.batch_add(input_array.local_data(),1); + let _ = array.batch_add(input_array.local_data(),1).spawn(); check_results!($array,array,num_pes,"&ReadOnlyArray"); println!("passed &ReadOnlyArray"); @@ -448,7 +448,7 @@ macro_rules! input_test{ // array.add(input_array.clone(),1); // check_results!($array,array,num_pes,"AtomicArray"); // AtomicArray------------------------------ - let _ = array.batch_add(&input_array.local_data(),1); + let _ = array.batch_add(&input_array.local_data(),1).spawn(); check_results!($array,array,num_pes,"&AtomicArray"); println!("passed &AtomicArray"); @@ -457,7 +457,7 @@ macro_rules! input_test{ // array.add(input_array.clone(),1); // check_results!($array,array,num_pes,"LocalLockArray"); // LocalLockArray------------------------------ - let _ = array.batch_add(&input_array.blocking_read_local_data(),1); + let _ = array.batch_add(&input_array.blocking_read_local_data(),1).spawn(); check_results!($array,array,num_pes,"&LocalLockArray"); println!("passed &LocalLockArray"); @@ -466,7 +466,7 @@ macro_rules! input_test{ // array.add(input_array.clone(),1); // check_results!($array,array,num_pes,"GlobalLockArray"); // GlobalLockArray------------------------------ - let _ = array.batch_add(&input_array.blocking_read_local_data(),1); + let _ = array.batch_add(&input_array.blocking_read_local_data(),1).spawn(); check_results!($array,array,num_pes,"&GlobalLockArray"); println!("passed &GlobalLockArray"); } diff --git a/tests/array/arithmetic_ops/div_test.rs b/tests/array/arithmetic_ops/div_test.rs index d3250255..2b4a3229 100644 --- a/tests/array/arithmetic_ops/div_test.rs +++ b/tests/array/arithmetic_ops/div_test.rs @@ -96,7 +96,7 @@ macro_rules! div_test{ // array.print(); for idx in 0..array.len(){ for _i in 0..(max_updates as usize){ - let _ = array.div(idx,2 as $t); + let _ = array.div(idx,2 as $t).spawn(); } } array.wait_all(); @@ -123,7 +123,7 @@ macro_rules! div_test{ // // sub_array.print(); for idx in 0..sub_array.len(){ for _i in 0..(max_updates as usize){ - let _ = sub_array.div(idx,2 as $t); + let _ = sub_array.div(idx,2 as $t).spawn(); } } sub_array.wait_all(); @@ -149,7 +149,7 @@ macro_rules! div_test{ sub_array.barrier(); for idx in 0..sub_array.len(){ for _i in 0..(max_updates as usize){ - let _ = sub_array.div(idx,2 as $t); + let _ = sub_array.div(idx,2 as $t).spawn(); } } sub_array.wait_all(); diff --git a/tests/array/arithmetic_ops/mul_test.rs b/tests/array/arithmetic_ops/mul_test.rs index c6abedca..c3a19b56 100644 --- a/tests/array/arithmetic_ops/mul_test.rs +++ b/tests/array/arithmetic_ops/mul_test.rs @@ -103,7 +103,7 @@ macro_rules! 
mul_test{ // array.print(); for idx in 0..array.len(){ for _i in 0..(max_updates as usize){ - let _ = array.mul(idx,2 as $t); + let _ = array.mul(idx,2 as $t).spawn(); } } array.wait_all(); @@ -130,7 +130,7 @@ macro_rules! mul_test{ // // sub_array.print(); for idx in 0..sub_array.len(){ for _i in 0..(max_updates as usize){ - let _ = sub_array.mul(idx,2 as $t); + let _ = sub_array.mul(idx,2 as $t).spawn(); } } sub_array.wait_all(); @@ -156,7 +156,7 @@ macro_rules! mul_test{ sub_array.barrier(); for idx in 0..sub_array.len(){ for _i in 0..(max_updates as usize){ - let _ = sub_array.mul(idx,2 as $t); + let _ = sub_array.mul(idx,2 as $t).spawn(); } } sub_array.wait_all(); diff --git a/tests/array/arithmetic_ops/sub_test.rs b/tests/array/arithmetic_ops/sub_test.rs index f216540f..f3b78a7b 100644 --- a/tests/array/arithmetic_ops/sub_test.rs +++ b/tests/array/arithmetic_ops/sub_test.rs @@ -104,7 +104,7 @@ macro_rules! sub_test{ for idx in 0..array.len(){ for _i in 0..(pe_max_val as usize){ - let _ = array.sub(idx,1 as $t); + let _ = array.sub(idx,1 as $t).spawn(); } } array.wait_all(); @@ -126,7 +126,7 @@ macro_rules! sub_test{ for _i in 0..num_updates as usize{ let idx = rand_idx.sample(&mut rng); - let _ = array.sub(idx,1 as $t); + let _ = array.sub(idx,1 as $t).spawn(); } array.wait_all(); array.barrier(); @@ -152,7 +152,7 @@ macro_rules! sub_test{ // sub_array.print(); for idx in 0..sub_array.len(){ for _i in 0..(pe_max_val as usize){ - let _ = sub_array.sub(idx,1 as $t); + let _ = sub_array.sub(idx,1 as $t).spawn(); } } sub_array.wait_all(); @@ -174,7 +174,7 @@ macro_rules! sub_test{ for _i in 0..num_updates as usize{ let idx = rand_idx.sample(&mut rng); - let _ = sub_array.sub(idx,1 as $t); + let _ = sub_array.sub(idx,1 as $t).spawn(); } sub_array.wait_all(); sub_array.barrier(); @@ -200,7 +200,7 @@ macro_rules! sub_test{ sub_array.barrier(); for idx in 0..sub_array.len(){ for _i in 0..(pe_max_val as usize){ - let _ = sub_array.sub(idx,1 as $t); + let _ = sub_array.sub(idx,1 as $t).spawn(); } } sub_array.wait_all(); @@ -222,7 +222,7 @@ macro_rules! sub_test{ for _i in 0..num_updates as usize{ let idx = rand_idx.sample(&mut rng); - let _ = sub_array.sub(idx,1 as $t); + let _ = sub_array.sub(idx,1 as $t).spawn(); } sub_array.wait_all(); sub_array.barrier(); diff --git a/tests/array/atomic_ops/load_store_test.rs b/tests/array/atomic_ops/load_store_test.rs index 3e84dfea..88ccfaaa 100644 --- a/tests/array/atomic_ops/load_store_test.rs +++ b/tests/array/atomic_ops/load_store_test.rs @@ -81,7 +81,7 @@ macro_rules! load_store_test{ array.barrier(); for idx in 0..array.len(){ if idx%num_pes == my_pe{ - let _ = array.store(idx,my_pe as $t); + let _ = array.store(idx,my_pe as $t).spawn(); } } array.wait_all(); @@ -114,7 +114,7 @@ macro_rules! load_store_test{ sub_array.barrier(); for idx in 0..sub_array.len(){ if idx%num_pes == my_pe{ - let _ = sub_array.store(idx,my_pe as $t); + let _ = sub_array.store(idx,my_pe as $t).spawn(); } } sub_array.wait_all(); @@ -150,7 +150,7 @@ macro_rules! load_store_test{ sub_array.barrier(); for idx in 0..sub_array.len(){ if idx%num_pes == my_pe{ - let _ = sub_array.store(idx,my_pe as $t); + let _ = sub_array.store(idx,my_pe as $t).spawn(); } } sub_array.wait_all(); diff --git a/tests/array/bitwise_ops/and_test.rs b/tests/array/bitwise_ops/and_test.rs index 2006fcad..94ed932d 100644 --- a/tests/array/bitwise_ops/and_test.rs +++ b/tests/array/bitwise_ops/and_test.rs @@ -84,7 +84,7 @@ macro_rules! 
and_test{ array.barrier(); let my_val = !(1 as $t << my_pe); for idx in 0..array.len(){ - let _ = array.bit_and(idx,my_val); + let _ = array.bit_and(idx,my_val).spawn(); } array.wait_all(); @@ -112,7 +112,7 @@ macro_rules! and_test{ sub_array.barrier(); // sub_array.print(); for idx in 0..sub_array.len(){ - let _ = sub_array.bit_and(idx,my_val); + let _ = sub_array.bit_and(idx,my_val).spawn(); } sub_array.wait_all(); sub_array.barrier(); @@ -140,7 +140,7 @@ macro_rules! and_test{ let sub_array = array.sub_array(start_i..end_i); sub_array.barrier(); for idx in 0..sub_array.len(){ - let _ = sub_array.bit_and(idx,my_val); + let _ = sub_array.bit_and(idx,my_val).spawn(); } sub_array.wait_all(); sub_array.barrier(); diff --git a/tests/array/bitwise_ops/or_test.rs b/tests/array/bitwise_ops/or_test.rs index 37c87dad..fa52c7a3 100644 --- a/tests/array/bitwise_ops/or_test.rs +++ b/tests/array/bitwise_ops/or_test.rs @@ -84,7 +84,7 @@ macro_rules! or_test{ array.barrier(); let my_val = 1 as $t << my_pe; for idx in 0..array.len(){ - let _ = array.bit_or(idx,my_val); + let _ = array.bit_or(idx,my_val).spawn(); } array.wait_all(); @@ -112,7 +112,7 @@ macro_rules! or_test{ sub_array.barrier(); // sub_array.print(); for idx in 0..sub_array.len(){ - let _ = sub_array.bit_or(idx,my_val); + let _ = sub_array.bit_or(idx,my_val).spawn(); } sub_array.wait_all(); sub_array.barrier(); @@ -140,7 +140,7 @@ macro_rules! or_test{ let sub_array = array.sub_array(start_i..end_i); sub_array.barrier(); for idx in 0..sub_array.len(){ - let _ = sub_array.bit_or(idx,my_val); + let _ = sub_array.bit_or(idx,my_val).spawn(); } sub_array.wait_all(); sub_array.barrier(); diff --git a/tests/array/bitwise_ops/xor_test.rs b/tests/array/bitwise_ops/xor_test.rs index 55902542..4bbe6472 100644 --- a/tests/array/bitwise_ops/xor_test.rs +++ b/tests/array/bitwise_ops/xor_test.rs @@ -82,7 +82,7 @@ macro_rules! xor_test{ array.barrier(); let my_val = 1 as $t << my_pe; for idx in 0..array.len(){ - let _ = array.bit_xor(idx,my_val); + let _ = array.bit_xor(idx,my_val).spawn(); } array.wait_all(); @@ -110,7 +110,7 @@ macro_rules! xor_test{ sub_array.barrier(); // sub_array.print(); for idx in 0..sub_array.len(){ - let _ = sub_array.bit_xor(idx,my_val); + let _ = sub_array.bit_xor(idx,my_val).spawn(); } sub_array.wait_all(); sub_array.barrier(); @@ -138,7 +138,7 @@ macro_rules! xor_test{ let sub_array = array.sub_array(start_i..end_i); sub_array.barrier(); for idx in 0..sub_array.len(){ - let _ = sub_array.bit_xor(idx,my_val); + let _ = sub_array.bit_xor(idx,my_val).spawn(); } sub_array.wait_all(); sub_array.barrier(); diff --git a/tests/array/rdma/get_test.rs b/tests/array/rdma/get_test.rs index b03ca058..e14dc414 100644 --- a/tests/array/rdma/get_test.rs +++ b/tests/array/rdma/get_test.rs @@ -140,7 +140,7 @@ macro_rules! get_test{ let num_txs = mem_seg_len/tx_size; for tx in (0..num_txs){ // unsafe{println!("tx_size {:?} tx {:?} sindex: {:?} eindex: {:?} {:?}",tx_size,tx, tx*tx_size,std::cmp::min(mem_seg_len,(tx+1)*tx_size),&shared_mem_region.sub_region(tx*tx_size..std::cmp::min(mem_seg_len,(tx+1)*tx_size)).as_slice());} - unsafe {let _ = array.get(tx*tx_size,&shared_mem_region.sub_region(tx*tx_size..std::cmp::min(mem_seg_len,(tx+1)*tx_size)));} + unsafe {let _ = array.get(tx*tx_size,&shared_mem_region.sub_region(tx*tx_size..std::cmp::min(mem_seg_len,(tx+1)*tx_size))).spawn();} } array.wait_all(); array.barrier(); @@ -180,7 +180,7 @@ macro_rules! 
get_test{ let num_txs = half_len/tx_size; for tx in (0..num_txs){ // unsafe{println!("tx_size {:?} tx {:?} sindex: {:?} eindex: {:?} {:?}",tx_size,tx, tx*tx_size,std::cmp::min(half_len,(tx+1)*tx_size),&shared_mem_region.sub_region(tx*tx_size..std::cmp::min(half_len,(tx+1)*tx_size)).as_slice());} - unsafe {let _ = sub_array.get(tx*tx_size,&shared_mem_region.sub_region(tx*tx_size..std::cmp::min(half_len,(tx+1)*tx_size)));} + unsafe {let _ = sub_array.get(tx*tx_size,&shared_mem_region.sub_region(tx*tx_size..std::cmp::min(half_len,(tx+1)*tx_size))).spawn();} } sub_array.wait_all(); sub_array.barrier(); @@ -225,7 +225,7 @@ macro_rules! get_test{ let num_txs = len/tx_size; for tx in (0..num_txs){ // unsafe{println!("tx_size {:?} tx {:?} sindex: {:?} eindex: {:?} {:?}",tx_size,tx, tx*tx_size,std::cmp::min(len,(tx+1)*tx_size),&shared_mem_region.sub_region(tx*tx_size..std::cmp::min(mem_seg_len,(tx+1)*tx_size)).as_slice());} - unsafe {let _ = sub_array.get(tx*tx_size,&shared_mem_region.sub_region(tx*tx_size..std::cmp::min(len,(tx+1)*tx_size))); } + unsafe {let _ = sub_array.get(tx*tx_size,&shared_mem_region.sub_region(tx*tx_size..std::cmp::min(len,(tx+1)*tx_size))).spawn(); } } sub_array.wait_all(); sub_array.barrier(); diff --git a/tests/array/rdma/put_test.rs b/tests/array/rdma/put_test.rs index 12c21a08..f5643207 100644 --- a/tests/array/rdma/put_test.rs +++ b/tests/array/rdma/put_test.rs @@ -78,7 +78,7 @@ macro_rules! put_test{ for tx in (my_pe..num_txs).step_by(num_pes){ // unsafe{println!("tx_size {:?} tx {:?} sindex: {:?} eindex: {:?} {:?}",tx_size,tx, tx*tx_size,std::cmp::min(mem_seg_len,(tx+1)*tx_size),&shared_mem_region.sub_region(tx*tx_size..std::cmp::min(mem_seg_len,(tx+1)*tx_size)).as_slice());} #[allow(unused_unsafe)] - unsafe {let _ = array.put(tx*tx_size,&shared_mem_region.sub_region(tx*tx_size..std::cmp::min(mem_seg_len,(tx+1)*tx_size)));} + unsafe {let _ = array.put(tx*tx_size,&shared_mem_region.sub_region(tx*tx_size..std::cmp::min(mem_seg_len,(tx+1)*tx_size))).spawn();} } array.wait_all(); array.barrier(); @@ -112,7 +112,7 @@ macro_rules! put_test{ for tx in (my_pe..num_txs).step_by(num_pes){ // unsafe{println!("tx_size {:?} tx {:?} sindex: {:?} eindex: {:?} {:?}",tx_size,tx, tx*tx_size,std::cmp::min(half_len,(tx+1)*tx_size),&shared_mem_region.sub_region(tx*tx_size..std::cmp::min(half_len,(tx+1)*tx_size)).as_slice());} #[allow(unused_unsafe)] - unsafe {let _ = sub_array.put(tx*tx_size,&shared_mem_region.sub_region(tx*tx_size..std::cmp::min(half_len,(tx+1)*tx_size)));} + unsafe {let _ = sub_array.put(tx*tx_size,&shared_mem_region.sub_region(tx*tx_size..std::cmp::min(half_len,(tx+1)*tx_size))).spawn();} } array.wait_all(); sub_array.barrier(); @@ -149,7 +149,7 @@ macro_rules! put_test{ for tx in (my_pe..num_txs).step_by(num_pes){ // unsafe{println!("tx_size {:?} tx {:?} sindex: {:?} eindex: {:?} {:?}",tx_size,tx, tx*tx_size,std::cmp::min(len,(tx+1)*tx_size),&shared_mem_region.sub_region(tx*tx_size..std::cmp::min(mem_seg_len,(tx+1)*tx_size)).as_slice());} #[allow(unused_unsafe)] - unsafe {let _ = sub_array.put(tx*tx_size,&shared_mem_region.sub_region(tx*tx_size..std::cmp::min(len,(tx+1)*tx_size)));} + unsafe {let _ = sub_array.put(tx*tx_size,&shared_mem_region.sub_region(tx*tx_size..std::cmp::min(len,(tx+1)*tx_size))).spawn();} } array.wait_all(); sub_array.barrier(); From 8a2ff9820fa10d076fd5409dfe82f34e430d384e Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Fri, 26 Jul 2024 15:40:34 -0700 Subject: [PATCH 061/116] account for spawned tasks when shutting down executor --- .../active_message_examples/am_batch_tests.rs | 2 +- run_examples.sh | 2 +- src/barrier.rs | 7 +- src/lamellae/command_queues.rs | 3 + src/lamellar_team.rs | 17 ++++- src/lamellar_world.rs | 72 +++++-------------- src/scheduler.rs | 54 ++++++++++---- src/scheduler/tokio_executor.rs | 1 + 8 files changed, 82 insertions(+), 76 deletions(-) diff --git a/examples/active_message_examples/am_batch_tests.rs b/examples/active_message_examples/am_batch_tests.rs index de12c2e0..68354ce7 100644 --- a/examples/active_message_examples/am_batch_tests.rs +++ b/examples/active_message_examples/am_batch_tests.rs @@ -396,7 +396,7 @@ fn main() { } } println!("issue time: {:?}", s.elapsed().as_secs_f64()); - println!("cnts: {:?}", cnts); + println!("cnts: {:?} {:?}", cnts, cnts.iter().sum::()); world.wait_all(); println!("local finished time: {:?}", s.elapsed().as_secs_f64()); world.barrier(); diff --git a/run_examples.sh b/run_examples.sh index 95268d48..881b440f 100755 --- a/run_examples.sh +++ b/run_examples.sh @@ -18,7 +18,7 @@ mkdir -p ${results_dir} ln -s ${output_dir}/rofiverbs_lamellae rofiverbs_lamellae -# cargo build --release --features enable-rofi --features tokio-executor --examples -j 20 +cargo build --release --features enable-rofi --features tokio-executor --examples -j 20 cd rofiverbs_lamellae/${local_results_dir} diff --git a/src/barrier.rs b/src/barrier.rs index e15ded71..8d16b106 100644 --- a/src/barrier.rs +++ b/src/barrier.rs @@ -446,9 +446,10 @@ impl Barrier { // impl Drop for Barrier { // fn drop(&mut self) { -// //println!("dropping barrier"); -// // println!("arch: {:?}",Arc::strong_count(&self.arch)); -// //println!("dropped barrier"); +// println!("dropping barrier"); +// println!("lamellae cnt: {:?}", Arc::strong_count(&self.lamellae)); +// println!("scheduler cnt: {:?}", Arc::strong_count(&self.scheduler)); +// println!("dropped barrier"); // } // } diff --git a/src/lamellae/command_queues.rs b/src/lamellae/command_queues.rs index bfd06d17..0ce02f17 100644 --- a/src/lamellae/command_queues.rs +++ b/src/lamellae/command_queues.rs @@ -1371,6 +1371,7 @@ impl CommandQueue { async_std::task::yield_now().await; } // println!("leaving alloc_task task {:?}", scheduler.active()); + // println!("sechduler_new: {:?}", Arc::strong_count(&scheduler)); } //#[tracing::instrument(skip_all)] @@ -1387,6 +1388,7 @@ impl CommandQueue { panic!("received panic from other PE"); } // println!("leaving panic_task task {:?}", scheduler.active()); + // println!("sechduler_new: {:?}", Arc::strong_count(&scheduler)); } //#[tracing::instrument(skip_all)] @@ -1525,6 +1527,7 @@ impl CommandQueue { // print!("{:?} ", cnt.load(Ordering::Relaxed)); // } // println!(""); + // println!("sechduler_new: {:?}", Arc::strong_count(&scheduler)); } //#[tracing::instrument(skip_all)] diff --git a/src/lamellar_team.rs b/src/lamellar_team.rs index e31e7225..a50be34a 100644 --- a/src/lamellar_team.rs +++ b/src/lamellar_team.rs @@ -1375,6 +1375,7 @@ impl LamellarTeamRT { //#[tracing::instrument(skip_all)] pub(crate) fn wait_all(&self) { + // println!("wait_all called on pe: {}", self.world_pe); let mut exec_task = true; if std::thread::current().id() != *crate::MAIN_THREAD { if let Some(val) = config().blocking_call_warning { @@ -1389,6 +1390,12 @@ impl LamellarTeamRT { exec_task = false; } let mut temp_now = Instant::now(); + // println!( + // "in team wait_all mype: {:?} cnt: {:?} {:?}", + // 
self.world_pe, + // self.team_counters.send_req_cnt.load(Ordering::SeqCst), + // self.team_counters.outstanding_reqs.load(Ordering::SeqCst), + // ); while self.panic.load(Ordering::SeqCst) == 0 && (self.team_counters.outstanding_reqs.load(Ordering::SeqCst) > 0 || (self.parent.is_none() @@ -1409,6 +1416,12 @@ impl LamellarTeamRT { temp_now = Instant::now(); } } + // println!( + // "in team wait_all mype: {:?} cnt: {:?} {:?}", + // self.world_pe, + // self.team_counters.send_req_cnt.load(Ordering::SeqCst), + // self.team_counters.outstanding_reqs.load(Ordering::SeqCst), + // ); } pub(crate) async fn await_all(&self) { let mut temp_now = Instant::now(); @@ -2129,7 +2142,7 @@ impl LamellarTeamRT { impl Drop for LamellarTeamRT { //#[tracing::instrument(skip_all)] fn drop(&mut self) { - // println!("LamellarTeamRT Drop"); + println!("LamellarTeamRT Drop"); // println!("sechduler_new: {:?}", Arc::strong_count(&self.scheduler)); // println!("lamellae: {:?}", Arc::strong_count(&self.lamellae)); // println!("arch: {:?}", Arc::strong_count(&self.arch)); @@ -2139,6 +2152,8 @@ impl Drop for LamellarTeamRT { // ); // println!("removing {:?} ", self.team_hash); self.lamellae.free(self.remote_ptr_addr); + // println!("Lamellae Cnt: {:?}", Arc::strong_count(&self.lamellae)); + // println!("scheduler Cnt: {:?}", Arc::strong_count(&self.scheduler)); // println!("LamellarTeamRT dropped {:?}", self.team_hash); // unsafe { // for duration in crate::SERIALIZE_TIMER.iter() { diff --git a/src/lamellar_world.rs b/src/lamellar_world.rs index 109b18d2..727351c0 100644 --- a/src/lamellar_world.rs +++ b/src/lamellar_world.rs @@ -292,6 +292,11 @@ impl Drop for LamellarWorld { if cnt == 1 { // println!("[{:?}] world dropping", self.my_pe); // println!( + // "lamellae cnt {:?} sched cnt {:?}", + // Arc::strong_count(&self.team_rt.lamellae), + // Arc::strong_count(&self.team_rt.scheduler) + // ); + // println!( // "in team destroy mype: {:?} cnt: {:?} {:?}", // self.my_pe, // self._counters.send_req_cnt.load(Ordering::SeqCst), @@ -353,10 +358,15 @@ impl Drop for LamellarWorld { // println!( // "team: {:?} team_rt: {:?}", // Arc::strong_count(&self.team), - // unsafe { Arc::strong_count(&team_rt) } + // unsafe { Arc::strong_count(&self.team_rt) } // ); // println!("counters: {:?}", Arc::strong_count(&self._counters)); + // println!( + // "sechduler_new: {:?}", + // Arc::strong_count(&self.team_rt.scheduler) + // ); + // println!("[{:?}] world dropped", self.my_pe); } // println!("[{:?}] world dropped", self.my_pe); } @@ -434,62 +444,8 @@ impl LamellarWorldBuilder { "lamellar3" => ExecutorType::LamellarWorkStealing3, _ => panic!("[LAMELLAR WARNING]: unexpected executor type, please set LAMELLAR_EXECUTOR to one of the following 'lamellar', 'async_std', or (if tokio-executor feature is enabled, 'tokio'.") }; - // let executor = match std::env::var("LAMELLAR_EXECUTOR") { - // Ok(val) => { - // let executor = val.parse::().unwrap(); - // if executor == 0 { - // ExecutorType::LamellarWorkStealing - // } else if executor == 1 { - // #[cfg(feature = "tokio-executor")] - // { - // ExecutorType::Tokio - // } - // #[cfg(not(feature = "tokio-executor"))] - // { - // println!("[LAMELLAR WARNING]: tokio-executor selected but it is not enabled, defaulting to lamellar work stealing executor"); - // ExecutorType::LamellarWorkStealing - // } - // } else if executor == 2 { - // ExecutorType::LamellarWorkStealing2 - // } else if executor == 3 { - // ExecutorType::LamellarWorkStealing3 - // } else if executor == 4 { - // ExecutorType::AsyncStd 
- // } else { - // println!("[LAMELLAR WARNING]: invalid executor selected defaulting to lamellar work stealing executor"); - // ExecutorType::LamellarWorkStealing - // } - // } - // Err(_) => { - // #[cfg(feature = "tokio-executor")] - // { - // ExecutorType::Tokio - // } - // #[cfg(not(feature = "tokio-executor"))] - // { - // ExecutorType::LamellarWorkStealing - // } - // } - // }; - // println!("executor: {:?}", executor); let num_threads = config().threads; - // let num_threads = match std::env::var("LAMELLAR_THREADS") { - // Ok(n) => { - // if let Ok(num_threads) = n.parse::() { - // if num_threads == 0 { - // panic!("LAMELLAR_THREADS must be greater than 0"); - // } else if num_threads == 1 { - // num_threads - // } else { - // num_threads - 1 - // } - // } else { - // panic!("LAMELLAR_THREADS must be an integer greater than 0"); - // } - // } - // Err(_) => 4, - // }; LamellarWorldBuilder { primary_lamellae: Default::default(), // secondary_lamellae: HashSet::new(), @@ -612,7 +568,11 @@ impl LamellarWorldBuilder { self.num_threads, panic.clone(), )); - // println!("{:?}: create_scheduler", timer.elapsed()); + // println!( + // " create_scheduler cnt {:?}", + // // timer.elapsed(), + // Arc::strong_count(&sched_new) + // ); // timer = std::time::Instant::now(); let lamellae = lamellae_builder.init_lamellae(sched_new.clone()); diff --git a/src/scheduler.rs b/src/scheduler.rs index 155624ef..e3983cd3 100755 --- a/src/scheduler.rs +++ b/src/scheduler.rs @@ -240,11 +240,12 @@ impl Scheduler { let max_ams = self.max_ams.clone(); let am_stall_mark = self.am_stall_mark.fetch_add(1, Ordering::Relaxed); let ame = self.active_message_engine.clone(); + num_ams.fetch_add(1, Ordering::Relaxed); + let _am_id = max_ams.fetch_add(1, Ordering::Relaxed); // println!("am ptr {:p} ", &am); let am_future = async move { // let start_tid = thread::current().id(); - num_ams.fetch_add(1, Ordering::Relaxed); - let _am_id = max_ams.fetch_add(1, Ordering::Relaxed); + // println!("[{:?}] submit work exec req {:?} {:?} TaskId: {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task); // println!("[{:?}] submit_am {:?}", std::thread::current().id(), am_id); ame.process_msg(am, am_stall_mark, false).await; @@ -270,10 +271,11 @@ impl Scheduler { let max_ams = self.max_ams.clone(); let am_stall_mark = self.am_stall_mark.fetch_add(1, Ordering::Relaxed); let ame = self.active_message_engine.clone(); + num_ams.fetch_add(1, Ordering::Relaxed); + let _am_id = max_ams.fetch_add(1, Ordering::Relaxed); let am_future = async move { // let start_tid = thread::current().id(); - num_ams.fetch_add(1, Ordering::Relaxed); - let _am_id = max_ams.fetch_add(1, Ordering::Relaxed); + // println!("[{:?}] submit work exec req {:?} {:?} TaskId: {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task); // println!( // "[{:?}] submit_am_immediate {:?}", @@ -334,10 +336,11 @@ impl Scheduler { let num_ams = self.num_ams.clone(); let max_ams = self.max_ams.clone(); let ame = self.active_message_engine.clone(); + num_ams.fetch_add(1, Ordering::Relaxed); + let _am_id = max_ams.fetch_add(1, Ordering::Relaxed); let am_future = async move { // let start_tid = std::thread::current().id(); - num_ams.fetch_add(1, Ordering::Relaxed); - let _am_id = max_ams.fetch_add(1, Ordering::Relaxed); + // println!("[{:?}] submit work exec req {:?} {:?} TaskId: {:?}", 
std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task); // println!( // "[{:?}] submit_remote_am {:?}", @@ -372,7 +375,16 @@ impl Scheduler { F: Future + Send + 'static, F::Output: Send, { - self.executor.spawn_task(task) + let num_tasks = self.num_tasks.clone(); + let max_tasks = self.max_tasks.clone(); + num_tasks.fetch_add(1, Ordering::Relaxed); + let _task_id = max_tasks.fetch_add(1, Ordering::Relaxed); + let future = async move { + let result = task.await; + num_tasks.fetch_sub(1, Ordering::Relaxed); + result + }; + self.executor.spawn_task(future) } pub(crate) fn submit_task(&self, task: F) @@ -381,10 +393,11 @@ impl Scheduler { { let num_tasks = self.num_tasks.clone(); let max_tasks = self.max_tasks.clone(); + num_tasks.fetch_add(1, Ordering::Relaxed); + let _task_id = max_tasks.fetch_add(1, Ordering::Relaxed); let future = async move { // let start_tid = std::thread::current().id(); - num_tasks.fetch_add(1, Ordering::Relaxed); - let _task_id = max_tasks.fetch_add(1, Ordering::Relaxed); + // println!( // "[{:?}] execing new task {:?}", // std::thread::current().id(), @@ -412,10 +425,11 @@ impl Scheduler { { let num_tasks = self.num_tasks.clone(); let max_tasks = self.max_tasks.clone(); + num_tasks.fetch_add(1, Ordering::Relaxed); + let _task_id = max_tasks.fetch_add(1, Ordering::Relaxed); let future = async move { // let start_tid = std::thread::current().id(); - num_tasks.fetch_add(1, Ordering::Relaxed); - let _task_id = max_tasks.fetch_add(1, Ordering::Relaxed); + // println!( // "[{:?}] execing new task immediate {:?}", // std::thread::current().id(), @@ -443,10 +457,11 @@ impl Scheduler { { let num_tasks = self.num_tasks.clone(); let max_tasks = self.max_tasks.clone(); + num_tasks.fetch_add(1, Ordering::Relaxed); + let _task_id = max_tasks.fetch_add(1, Ordering::Relaxed); let future = async move { // let start_tid = std::thread::current().id(); - num_tasks.fetch_add(1, Ordering::Relaxed); - let _task_id = max_tasks.fetch_add(1, Ordering::Relaxed); + // println!( // "[{:?}] execing new task {:?}", // std::thread::current().id(), @@ -516,7 +531,18 @@ impl Scheduler { } pub(crate) fn shutdown(&self) { let mut timer = std::time::Instant::now(); - while self.panic.load(Ordering::SeqCst) == 0 && self.num_tasks.load(Ordering::Relaxed) > 3 + // println!( + // "shutting down executor panic {:?} num_tasks {:?} max_tasks {:?} num_ams {:?} max_ams {:?}", + // self.panic.load(Ordering::SeqCst), + // self.num_tasks.load(Ordering::Relaxed), + // self.max_tasks.load(Ordering::Relaxed), + // self.num_ams.load(Ordering::Relaxed), + // self.max_ams.load(Ordering::Relaxed), + // ); + while self.panic.load(Ordering::SeqCst) == 0 + && self.num_tasks.load(Ordering::Relaxed) > 3 + && self.num_ams.load(Ordering::Relaxed) > 0 + {} //TODO maybe this should be > 2 { //the Lamellae Comm Task, Lamellae Alloc Task, Lamellar Error Task diff --git a/src/scheduler/tokio_executor.rs b/src/scheduler/tokio_executor.rs index 117b36de..4a43bd0b 100644 --- a/src/scheduler/tokio_executor.rs +++ b/src/scheduler/tokio_executor.rs @@ -61,6 +61,7 @@ impl LamellarExecutor for TokioRt { // #[tracing::instrument(skip_all)] fn shutdown(&self) { // i think we just let tokio do this on drop + // println!("shutting down tokio runtime"); } // #[tracing::instrument(skip_all)] From 2b9b235d969f20aea3de9c3fb13f03c428a4417b Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Sun, 28 Jul 2024 16:33:30 -0700 Subject: [PATCH 062/116] fix barrier_handle state machine indexing --- examples/array_examples/array_am.rs | 31 ++-- src/active_messaging/handle.rs | 3 + src/array.rs | 30 ++-- .../distributed_iterator/consumer/count.rs | 11 +- src/barrier.rs | 151 +++++++----------- src/darc.rs | 133 +++++++++++---- src/darc/global_rw_darc.rs | 60 +++---- src/lamellar_team.rs | 2 +- src/memregion/one_sided.rs | 22 ++- 9 files changed, 260 insertions(+), 183 deletions(-) diff --git a/examples/array_examples/array_am.rs b/examples/array_examples/array_am.rs index a101d056..c0a92fac 100644 --- a/examples/array_examples/array_am.rs +++ b/examples/array_examples/array_am.rs @@ -12,7 +12,7 @@ const ARRAY_LEN: usize = 100; #[lamellar::AmData(Clone)] struct RdmaAM { - array: UnsafeArray, + array: UnsafeArray, orig_pe: usize, index: usize, } @@ -32,13 +32,13 @@ impl LamellarAM for RdmaAM { }); //get the original nodes data - let local = lamellar::world.alloc_one_sided_mem_region::(ARRAY_LEN); + let local = lamellar::world.alloc_one_sided_mem_region::(ARRAY_LEN); let local_slice = unsafe { local.as_mut_slice().unwrap() }; - local_slice[ARRAY_LEN - 1] = num_pes as u16; + local_slice[ARRAY_LEN - 1] = num_pes as u8; unsafe { self.array.get(0, &local).await; } - // while local_slice[ARRAY_LEN - 1] == num_pes as u16 { + // while local_slice[ARRAY_LEN - 1] == num_pes as u8 { // async_std::task::yield_now().await; // } @@ -46,7 +46,7 @@ impl LamellarAM for RdmaAM { println!("\tcurrent view of remote segment on pe {:?}: {:?}..{:?}\n\tpe: {:?} updating index {:?} on pe {:?}", self.orig_pe, &local_slice[0..max_i], &local_slice[local_slice.len()-max_i..],lamellar::current_pe, my_index, self.orig_pe); //update an element on the original node - local_slice[0] = lamellar::current_pe as u16; + local_slice[0] = lamellar::current_pe as u8; unsafe { self.array.put(my_index, &local.sub_region(0..=0)).await; } @@ -65,23 +65,24 @@ fn main() { let my_pe = world.my_pe(); let num_pes = world.num_pes(); println!("creating array"); - let array = UnsafeArray::::new(world.team(), ARRAY_LEN, Distribution::Block); + let array = UnsafeArray::::new(world.team(), ARRAY_LEN, Distribution::Block); println!("creating memregion"); - let local_mem_region = world.alloc_one_sided_mem_region::(ARRAY_LEN); + let local_mem_region = world.alloc_one_sided_mem_region::(ARRAY_LEN); println!("about to initialize array"); array.print(); if my_pe == 0 { unsafe { for i in local_mem_region.as_mut_slice().unwrap() { - *i = 255_u16; + *i = 255_u8; } } world.block_on(unsafe { array.put(0, &local_mem_region) }); } + println!("here!!! 
{:?}", my_pe); array.print(); for i in unsafe { array.local_as_slice() } { - while *i != 255_u16 { + while *i != 255_u8 { std::thread::yield_now(); } } @@ -102,11 +103,13 @@ fn main() { world.barrier(); let mut index = 0; while index < ARRAY_LEN / num_pes { - let _ = world.exec_am_all(RdmaAM { - array: array.clone(), - orig_pe: my_pe, - index: index, - }).spawn(); + let _ = world + .exec_am_all(RdmaAM { + array: array.clone(), + orig_pe: my_pe, + index: index, + }) + .spawn(); index += 1; } diff --git a/src/active_messaging/handle.rs b/src/active_messaging/handle.rs index fafbf33e..bcec2722 100644 --- a/src/active_messaging/handle.rs +++ b/src/active_messaging/handle.rs @@ -67,6 +67,7 @@ impl LamellarRequestAddResult for AmHandleInner { /// A handle to an active messaging request that executes on a singe PE #[derive(Debug)] #[pin_project(PinnedDrop)] +#[must_use = "active messaging handles do nothing unless polled or awaited or 'spawn()' or 'block()' are called"] pub struct AmHandle { pub(crate) inner: Arc, pub(crate) _phantom: std::marker::PhantomData, @@ -188,6 +189,7 @@ impl Future for AmHandle { /// A handle to an active messaging request that executes on the local (originating) PE #[derive(Debug)] #[pin_project(PinnedDrop)] +#[must_use = "active messaging handles do nothing unless polled or awaited or 'spawn()' or 'block()' are called"] pub struct LocalAmHandle { pub(crate) inner: Arc, pub(crate) _phantom: std::marker::PhantomData, @@ -315,6 +317,7 @@ pub(crate) struct MultiAmHandleInner { /// A handle to an active messaging request that executes on multiple PEs, returned from a call to [exec_am_all][crate::ActiveMessaging::exec_am_all] #[derive(Debug)] #[pin_project(PinnedDrop)] +#[must_use = "active messaging handles do nothing unless polled or awaited or 'spawn()' or 'block()' are called"] pub struct MultiAmHandle { pub(crate) inner: Arc, pub(crate) _phantom: std::marker::PhantomData, diff --git a/src/array.rs b/src/array.rs index 35b19562..2813fe54 100644 --- a/src/array.rs +++ b/src/array.rs @@ -191,25 +191,25 @@ pub struct ReduceKey { crate::inventory::collect!(ReduceKey); // impl Dist for bool {} -// lamellar_impl::generate_reductions_for_type_rt!(true, u8, usize); -// lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, usize); -// lamellar_impl::generate_reductions_for_type_rt!(false, f32); -// lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32); -// lamellar_impl::generate_reductions_for_type_rt!(false, u128); -// lamellar_impl::generate_ops_for_type_rt!(true, false, true, u128); - -lamellar_impl::generate_reductions_for_type_rt!(true, u8, u16, u32, u64, usize); +lamellar_impl::generate_reductions_for_type_rt!(true, u8, usize); +lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, usize); +lamellar_impl::generate_reductions_for_type_rt!(false, f32); +lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32); lamellar_impl::generate_reductions_for_type_rt!(false, u128); -lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, u16, u32, u64, usize); lamellar_impl::generate_ops_for_type_rt!(true, false, true, u128); -lamellar_impl::generate_reductions_for_type_rt!(true, i8, i16, i32, i64, isize); -lamellar_impl::generate_reductions_for_type_rt!(false, i128); -lamellar_impl::generate_ops_for_type_rt!(true, true, true, i8, i16, i32, i64, isize); -lamellar_impl::generate_ops_for_type_rt!(true, false, true, i128); +// lamellar_impl::generate_reductions_for_type_rt!(true, u8, u16, u32, u64, usize); +// 
lamellar_impl::generate_reductions_for_type_rt!(false, u128); +// lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, u16, u32, u64, usize); +// lamellar_impl::generate_ops_for_type_rt!(true, false, true, u128); + +// lamellar_impl::generate_reductions_for_type_rt!(true, i8, i16, i32, i64, isize); +// lamellar_impl::generate_reductions_for_type_rt!(false, i128); +// lamellar_impl::generate_ops_for_type_rt!(true, true, true, i8, i16, i32, i64, isize); +// lamellar_impl::generate_ops_for_type_rt!(true, false, true, i128); -lamellar_impl::generate_reductions_for_type_rt!(false, f32, f64); -lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32, f64); +// lamellar_impl::generate_reductions_for_type_rt!(false, f32, f64); +// lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32, f64); lamellar_impl::generate_ops_for_bool_rt!(); diff --git a/src/array/iterator/distributed_iterator/consumer/count.rs b/src/array/iterator/distributed_iterator/consumer/count.rs index 196cc810..c8ee207b 100644 --- a/src/array/iterator/distributed_iterator/consumer/count.rs +++ b/src/array/iterator/distributed_iterator/consumer/count.rs @@ -115,10 +115,13 @@ impl InnerDistIterCountHandle { } fn reduce_remote_counts(&self, local_cnt: usize, cnt: Darc) -> usize { - self.team.exec_am_all(UpdateCntAm { - remote_cnt: local_cnt, - cnt: cnt.clone(), - }); + let _ = self + .team + .exec_am_all(UpdateCntAm { + remote_cnt: local_cnt, + cnt: cnt.clone(), + }) + .spawn(); self.team.wait_all(); self.team.tasking_barrier(); cnt.load(Ordering::SeqCst) diff --git a/src/barrier.rs b/src/barrier.rs index 8d16b106..8c30538d 100644 --- a/src/barrier.rs +++ b/src/barrier.rs @@ -341,98 +341,64 @@ impl Barrier { self.barrier_handle().await; } - // pub(crate) async fn async_barrier(&self) { - // let mut s = Instant::now(); - // if self.panic.load(Ordering::SeqCst) == 0 { - // if let Some(send_buf) = &self.send_buf { - // if let Ok(my_index) = self.arch.team_pe(self.my_pe) { - // let send_buf_slice = unsafe { - // // im the only thread (remote or local) that can write to this buff - // send_buf.as_mut_slice().expect("Data should exist on PE") - // }; + // pub(crate) async fn async_barrier(&self) { + // let mut s = Instant::now(); + // if self.panic.load(Ordering::SeqCst) == 0 { + // if let Some(send_buf) = &self.send_buf { + // if let Ok(my_index) = self.arch.team_pe(self.my_pe) { + // let send_buf_slice = unsafe { + // // im the only thread (remote or local) that can write to this buff + // send_buf.as_mut_slice().expect("Data should exist on PE") + // }; - // let barrier_id = self.barrier_cnt.fetch_add(1, Ordering::SeqCst); - // send_buf_slice[0] = barrier_id; - // let barrier_slice = &[barrier_id]; - // // println!( - // // "[{:?}] barrier_id = {:?}", - // // std::thread::current().id(), - // // barrier_id - // // ); + // let barrier_id = self.barrier_cnt.fetch_add(1, Ordering::SeqCst); + // send_buf_slice[0] = barrier_id; + // let barrier_slice = &[barrier_id]; - // for round in 0..self.num_rounds { - // for i in 1..=self.n { - // let team_send_pe = - // (my_index + i * (self.n + 1).pow(round as u32)) % self.num_pes; - // if team_send_pe != my_index { - // let send_pe = self.arch.single_iter(team_send_pe).next().unwrap(); - // // println!( - // // "[{:?}][ {:?} {:?}] round: {:?} i: {:?} sending to [{:?} ({:?}) ] id: {:?} buf {:?}", - // // std::thread::current().id(), - // // self.my_pe, - // // my_index, - // // round, - // // i, - // // send_pe, - // // team_send_pe, - // // send_buf_slice, - // // 
unsafe { - // // self.barrier_buf[i - 1] - // // .as_mut_slice() - // // .expect("Data should exist on PE") - // // } - // // ); - // // println!("barrier put_slice 2"); - // unsafe { - // self.barrier_buf[i - 1].put_slice( - // send_pe, - // round, - // barrier_slice, - // ); - // //safe as we are the only ones writing to our index - // } - // } - // } - // for i in 1..=self.n { - // let team_recv_pe = ((my_index as isize - // - (i as isize * (self.n as isize + 1).pow(round as u32) as isize)) - // as isize) - // .rem_euclid(self.num_pes as isize) - // as isize; - // let recv_pe = - // self.arch.single_iter(team_recv_pe as usize).next().unwrap(); - // if team_recv_pe as usize != my_index { - // // println!( - // // "[{:?}][{:?} ] recv from [{:?} ({:?}) ] id: {:?} buf {:?}", - // // std::thread::current().id(), - // // self.my_pe, - // // recv_pe, - // // team_recv_pe, - // // send_buf_slice, - // // unsafe { - // // self.barrier_buf[i - 1] - // // .as_mut_slice() - // // .expect("Data should exist on PE") - // // } - // // ); - // unsafe { - // //safe as each pe is only capable of writing to its own index - // while self.barrier_buf[i - 1] - // .as_mut_slice() - // .expect("Data should exist on PE")[round] - // < barrier_id - // { - // self.barrier_timeout( - // &mut s, - // my_index, + // for round in 0..self.num_rounds { + // for i in 1..=self.n { + // let team_send_pe = + // (my_index + i * (self.n + 1).pow(round as u32)) % self.num_pes; + // if team_send_pe != my_index { + // let send_pe = self.arch.single_iter(team_send_pe).next().unwrap(); + // unsafe { + // self.barrier_buf[i - 1].put_slice( + // send_pe, // round, - // i, - // team_recv_pe, - // recv_pe, - // send_buf_slice, + // barrier_slice, // ); - // self.lamellae.flush(); - // async_std::task::yield_now().await; + // //safe as we are the only ones writing to our index + // } + // } + // } + // for i in 1..=self.n { + // let team_recv_pe = ((my_index as isize + // - (i as isize * (self.n as isize + 1).pow(round as u32) as isize)) + // as isize) + // .rem_euclid(self.num_pes as isize) + // as isize; + // let recv_pe = + // self.arch.single_iter(team_recv_pe as usize).next().unwrap(); + // if team_recv_pe as usize != my_index { + // unsafe { + // //safe as each pe is only capable of writing to its own index + // while self.barrier_buf[i - 1] + // .as_mut_slice() + // .expect("Data should exist on PE")[round] + // < barrier_id + // { + // self.barrier_timeout( + // &mut s, + // my_index, + // round, + // i, + // team_recv_pe, + // recv_pe, + // send_buf_slice, + // ); + // self.lamellae.flush(); + // async_std::task::yield_now().await; + // } // } // } // } @@ -441,7 +407,6 @@ impl Barrier { // } // } // } - // } } // impl Drop for Barrier { @@ -473,6 +438,7 @@ enum State { impl BarrierHandle { fn do_send_round(&self, round: usize) { + // println!("do send round {:?}", round); let barrier_slice = &[self.barrier_id]; for i in 1..=self.n { let team_send_pe = (self.my_index + i * (self.n + 1).pow(round as u32)) % self.num_pes; @@ -487,6 +453,7 @@ impl BarrierHandle { } fn do_recv_round(&self, round: usize, recv_pe_index: usize) -> Option { + // println!("do recv round {:?}", round); for i in recv_pe_index..=self.n { let team_recv_pe = ((self.my_index as isize - (i as isize * (self.n as isize + 1).pow(round as u32) as isize)) @@ -502,7 +469,7 @@ impl BarrierHandle { < self.barrier_id { self.lamellae.flush(); - return Some(recv_pe); + return Some(i); } } } @@ -521,6 +488,7 @@ impl Future for BarrierHandle { while round < 
self.num_rounds { self.do_send_round(round); if let Some(recv_pe) = self.do_recv_round(round, 1) { + // println!("waiting for pe {:?}", recv_pe); *self.project().state = State::RoundInProgress(round, recv_pe); cx.waker().wake_by_ref(); return Poll::Pending; @@ -533,13 +501,16 @@ impl Future for BarrierHandle { State::RoundInProgress(round, recv_pe) => { let mut round = round; if let Some(recv_pe) = self.do_recv_round(round, recv_pe) { + // println!("waiting for pe {:?}", recv_pe); *self.project().state = State::RoundInProgress(round, recv_pe); cx.waker().wake_by_ref(); return Poll::Pending; } round += 1; while round < self.num_rounds { + self.do_send_round(round); if let Some(recv_pe) = self.do_recv_round(round, 1) { + // println!("waiting for pe {:?}", recv_pe); *self.project().state = State::RoundInProgress(round, recv_pe); cx.waker().wake_by_ref(); return Poll::Pending; diff --git a/src/darc.rs b/src/darc.rs index 1820974e..b4b36c0b 100644 --- a/src/darc.rs +++ b/src/darc.rs @@ -65,6 +65,7 @@ use crate::env_var::config; use crate::lamellae::{AllocationType, Backend, LamellaeComm, LamellaeRDMA}; use crate::lamellar_team::{IntoLamellarTeam, LamellarTeamRT}; use crate::lamellar_world::LAMELLAES; +use crate::scheduler::LamellarTask; use crate::{IdError, LamellarEnv, LamellarTeam}; /// prelude for the darc module @@ -401,7 +402,7 @@ impl DarcInner { unsafe { &(*self.item) } } - fn send_finished(&self) -> Vec> { + fn send_finished(&self) -> Vec> { let ref_cnts = unsafe { std::slice::from_raw_parts_mut(self.ref_cnt_addr as *mut AtomicUsize, self.num_pes) }; @@ -425,15 +426,18 @@ impl DarcInner { // my_addr // ); // println!("[{:?}] {:?}", std::thread::current().id(), self); - reqs.push(team.exec_am_pe_tg( - pe, - FinishedAm { - cnt: cnt, - src_pe: pe, - inner_addr: pe_addr, - }, - Some(self.am_counters()), - )); + reqs.push( + team.exec_am_pe_tg( + pe, + FinishedAm { + cnt: cnt, + src_pe: pe, + inner_addr: pe_addr, + }, + Some(self.am_counters()), + ) + .spawn(), + ); } } reqs @@ -526,7 +530,7 @@ impl DarcInner { let rdma = &team.lamellae; for pe in team.arch.team_iter() { // println!("darc block_on_outstanding put 3"); - rdma.put( + rdma.iput( pe, &mode_refs[inner.my_pe..=inner.my_pe], inner.mode_addr + inner.my_pe * std::mem::size_of::(), @@ -627,9 +631,18 @@ impl DarcInner { while inner.local_cnt.load(Ordering::SeqCst) > 1 + extra_cnt { async_std::task::yield_now().await; } + // println!("before send finished"); join_all(inner.send_finished()).await; + // println!("after send finished"); let barrier_fut = unsafe { inner.barrier.as_ref().unwrap().async_barrier() }; barrier_fut.await; + // println!("after barrier2"); + // println!( + // "[{:?}].0 barrier id = {:?} barrier_slice = {:?}", + // std::thread::current().id(), + // barrier_id, + // barrier_slice + // ); let mut old_ref_cnts = ref_cnts_slice.to_vec(); let old_local_cnt = inner.total_local_cnt.load(Ordering::SeqCst); @@ -669,6 +682,12 @@ impl DarcInner { rdma.flush(); let barrier_fut = unsafe { inner.barrier.as_ref().unwrap().async_barrier() }; barrier_fut.await; + // println!( + // "[{:?}].1 barrier id = {:?} barrier_slice = {:?}", + // std::thread::current().id(), + // barrier_id, + // barrier_slice + // ); outstanding_refs |= old_local_cnt != inner.total_local_cnt.load(Ordering::SeqCst); // if outstanding_refs { // println!( @@ -705,13 +724,38 @@ impl DarcInner { // old_ref_cnts // ); // } + // println!( + // "[{:?}].2 barrier id = {:?} barrier_slice = {:?}", + // std::thread::current().id(), + // barrier_id, + // barrier_slice + 
// ); if outstanding_refs { // println!("reseting barrier_id"); barrier_id = 0; } + // println!( + // "[{:?}].3 barrier id = {:?} barrier_slice = {:?}", + // std::thread::current().id(), + // barrier_id, + // barrier_slice + // ); rdma.flush(); let barrier_fut = unsafe { inner.barrier.as_ref().unwrap().async_barrier() }; barrier_fut.await; + // println!( + // "[{:?}].4 barrier id = {:?} barrier_slice = {:?}", + // std::thread::current().id(), + // barrier_id, + // barrier_slice + // ); + + let barrier_id_slice = unsafe { + std::slice::from_raw_parts_mut( + &mut barrier_id as *mut usize as *mut u8, + std::mem::size_of::(), + ) + }; for pe in 0..inner.num_pes { let send_pe = team.arch.single_iter(pe).next().unwrap(); @@ -719,12 +763,7 @@ impl DarcInner { // "[{:?}] {rel_addr:x} sending {barrier_id} ({barrier_id_slice:?}) to pe {pe} ", // std::thread::current().id(), // ); - let barrier_id_slice = unsafe { - std::slice::from_raw_parts_mut( - &mut barrier_id as *mut usize as *mut u8, - std::mem::size_of::(), - ) - }; + // println!("darc block_on_outstanding put 2"); rdma.iput( send_pe, @@ -734,15 +773,31 @@ impl DarcInner { } //maybe we need to change the above to a get? rdma.flush(); + // println!( + // "[{:?}].5 barrier id = {:?} barrier_slice = {:?}", + // std::thread::current().id(), + // barrier_id, + // barrier_slice + // ); let barrier_fut = unsafe { inner.barrier.as_ref().unwrap().async_barrier() }; barrier_fut.await; + // println!( + // "[{:?}].6 barrier id = {:?} barrier_slice = {:?}", + // std::thread::current().id(), + // barrier_id, + // barrier_slice + // ); for id in &*barrier_slice { outstanding_refs |= *id == 0; } // if outstanding_refs { // println!("[{:?}] {rel_addr:x} not all pes ready mode_refs: {mode_refs:?} prev_ref_cnts: {prev_ref_cnts:?} barrier_id: {barrier_id:?} barrier_id_slice: {barrier_id_slice:?} barrier_ref_cnt_slice: {barrier_ref_cnt_slice:?} // barrier_slice: {barrier_slice:?} ref_cnts_slice: {ref_cnts_slice:?} old_ref_cnts: {old_ref_cnts:?} old_local_cnt: {old_local_cnt:?} local_cnt: {:?} old_dist_cnt: {old_dist_cnt:?} dist_cnt: {:?} - // dist_cnts_changed: {dist_cnts_changed:?} barrier_sum: {barrier_sum:?} old_barrier_id: {old_barrier_id:?} ", std::thread::current().id(),inner.total_local_cnt.load(Ordering::SeqCst), inner.total_dist_cnt.load(Ordering::SeqCst)); + // barrier_sum: {barrier_sum:?} old_barrier_id: {old_barrier_id:?} ", std::thread::current().id(),inner.total_local_cnt.load(Ordering::SeqCst), inner.total_dist_cnt.load(Ordering::SeqCst)); + // } else { + // println!("[{:?}] {rel_addr:x} i think all pes ready! 
mode_refs: {mode_refs:?} prev_ref_cnts: {prev_ref_cnts:?} barrier_id: {barrier_id:?} barrier_id_slice: {barrier_id_slice:?} barrier_ref_cnt_slice: {barrier_ref_cnt_slice:?} + // barrier_slice: {barrier_slice:?} ref_cnts_slice: {ref_cnts_slice:?} old_ref_cnts: {old_ref_cnts:?} old_local_cnt: {old_local_cnt:?} local_cnt: {:?} old_dist_cnt: {old_dist_cnt:?} dist_cnt: {:?} + // barrier_sum: {barrier_sum:?} old_barrier_id: {old_barrier_id:?} ", std::thread::current().id(),inner.total_local_cnt.load(Ordering::SeqCst), inner.total_dist_cnt.load(Ordering::SeqCst)); // } // if dist_cnts_changed || !outstanding_refs { // println!("[{:?}] {rel_addr:x} mode_refs: {mode_refs:?} prev_ref_cnts: {prev_ref_cnts:?} barrier_id: {barrier_id:?} barrier_id_slice: {barrier_id_slice:?} barrier_ref_cnt_slice: {barrier_ref_cnt_slice:?} @@ -760,13 +815,31 @@ impl DarcInner { // async_std::task::yield_now().await; // } prev_ref_cnts = old_ref_cnts; + // println!( + // "[{:?}].7 barrier id = {:?} barrier_slice = {:?}", + // std::thread::current().id(), + // barrier_id, + // barrier_slice + // ); let barrier_fut = unsafe { inner.barrier.as_ref().unwrap().async_barrier() }; barrier_fut.await; + // println!( + // "[{:?}].8 barrier id = {:?} barrier_slice = {:?}", + // std::thread::current().id(), + // barrier_id, + // barrier_slice + // ); } // println!( - // "[{:?}] {rel_addr:x} all outstanding refs are resolved", + // "[{:?}] all outstanding refs are resolved", // std::thread::current().id() // ); + // println!( + // "[{:?}].9 barrier id = {:?} barrier_slice = {:?}", + // std::thread::current().id(), + // barrier_id, + // barrier_slice + // ); // inner.debug_print(); // println!("[{:?}] {:?}", std::thread::current().id(), inner); Self::broadcast_state(inner.clone(), team.clone(), mode_refs, state as u8); @@ -1591,14 +1664,16 @@ macro_rules! 
launch_drop { ); } // team.print_cnt(); - team.exec_am_local(DroppedWaitAM { - inner_addr: $inner_addr as *const u8 as usize, - mode_addr: $inner.mode_addr, - my_pe: $inner.my_pe, - num_pes: $inner.num_pes, - team: team.clone(), - phantom: PhantomData::, - }); + let _ = team + .exec_am_local(DroppedWaitAM { + inner_addr: $inner_addr as *const u8 as usize, + mode_addr: $inner.mode_addr, + my_pe: $inner.my_pe, + num_pes: $inner.num_pes, + team: team.clone(), + phantom: PhantomData::, + }) + .spawn(); }; } @@ -1620,6 +1695,9 @@ impl Drop for Darc { if pe_ref_cnts.iter().any(|&x| x > 0) { //if we have received and accesses from remote pes, send we are finished inner.send_finished(); + // .into_iter().for_each(|x| { + // let _ = x.spawn(); + // }); } } // println!("in drop"); @@ -1768,6 +1846,7 @@ impl LamellarAM for DroppedWaitAM { } } } + // println!("after DarcMode::Dropped"); // let inner =self.inner_addr as *mut DarcInner; let wrapped = WrappedInner { inner: NonNull::new(self.inner_addr as *mut DarcInner) diff --git a/src/darc/global_rw_darc.rs b/src/darc/global_rw_darc.rs index f483b21d..7225333a 100644 --- a/src/darc/global_rw_darc.rs +++ b/src/darc/global_rw_darc.rs @@ -288,15 +288,17 @@ impl Drop for GlobalRwDarcReadGuard { 0, inner as *const DarcInner> as *const () as usize, ); - team.exec_am_pe_tg( - 0, - UnlockAm { - rwlock_addr: remote_rwlock_addr, - orig_pe: team.team_pe.expect("darcs cant exist on non team members"), - lock_type: LockType::Read, - }, - Some(inner.am_counters()), - ); + let _ = team + .exec_am_pe_tg( + 0, + UnlockAm { + rwlock_addr: remote_rwlock_addr, + orig_pe: team.team_pe.expect("darcs cant exist on non team members"), + lock_type: LockType::Read, + }, + Some(inner.am_counters()), + ) + .spawn(); } } } @@ -337,15 +339,17 @@ impl Drop for GlobalRwDarcWriteGuard { 0, inner as *const DarcInner> as *const () as usize, ); - team.exec_am_pe_tg( - 0, - UnlockAm { - rwlock_addr: remote_rwlock_addr, - orig_pe: team.team_pe.expect("darcs cant exist on non team members"), - lock_type: LockType::Write, - }, - Some(inner.am_counters()), - ); + let _ = team + .exec_am_pe_tg( + 0, + UnlockAm { + rwlock_addr: remote_rwlock_addr, + orig_pe: team.team_pe.expect("darcs cant exist on non team members"), + lock_type: LockType::Write, + }, + Some(inner.am_counters()), + ) + .spawn(); } } @@ -385,15 +389,17 @@ impl Drop for GlobalRwDarcCollectiveWriteGuard { 0, inner as *const DarcInner> as *const () as usize, ); - team.exec_am_pe_tg( - 0, - UnlockAm { - rwlock_addr: remote_rwlock_addr, - orig_pe: team.team_pe.expect("darcs cant exist on non team members"), - lock_type: LockType::CollectiveWrite(self.collective_cnt), - }, - Some(inner.am_counters()), - ); + let _ = team + .exec_am_pe_tg( + 0, + UnlockAm { + rwlock_addr: remote_rwlock_addr, + orig_pe: team.team_pe.expect("darcs cant exist on non team members"), + lock_type: LockType::CollectiveWrite(self.collective_cnt), + }, + Some(inner.am_counters()), + ) + .spawn(); } } diff --git a/src/lamellar_team.rs b/src/lamellar_team.rs index a50be34a..920e70c1 100644 --- a/src/lamellar_team.rs +++ b/src/lamellar_team.rs @@ -2142,7 +2142,7 @@ impl LamellarTeamRT { impl Drop for LamellarTeamRT { //#[tracing::instrument(skip_all)] fn drop(&mut self) { - println!("LamellarTeamRT Drop"); + // println!("LamellarTeamRT Drop"); // println!("sechduler_new: {:?}", Arc::strong_count(&self.scheduler)); // println!("lamellae: {:?}", Arc::strong_count(&self.lamellae)); // println!("arch: {:?}", Arc::strong_count(&self.arch)); diff --git 
a/src/memregion/one_sided.rs b/src/memregion/one_sided.rs index 4890d0ea..a299ed05 100644 --- a/src/memregion/one_sided.rs +++ b/src/memregion/one_sided.rs @@ -215,14 +215,22 @@ impl Drop for MemRegionHandle { parent_id: self.inner.grand_parent_id, }; // println!("sending finished am {:?} pe: {:?}",temp, self.inner.parent_id.1); - self.inner.team.exec_am_pe(self.inner.parent_id.1, temp); + let _ = self + .inner + .team + .exec_am_pe(self.inner.parent_id.1, temp) + .spawn(); } } } else { //need to wait for references I sent to return - self.inner.team.exec_am_local(MemRegionDropWaitAm { - inner: self.inner.clone(), - }); + let _ = self + .inner + .team + .exec_am_local(MemRegionDropWaitAm { + inner: self.inner.clone(), + }) + .spawn(); } } } @@ -285,7 +293,11 @@ impl LamellarAM for MemRegionDropWaitAm { parent_id: self.inner.grand_parent_id, }; // println!("waited sending finished am {:?} pe: {:?}",temp, self.inner.parent_id.1); - self.inner.team.exec_am_pe(self.inner.parent_id.1, temp); + let _ = self + .inner + .team + .exec_am_pe(self.inner.parent_id.1, temp) + .spawn(); } } break; From b680d554efd8b2fdf91d0727a93b3f60ec51900e Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Sun, 28 Jul 2024 16:33:46 -0700 Subject: [PATCH 063/116] fix barrier_handle state machine indexing --- src/array.rs | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/array.rs b/src/array.rs index 2813fe54..35b19562 100644 --- a/src/array.rs +++ b/src/array.rs @@ -191,25 +191,25 @@ pub struct ReduceKey { crate::inventory::collect!(ReduceKey); // impl Dist for bool {} -lamellar_impl::generate_reductions_for_type_rt!(true, u8, usize); -lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, usize); -lamellar_impl::generate_reductions_for_type_rt!(false, f32); -lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32); -lamellar_impl::generate_reductions_for_type_rt!(false, u128); -lamellar_impl::generate_ops_for_type_rt!(true, false, true, u128); - -// lamellar_impl::generate_reductions_for_type_rt!(true, u8, u16, u32, u64, usize); +// lamellar_impl::generate_reductions_for_type_rt!(true, u8, usize); +// lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, usize); +// lamellar_impl::generate_reductions_for_type_rt!(false, f32); +// lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32); // lamellar_impl::generate_reductions_for_type_rt!(false, u128); -// lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, u16, u32, u64, usize); // lamellar_impl::generate_ops_for_type_rt!(true, false, true, u128); -// lamellar_impl::generate_reductions_for_type_rt!(true, i8, i16, i32, i64, isize); -// lamellar_impl::generate_reductions_for_type_rt!(false, i128); -// lamellar_impl::generate_ops_for_type_rt!(true, true, true, i8, i16, i32, i64, isize); -// lamellar_impl::generate_ops_for_type_rt!(true, false, true, i128); +lamellar_impl::generate_reductions_for_type_rt!(true, u8, u16, u32, u64, usize); +lamellar_impl::generate_reductions_for_type_rt!(false, u128); +lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, u16, u32, u64, usize); +lamellar_impl::generate_ops_for_type_rt!(true, false, true, u128); + +lamellar_impl::generate_reductions_for_type_rt!(true, i8, i16, i32, i64, isize); +lamellar_impl::generate_reductions_for_type_rt!(false, i128); +lamellar_impl::generate_ops_for_type_rt!(true, true, true, i8, i16, i32, i64, isize); +lamellar_impl::generate_ops_for_type_rt!(true, false, true, i128); -// 
lamellar_impl::generate_reductions_for_type_rt!(false, f32, f64); -// lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32, f64); +lamellar_impl::generate_reductions_for_type_rt!(false, f32, f64); +lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32, f64); lamellar_impl::generate_ops_for_bool_rt!(); From 28cf0f93a4e58515f7699744905ebb39479ad811 Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Mon, 29 Jul 2024 11:38:57 -0700 Subject: [PATCH 064/116] update iterator consumers to properly trigger task counts so wait all works correctly --- .../array_consumer_schedules.rs | 9 ++++++- src/array/unsafe/iteration/distributed.rs | 27 ++++++++++++++----- src/barrier.rs | 6 ++++- src/darc.rs | 2 +- src/scheduler.rs | 3 +-- 5 files changed, 35 insertions(+), 12 deletions(-) diff --git a/examples/array_examples/array_consumer_schedules.rs b/examples/array_examples/array_consumer_schedules.rs index 23040adb..f54b9890 100644 --- a/examples/array_examples/array_consumer_schedules.rs +++ b/examples/array_examples/array_consumer_schedules.rs @@ -107,15 +107,22 @@ fn sum_with_schedule( fn main() { let world = lamellar::LamellarWorldBuilder::new().build(); + println!("world created"); let _my_pe = world.my_pe(); let _num_pes = world.num_pes(); let block_array = AtomicArray::::new(world.team(), ARRAY_LEN, Distribution::Block); + println!("array created"); + block_array.print(); let _ = block_array .dist_iter_mut() .enumerate() - .for_each(move |(i, e)| e.store(i)) + .for_each(move |(i, e)| { + println!("setting {i} to {i}"); + e.store(i) + }) .spawn(); world.wait_all(); + println!("Done"); block_array.print(); let thread_cnts: Arc>> = Arc::new(Mutex::new(HashMap::new())); diff --git a/src/array/unsafe/iteration/distributed.rs b/src/array/unsafe/iteration/distributed.rs index 6f527564..ad9c3cd8 100644 --- a/src/array/unsafe/iteration/distributed.rs +++ b/src/array/unsafe/iteration/distributed.rs @@ -11,6 +11,7 @@ use core::marker::PhantomData; use futures_util::Future; use paste::paste; use std::pin::Pin; +use std::sync::atomic::Ordering; use std::sync::Arc; impl InnerArray for UnsafeArray { @@ -48,15 +49,27 @@ macro_rules! consumer_impl { $($bounds)+ { let am = $($am)*; + // set req counters so that wait all works + self.data.team.team_counters.add_send_req(1); + self.data.team.world_counters.add_send_req(1); + self.data.task_group.counters.add_send_req(1); + let barrier = self.barrier_handle(); let inner = self.clone(); - let reqs_future = Box::pin(async move{match sched { - Schedule::Static => inner.sched_static(am), - Schedule::Dynamic => inner.sched_dynamic(am), - Schedule::Chunk(size) => inner.sched_chunk(am,size), - Schedule::Guided => inner.sched_guided(am), - Schedule::WorkStealing => inner.sched_work_stealing(am), - }}); + let reqs_future = Box::pin(async move{ + let reqs = match sched { + Schedule::Static => inner.sched_static(am), + Schedule::Dynamic => inner.sched_dynamic(am), + Schedule::Chunk(size) => inner.sched_chunk(am,size), + Schedule::Guided => inner.sched_guided(am), + Schedule::WorkStealing => inner.sched_work_stealing(am), + }; + // remove req counters after individual ams have been launched. 
+ inner.data.team.team_counters.outstanding_reqs.fetch_sub(1,Ordering::SeqCst); + inner.data.team.world_counters.outstanding_reqs.fetch_sub(1,Ordering::SeqCst); + inner.data.task_group.counters.outstanding_reqs.fetch_sub(1,Ordering::SeqCst); + reqs + }); $return_type::new(barrier,reqs_future,self) } diff --git a/src/barrier.rs b/src/barrier.rs index 8c30538d..b1e95b4e 100644 --- a/src/barrier.rs +++ b/src/barrier.rs @@ -297,6 +297,7 @@ impl Barrier { // but for the case of Darcs, or any case where the barrier is being called in a worker thread // we actually want to be able to process other tasks while the barrier is active pub(crate) fn tasking_barrier(&self) { + // println!("calling tasking barrier"); self.barrier_internal(|| { self.scheduler.exec_task(); }); @@ -314,10 +315,13 @@ impl Barrier { n: self.n, state: State::RoundInit(self.num_rounds), }; + // println!("in barrier handle"); + // self.print_bar(); if self.panic.load(Ordering::SeqCst) == 0 { if let Some(_) = &self.send_buf { if let Ok(my_index) = self.arch.team_pe(self.my_pe) { let barrier_id = self.barrier_cnt.fetch_add(1, Ordering::SeqCst); + // println!("barrier id: {:?}", barrier_id); handle.barrier_id = barrier_id; handle.my_index = my_index; handle.state = State::RoundInit(0); @@ -459,7 +463,7 @@ impl BarrierHandle { - (i as isize * (self.n as isize + 1).pow(round as u32) as isize)) as isize) .rem_euclid(self.num_pes as isize) as isize; - let recv_pe = self.arch.single_iter(team_recv_pe as usize).next().unwrap(); + // let recv_pe = self.arch.single_iter(team_recv_pe as usize).next().unwrap(); if team_recv_pe as usize != self.my_index { unsafe { //safe as each pe is only capable of writing to its own index diff --git a/src/darc.rs b/src/darc.rs index b4b36c0b..5efcbed7 100644 --- a/src/darc.rs +++ b/src/darc.rs @@ -59,7 +59,7 @@ use std::sync::Arc; // //use tracing::*; -use crate::active_messaging::{AMCounters, AmHandle, RemotePtr}; +use crate::active_messaging::{AMCounters, RemotePtr}; use crate::barrier::Barrier; use crate::env_var::config; use crate::lamellae::{AllocationType, Backend, LamellaeComm, LamellaeRDMA}; diff --git a/src/scheduler.rs b/src/scheduler.rs index e3983cd3..fbc1cd36 100755 --- a/src/scheduler.rs +++ b/src/scheduler.rs @@ -542,13 +542,12 @@ impl Scheduler { while self.panic.load(Ordering::SeqCst) == 0 && self.num_tasks.load(Ordering::Relaxed) > 3 && self.num_ams.load(Ordering::Relaxed) > 0 - {} //TODO maybe this should be > 2 { //the Lamellae Comm Task, Lamellae Alloc Task, Lamellar Error Task if timer.elapsed().as_secs_f64() > config().deadlock_timeout { println!( - "shurtdown timeout, tasks remaining: {:?} panic: {:?}", + "shutdown timeout, tasks remaining: {:?} panic: {:?}", self.num_tasks.load(Ordering::Relaxed), self.panic.load(Ordering::SeqCst), ); From acd2bd33425fc13fb9aa28d7b644380cc36b8d27 Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Tue, 30 Jul 2024 17:00:07 -0700 Subject: [PATCH 065/116] enforce ordered barriers --- examples/kernels/dft_proxy.rs | 2 + .../safe_parallel_blocked_array_gemm.rs | 4 +- src/active_messaging/handle.rs | 4 +- src/array.rs | 1 + src/array/global_lock_atomic/iteration.rs | 2 +- .../distributed_iterator/consumer/for_each.rs | 51 +- src/array/local_lock_atomic/iteration.rs | 2 +- src/array/unsafe/iteration/distributed.rs | 6 + src/barrier.rs | 94 +++- src/lamellar_request.rs | 519 +----------------- src/lamellar_task_group.rs | 30 +- src/scheduler.rs | 11 + 12 files changed, 187 insertions(+), 539 deletions(-) diff --git a/examples/kernels/dft_proxy.rs b/examples/kernels/dft_proxy.rs index e6b56040..4f98e80d 100644 --- a/examples/kernels/dft_proxy.rs +++ b/examples/kernels/dft_proxy.rs @@ -842,7 +842,9 @@ fn main() { .for_each(|elem| *elem = 0.0) .block(); } + println!("here 0"); full_spectrum_array.wait_all(); + println!("here 1"); full_spectrum_array.barrier(); times[ti].push(dft_lamellar_array_opt_test( full_signal_array.clone(), diff --git a/examples/kernels/safe_parallel_blocked_array_gemm.rs b/examples/kernels/safe_parallel_blocked_array_gemm.rs index 29b5bdf1..e8847bc3 100644 --- a/examples/kernels/safe_parallel_blocked_array_gemm.rs +++ b/examples/kernels/safe_parallel_blocked_array_gemm.rs @@ -36,7 +36,7 @@ fn main() { let a_init = a .dist_iter_mut() .enumerate() - .for_each(|(i, x)| *x = i as f32); + .for_each(move |(i, x)| *x = i as f32); let b_init = b.dist_iter_mut().enumerate().for_each(move |(i, x)| { //identity matrix let row = i / dim; @@ -47,7 +47,7 @@ fn main() { *x = 0 as f32; } }); - let c_init = c.dist_iter_mut().for_each(|x| *x = 0.0); + let c_init = c.dist_iter_mut().for_each(move |x| *x = 0.0); world.block_on_all([a_init, b_init, c_init]); let a = a.into_read_only(); let b = b.into_read_only(); diff --git a/src/active_messaging/handle.rs b/src/active_messaging/handle.rs index bcec2722..c4797ccc 100644 --- a/src/active_messaging/handle.rs +++ b/src/active_messaging/handle.rs @@ -55,7 +55,7 @@ impl LamellarRequestAddResult for AmHandleInner { waker.wake(); } } - fn update_counters(&self) { + fn update_counters(&self, _sub_id: usize) { let _team_reqs = self.team_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); let _world_req = self.world_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); if let Some(tg_outstanding_reqs) = self.tg_outstanding_reqs.clone() { @@ -344,7 +344,7 @@ impl LamellarRequestAddResult for MultiAmHandleInner { } } } - fn update_counters(&self) { + fn update_counters(&self, _sub_id: usize) { let _team_reqs = self.team_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); let _world_req = self.world_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); if let Some(tg_outstanding_reqs) = self.tg_outstanding_reqs.clone() { diff --git a/src/array.rs b/src/array.rs index 35b19562..8add8161 100644 --- a/src/array.rs +++ b/src/array.rs @@ -193,6 +193,7 @@ crate::inventory::collect!(ReduceKey); // impl Dist for bool {} // lamellar_impl::generate_reductions_for_type_rt!(true, u8, usize); // lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, usize); + // lamellar_impl::generate_reductions_for_type_rt!(false, f32); // lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32); // lamellar_impl::generate_reductions_for_type_rt!(false, u128); diff --git a/src/array/global_lock_atomic/iteration.rs b/src/array/global_lock_atomic/iteration.rs index 193984f6..a10d2376 100644 --- a/src/array/global_lock_atomic/iteration.rs +++ 
b/src/array/global_lock_atomic/iteration.rs @@ -389,7 +389,7 @@ impl LamellarArrayMutIterators for GlobalLockArray { self.array .block_on(async move { lock.collective_write().await }), ); - self.barrier(); + // self.barrier(); // println!("dist_iter thread {:?} got lock",std::thread::current().id()); GlobalLockDistIterMut { data: self.clone(), diff --git a/src/array/iterator/distributed_iterator/consumer/for_each.rs b/src/array/iterator/distributed_iterator/consumer/for_each.rs index b893e82a..17a2b6e8 100644 --- a/src/array/iterator/distributed_iterator/consumer/for_each.rs +++ b/src/array/iterator/distributed_iterator/consumer/for_each.rs @@ -226,6 +226,14 @@ impl DistIterForEachHandle { /// This function returns a handle that can be used to wait for the operation to complete #[must_use = "this function returns a future used to poll for completion and retrieve the result. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] pub fn spawn(self) -> LamellarTask<()> { + // match self.state { + // State::Barrier(ref barrier, _) => { + // println!("spawning task barrier id {:?}", barrier.barrier_id); + // } + // State::Reqs(_, barrier_id) => { + // println!("spawning task not sure I can be here {:?}", barrier_id); + // } + // } self.team.clone().scheduler.spawn_task(self) } } @@ -236,7 +244,7 @@ enum State { #[pin] BarrierHandle, Pin + Send>>, ), - Reqs(#[pin] InnerDistIterForEachHandle), + Reqs(#[pin] InnerDistIterForEachHandle, usize), } impl Future for DistIterForEachHandle { @@ -245,19 +253,42 @@ impl Future for DistIterForEachHandle { let mut this = self.project(); match this.state.as_mut().project() { StateProj::Barrier(barrier, inner) => { + let barrier_id = barrier.barrier_id; + // println!("in task barrier {:?}", barrier_id); ready!(barrier.poll(cx)); - let mut inner = ready!(Future::poll(inner.as_mut(), cx)); + // println!("past barrier {:?}", barrier_id); + let mut inner: InnerDistIterForEachHandle = + ready!(Future::poll(inner.as_mut(), cx)); + match Pin::new(&mut inner).poll(cx) { - Poll::Ready(()) => Poll::Ready(()), + Poll::Ready(()) => { + // println!("past reqs barrier_id {:?}", barrier_id); + Poll::Ready(()) + } Poll::Pending => { - *this.state = State::Reqs(inner); + // println!( + // "reqs remaining {:?} barrier_id {:?}", + // inner.reqs.len(), + // barrier_id + // ); + *this.state = State::Reqs(inner, barrier_id); Poll::Pending } } } - StateProj::Reqs(inner) => { - ready!(inner.poll(cx)); - Poll::Ready(()) + StateProj::Reqs(inner, barrier_id) => { + // println!( + // "reqs remaining {:?} barrier_id {:?}", + // inner.reqs.len(), + // barrier_id + // ); + match inner.poll(cx) { + Poll::Ready(()) => { + // println!("past reqs barrier_id {:?}", barrier_id); + Poll::Ready(()) + } + Poll::Pending => Poll::Pending, + } } } } @@ -271,7 +302,7 @@ impl LamellarRequest for DistIterForEachHandle { barrier.blocking_wait(); self.team.block_on(reqs).blocking_wait(); } - State::Reqs(inner) => { + State::Reqs(inner, _) => { inner.blocking_wait(); } } @@ -285,7 +316,7 @@ impl LamellarRequest for DistIterForEachHandle { waker.wake_by_ref(); false } - State::Reqs(inner) => inner.ready_or_set_waker(waker), + State::Reqs(inner, _) => inner.ready_or_set_waker(waker), } } fn val(&self) -> Self::Output { @@ -293,7 +324,7 @@ impl LamellarRequest for DistIterForEachHandle { State::Barrier(_barrier, 
_reqs) => { unreachable!("should never be in barrier state when val is called"); } - State::Reqs(inner) => inner.val(), + State::Reqs(inner, _) => inner.val(), } } } diff --git a/src/array/local_lock_atomic/iteration.rs b/src/array/local_lock_atomic/iteration.rs index 1ad958f5..6e0c6bbf 100644 --- a/src/array/local_lock_atomic/iteration.rs +++ b/src/array/local_lock_atomic/iteration.rs @@ -399,7 +399,7 @@ impl LamellarArrayMutIterators for LocalLockArray { fn dist_iter_mut(&self) -> Self::DistIter { let lock: LocalRwDarc<()> = self.lock.clone(); let lock = Arc::new(self.array.block_on(async move { lock.write().await })); - self.barrier(); + // self.barrier(); // println!("dist_iter thread {:?} got lock",std::thread::current().id()); LocalLockDistIterMut { data: self.clone(), diff --git a/src/array/unsafe/iteration/distributed.rs b/src/array/unsafe/iteration/distributed.rs index ad9c3cd8..69d4df74 100644 --- a/src/array/unsafe/iteration/distributed.rs +++ b/src/array/unsafe/iteration/distributed.rs @@ -54,9 +54,14 @@ macro_rules! consumer_impl { self.data.team.world_counters.add_send_req(1); self.data.task_group.counters.add_send_req(1); + // self.data.team.scheduler.print_status(); let barrier = self.barrier_handle(); + // let barrier_id = barrier.barrier_id; + // println!("barrier_id {:?} creating dist iter handle",barrier_id); let inner = self.clone(); let reqs_future = Box::pin(async move{ + + // println!("barrier id {:?} entering dist iter sched {:?} {:?} {:?}",barrier_id, inner.data.team.team_counters.outstanding_reqs.load(Ordering::SeqCst), inner.data.team.world_counters.outstanding_reqs.load(Ordering::SeqCst), inner.data.task_group.counters.outstanding_reqs.load(Ordering::SeqCst)); let reqs = match sched { Schedule::Static => inner.sched_static(am), Schedule::Dynamic => inner.sched_dynamic(am), @@ -68,6 +73,7 @@ macro_rules! 
consumer_impl { inner.data.team.team_counters.outstanding_reqs.fetch_sub(1,Ordering::SeqCst); inner.data.team.world_counters.outstanding_reqs.fetch_sub(1,Ordering::SeqCst); inner.data.task_group.counters.outstanding_reqs.fetch_sub(1,Ordering::SeqCst); + // println!("barrier id {:?} done with dist iter sched {:?} {:?} {:?}",barrier_id,inner.data.team.team_counters.outstanding_reqs.load(Ordering::SeqCst), inner.data.team.world_counters.outstanding_reqs.load(Ordering::SeqCst), inner.data.task_group.counters.outstanding_reqs.load(Ordering::SeqCst)); reqs }); $return_type::new(barrier,reqs_future,self) diff --git a/src/barrier.rs b/src/barrier.rs index b1e95b4e..c73f04e1 100644 --- a/src/barrier.rs +++ b/src/barrier.rs @@ -22,6 +22,7 @@ pub(crate) struct Barrier { pub(crate) scheduler: Arc, lamellae: Arc, barrier_cnt: AtomicUsize, + cur_barrier_id: Arc, barrier_buf: Arc>>, send_buf: Option>, panic: Arc, @@ -93,6 +94,7 @@ impl Barrier { scheduler, lamellae, barrier_cnt: AtomicUsize::new(1), + cur_barrier_id: Arc::new(AtomicUsize::new(1)), barrier_buf: Arc::new(buffs), send_buf, panic, @@ -186,6 +188,12 @@ impl Barrier { // std::thread::current().id(), // barrier_id // ); + while barrier_id > self.cur_barrier_id.load(Ordering::SeqCst) { + wait_func(); + if s.elapsed().as_secs_f64() > config().deadlock_timeout { + break; + } + } for round in 0..self.num_rounds { for i in 1..=self.n { @@ -265,10 +273,12 @@ impl Barrier { } } } + self.cur_barrier_id.store(barrier_id + 1, Ordering::SeqCst); + // println!("leaving barrier {:?}", barrier_id); } } } - // println!("leaving barrier"); + // self.print_bar(); // self.lamellae.flush(); } @@ -311,6 +321,7 @@ impl Barrier { my_index: 0, num_pes: self.num_pes, barrier_id: 0, + cur_barrier_id: self.cur_barrier_id.clone(), num_rounds: self.num_rounds, n: self.n, state: State::RoundInit(self.num_rounds), @@ -324,6 +335,15 @@ impl Barrier { // println!("barrier id: {:?}", barrier_id); handle.barrier_id = barrier_id; handle.my_index = my_index; + + if barrier_id > self.cur_barrier_id.load(Ordering::SeqCst) { + handle.state = State::Waiting; + return handle; + } + // else if barrier_id < self.cur_barrier_id.load(Ordering::SeqCst) { + // println!("should this happen>?"); + // } + handle.state = State::RoundInit(0); let mut round = 0; while round < self.num_rounds { @@ -334,6 +354,7 @@ impl Barrier { } round += 1; } + self.cur_barrier_id.store(barrier_id + 1, Ordering::SeqCst); handle.state = State::RoundInit(self.num_rounds); } } @@ -429,13 +450,15 @@ pub struct BarrierHandle { lamellae: Arc, my_index: usize, num_pes: usize, - barrier_id: usize, + pub(crate) barrier_id: usize, + cur_barrier_id: Arc, num_rounds: usize, n: usize, state: State, } enum State { + Waiting, RoundInit(usize), //the round we are in RoundInProgress(usize, usize), //the round we are in, pe we are waiting to hear from } @@ -487,6 +510,18 @@ impl Future for BarrierHandle { fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { // let mut this = self.project(); match self.state { + State::Waiting => { + if self.barrier_id > self.cur_barrier_id.load(Ordering::SeqCst) { + cx.waker().wake_by_ref(); + return Poll::Pending; + } + // else if self.barrier_id < self.cur_barrier_id.load(Ordering::SeqCst) { + // println!("barrier id is less than cur barrier id"); + // } + *self.project().state = State::RoundInit(0); + cx.waker().wake_by_ref(); + Poll::Pending + } State::RoundInit(round) => { let mut round = round; while round < self.num_rounds { @@ -499,7 +534,10 @@ impl Future for BarrierHandle { } 
round += 1; } + self.cur_barrier_id + .store(self.barrier_id + 1, Ordering::SeqCst); *self.project().state = State::RoundInit(round); + Poll::Ready(()) } State::RoundInProgress(round, recv_pe) => { @@ -521,7 +559,10 @@ impl Future for BarrierHandle { } round += 1; } + self.cur_barrier_id + .store(self.barrier_id + 1, Ordering::SeqCst); *self.project().state = State::RoundInit(round); + Poll::Ready(()) } } @@ -531,6 +572,26 @@ impl Future for BarrierHandle { impl LamellarRequest for BarrierHandle { fn blocking_wait(self) -> Self::Output { match self.state { + State::Waiting => { + while self.barrier_id > self.cur_barrier_id.load(Ordering::SeqCst) { + std::thread::yield_now(); + } + if self.barrier_id < self.cur_barrier_id.load(Ordering::SeqCst) { + println!("barrier id is less than cur barrier id"); + } + let mut round = 0; + while round < self.num_rounds { + self.do_send_round(round); + let mut recv_pe_index = 1; + while let Some(recv_pe) = self.do_recv_round(round, recv_pe_index) { + recv_pe_index = recv_pe; + std::thread::yield_now(); + } + round += 1; + } + self.cur_barrier_id + .store(self.barrier_id + 1, Ordering::SeqCst); + } State::RoundInit(round) => { let mut round = round; while round < self.num_rounds { @@ -542,6 +603,8 @@ impl LamellarRequest for BarrierHandle { } round += 1; } + self.cur_barrier_id + .store(self.barrier_id + 1, Ordering::SeqCst); } State::RoundInProgress(round, recv_pe) => { let mut round = round; @@ -559,12 +622,15 @@ impl LamellarRequest for BarrierHandle { } round += 1; } + self.cur_barrier_id + .store(self.barrier_id + 1, Ordering::SeqCst); } } } fn ready_or_set_waker(&mut self, _waker: &Waker) -> bool { match self.state { + State::Waiting => false, State::RoundInit(round) => { if round < self.num_rounds { false @@ -584,6 +650,26 @@ impl LamellarRequest for BarrierHandle { fn val(&self) -> Self::Output { match self.state { + State::Waiting => { + while self.barrier_id > self.cur_barrier_id.load(Ordering::SeqCst) { + std::thread::yield_now(); + } + // if self.barrier_id < self.cur_barrier_id.load(Ordering::SeqCst) { + // println!("barrier id is less than cur barrier id"); + // } + let mut round = 0; + while round < self.num_rounds { + self.do_send_round(round); + let mut recv_pe_index = 1; + while let Some(recv_pe) = self.do_recv_round(round, recv_pe_index) { + recv_pe_index = recv_pe; + std::thread::yield_now(); + } + round += 1; + } + self.cur_barrier_id + .store(self.barrier_id + 1, Ordering::SeqCst); + } State::RoundInit(round) => { let mut round = round; while round < self.num_rounds { @@ -595,6 +681,8 @@ impl LamellarRequest for BarrierHandle { } round += 1; } + self.cur_barrier_id + .store(self.barrier_id + 1, Ordering::SeqCst); } State::RoundInProgress(round, recv_pe) => { let mut round = round; @@ -612,6 +700,8 @@ impl LamellarRequest for BarrierHandle { } round += 1; } + self.cur_barrier_id + .store(self.barrier_id + 1, Ordering::SeqCst); } } } diff --git a/src/lamellar_request.rs b/src/lamellar_request.rs index 9a45d5c5..7016abaf 100755 --- a/src/lamellar_request.rs +++ b/src/lamellar_request.rs @@ -18,46 +18,6 @@ pub(crate) enum InternalResult { Unit, } -// #[enum_dispatch(Future, LamellarRequest)] -// pub(crate) enum LamellarHandle { -// SinglePeAm(AmHandle), -// } -// impl LamellarHandle { -// pub fn blocking_wait(&self) -> T { -// match self { -// LamellarHandle::SinglePeAm(h) => h.blocking_wait(), -// } -// } - -// pub fn ready(&self) -> bool { -// match self { -// LamellarHandle::SinglePeAm(h) => h.ready(), -// } -// } -// } - -// impl 
Future for LamellarHandle { -// type Output = T; -// fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { -// match self { -// LamellarHandle::SinglePeAm(h) => h.poll(cx), -// } -// } -// } - -// pub(crate) mod private { -// use crate::active_messaging::handle::AmHandle; -// use crate::lamellar_request::LamellarHandle; -// use enum_dispatch::enum_dispatch; -// use futures_util::Future; -// use std::task::Waker; - -// #[enum_dispatch(LamellarHandle)] -// pub trait LamellarRequestSealed: { - -// } -// } - //#[doc(hidden)] // #[enum_dispatch] pub(crate) trait LamellarRequest: Future { @@ -77,7 +37,7 @@ pub(crate) trait LamellarRequest: Future { pub(crate) trait LamellarRequestAddResult: Sync + Send { fn user_held(&self) -> bool; fn add_result(&self, pe: usize, sub_id: usize, data: InternalResult); - fn update_counters(&self); + fn update_counters(&self, sub_id: usize); } pub(crate) enum LamellarRequestResult { @@ -135,7 +95,7 @@ impl LamellarRequestResult { } } } - req.update_counters(); + req.update_counters(sub_id); added } @@ -148,478 +108,3 @@ impl LamellarRequestResult { } } } - -// #[derive(Debug)] -// pub struct LamellarHandle { -// pub(crate) inner: Arc>, -// } - -// impl Drop for LamellarHandle { -// fn drop(&mut self) { -// self.inner.user_handle.store(false, Ordering::SeqCst); -// } -// } - -// impl LamellarHandle { -// fn process_result(&self, data: InternalResult) -> T { -// match data { -// InternalResult::Local(x) => { -// if let Ok(result) = x.downcast::() { -// *result -// } else { -// panic!("unexpected local result of type "); -// } -// } -// InternalResult::Remote(x, darcs) => { -// if let Ok(result) = x.deserialize_data::() { -// // we need to appropraiately set the reference counts if the returned data contains any Darcs -// // we "cheat" in that we dont actually care what the Darc wraps (hence the cast to ()) we just care -// // that the reference count is updated. 
-// for darc in darcs { -// match darc { -// RemotePtr::NetworkDarc(darc) => { -// let temp: Darc<()> = darc.into(); -// temp.des(Ok(0)); -// temp.inc_local_cnt(1); //we drop temp decreasing local count, but need to account for the actual real darc (and we unfourtunately cannot enforce the T: DarcSerde bound, or at least I havent figured out how to yet) -// } -// RemotePtr::NetMemRegionHandle(mr) => { -// let temp: Arc = mr.into(); -// temp.local_ref.fetch_add(2, Ordering::SeqCst); // Need to increase by two, 1 for temp, 1 for result -// } -// } -// } - -// result -// } else { -// panic!("unexpected remote result of type "); -// } -// } -// InternalResult::Unit => { -// if let Ok(result) = (Box::new(()) as Box).downcast::() { -// *result -// } else { -// panic!("unexpected unit result of type "); -// } -// } -// } -// } -// } - -// impl private::LamellarRequestSealed for LamellarHandle { -// fn set_waker(&mut self, waker: &Waker) { -// self.inner.set_waker(waker); -// } - -// fn val(&self) -> Self::Output { -// self.inner.val(); -// } -// } - -// impl LamellarRequest for LamellarHandle { -// fn blocking_wait(&self) -> T { -// self.inner.blocking_wait() -// } - -// fn ready(&self) -> bool { -// self.inner.read() -// } -// } - -// impl Future for LamellarHandle { -// type Output = T; -// fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { -// self.inner.poll(cx) -// } -// } - -// pub(crate) struct LamellarRequestHandleInner { -// pub(crate) ready: AtomicBool, -// pub(crate) waker: Mutex>, -// pub(crate) data: Cell>, //we only issue a single request, which the runtime will update, but the user also has a handle so we need a way to mutate -// pub(crate) team_outstanding_reqs: Arc, -// pub(crate) world_outstanding_reqs: Arc, -// pub(crate) tg_outstanding_reqs: Option>, -// pub(crate) scheduler: Arc, -// pub(crate) user_handle: AtomicBool, //we can use this flag to optimize what happens when the request returns -// } -// impl std::fmt::Debug for LamellarRequestHandleInner { -// fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { -// write!(f, "LamellarRequestHandleInner {{ ready: {:?}, team_outstanding_reqs: {:?} world_outstanding_reqs {:?} tg_outstanding_reqs {:?} user_handle{:?} }}", self.ready.load(Ordering::Relaxed), self.team_outstanding_reqs.load(Ordering::Relaxed), self.world_outstanding_reqs.load(Ordering::Relaxed), self.tg_outstanding_reqs.as_ref().map(|x| x.load(Ordering::Relaxed)), self.user_handle.load(Ordering::Relaxed)) -// } -// } -// // we use the ready bool to protect access to the data field -// unsafe impl Sync for LamellarRequestHandleInner {} - -// //#[doc(hidden)] -// #[derive(Debug)] -// pub struct LamellarRequestHandle { -// pub(crate) inner: Arc, -// pub(crate) _phantom: std::marker::PhantomData, -// } - -// impl Drop for LamellarRequestHandle { -// //#[tracing::instrument(skip_all)] -// fn drop(&mut self) { -// self.inner.user_handle.store(false, Ordering::SeqCst); -// } -// } - -// impl LamellarRequestAddResult for LamellarRequestHandleInner { -// //#[tracing::instrument(skip_all)] -// fn user_held(&self) -> bool { -// self.user_handle.load(Ordering::SeqCst) -// } -// //#[tracing::instrument(skip_all)] -// fn add_result(&self, _pe: usize, _sub_id: usize, data: InternalResult) { -// // for a single request this is only called one time by a single runtime thread so use of the cell is safe -// self.data.set(Some(data)); -// self.ready.store(true, Ordering::SeqCst); -// if let Some(waker) = self.waker.lock().take() { -// waker.wake(); -// } -// } 
-// //#[tracing::instrument(skip_all)] -// fn update_counters(&self) { -// let _team_reqs = self.team_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); -// let _world_req = self.world_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); -// // println!( -// // "[{:?}] update counter team {} world {}", -// // std::thread::current().id(), -// // _team_reqs - 1, -// // _world_req - 1 -// // ); -// if let Some(tg_outstanding_reqs) = self.tg_outstanding_reqs.clone() { -// tg_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); -// } -// } -// } - -// impl LamellarRequestHandle { -// //#[tracing::instrument(skip_all)] -// fn process_result(&self, data: InternalResult) -> T { -// match data { -// InternalResult::Local(x) => { -// if let Ok(result) = x.downcast::() { -// *result -// } else { -// panic!("unexpected local result of type "); -// } -// } -// InternalResult::Remote(x, darcs) => { -// if let Ok(result) = x.deserialize_data::() { -// // we need to appropraiately set the reference counts if the returned data contains any Darcs -// // we "cheat" in that we dont actually care what the Darc wraps (hence the cast to ()) we just care -// // that the reference count is updated. -// for darc in darcs { -// match darc { -// RemotePtr::NetworkDarc(darc) => { -// let temp: Darc<()> = darc.into(); -// temp.des(Ok(0)); -// temp.inc_local_cnt(1); //we drop temp decreasing local count, but need to account for the actual real darc (and we unfourtunately cannot enforce the T: DarcSerde bound, or at least I havent figured out how to yet) -// } -// RemotePtr::NetMemRegionHandle(mr) => { -// let temp: Arc = mr.into(); -// temp.local_ref.fetch_add(2, Ordering::SeqCst); // Need to increase by two, 1 for temp, 1 for result -// } -// } -// } - -// result -// } else { -// panic!("unexpected remote result of type "); -// } -// } -// InternalResult::Unit => { -// if let Ok(result) = (Box::new(()) as Box).downcast::() { -// *result -// } else { -// panic!("unexpected unit result of type "); -// } -// } -// } -// } -// } - -// #[async_trait] -// impl LamellarRequest for LamellarRequestHandle { -// type Output = T; -// //#[tracing::instrument(skip_all)] -// async fn into_future(mut self: Box) -> Self::Output { -// while !self.inner.ready.load(Ordering::SeqCst) { -// async_std::task::yield_now().await; -// } -// self.process_result(self.inner.data.replace(None).expect("result should exist")) -// } -// //#[tracing::instrument(skip_all)] -// fn blocking_wait(&self) -> T { -// while !self.inner.ready.load(Ordering::SeqCst) { -// // std::thread::yield_now(); -// self.inner.scheduler.exec_task(); -// } -// self.process_result(self.inner.data.replace(None).expect("result should exist")) -// } - -// fn ready(&self) -> bool { -// self.inner.ready.load(Ordering::SeqCst) -// } - -// fn set_waker(&mut self, waker: &Waker) { -// *self.inner.waker.lock() = Some(waker); -// } -// } - -// #[derive(Debug)] -// pub(crate) struct LamellarMultiRequestHandleInner { -// pub(crate) cnt: AtomicUsize, -// pub(crate) arch: Arc, -// pub(crate) data: Mutex>, -// pub(crate) waker: Mutex>, -// pub(crate) team_outstanding_reqs: Arc, -// pub(crate) world_outstanding_reqs: Arc, -// pub(crate) tg_outstanding_reqs: Option>, -// pub(crate) scheduler: Arc, -// pub(crate) user_handle: AtomicBool, //we can use this flag to optimize what happens when the request returns -// } - -// //#[doc(hidden)] -// #[derive(Debug)] -// pub struct LamellarMultiRequestHandle { -// pub(crate) inner: Arc, -// pub(crate) _phantom: std::marker::PhantomData, -// } - -// impl Drop for 
LamellarMultiRequestHandle { -// //#[tracing::instrument(skip_all)] -// fn drop(&mut self) { -// self.inner.user_handle.store(false, Ordering::SeqCst); -// } -// } - -// impl LamellarRequestAddResult for LamellarMultiRequestHandleInner { -// //#[tracing::instrument(skip_all)] -// fn user_held(&self) -> bool { -// self.user_handle.load(Ordering::SeqCst) -// } -// //#[tracing::instrument(skip_all)] -// fn add_result(&self, pe: usize, _sub_id: usize, data: InternalResult) { -// let pe = self.arch.team_pe(pe).expect("pe does not exist on team"); -// self.data.lock().insert(pe, data); -// self.cnt.fetch_sub(1, Ordering::SeqCst); -// if self.cnt.load(Ordering::SeqCst) == 0 { -// if let Some(waker) = self.waker.lock().take() { -// waker.wake(); -// } -// } -// } -// //#[tracing::instrument(skip_all)] -// fn update_counters(&self) { -// // println!( -// // "update counter {:?} {:?}", -// // self.team_outstanding_reqs.load(Ordering::SeqCst), -// // self.world_outstanding_reqs.load(Ordering::SeqCst) -// // ); -// let _team_reqs = self.team_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); -// let _world_req = self.world_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); -// // println!( -// // "[{:?}] multi update counter team {} world {}", -// // std::thread::current().id(), -// // _team_reqs - 1, -// // _world_req - 1 -// // ); -// if let Some(tg_outstanding_reqs) = self.tg_outstanding_reqs.clone() { -// tg_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); -// } -// } -// } - -// impl LamellarMultiRequestHandle { -// //#[tracing::instrument(skip_all)] -// fn process_result(&self, data: InternalResult) -> T { -// match data { -// InternalResult::Local(x) => { -// if let Ok(result) = x.downcast::() { -// *result -// } else { -// panic!("unexpected local result of type "); -// } -// } -// InternalResult::Remote(x, darcs) => { -// if let Ok(result) = x.deserialize_data::() { -// // we need to appropraiately set the reference counts if the returned data contains any Darcs -// // we "cheat" in that we dont actually care what the Darc wraps (hence the cast to ()) we just care -// // that the reference count is updated. 
-// for darc in darcs { -// match darc { -// RemotePtr::NetworkDarc(darc) => { -// let temp: Darc<()> = darc.into(); -// temp.des(Ok(0)); -// temp.inc_local_cnt(1); //we drop temp decreasing local count, but need to account for the actual real darc (and we unfourtunately cannot enforce the T: DarcSerde bound, or at least I havent figured out how to yet) -// } -// RemotePtr::NetMemRegionHandle(mr) => { -// let temp: Arc = mr.into(); -// temp.local_ref.fetch_add(2, Ordering::SeqCst); // Need to increase by two, 1 for temp, 1 for result -// } -// } -// } -// result -// } else { -// panic!("unexpected remote result of type "); -// } -// } -// InternalResult::Unit => { -// if let Ok(result) = (Box::new(()) as Box).downcast::() { -// *result -// } else { -// panic!("unexpected unit result of type "); -// } -// } -// } -// } -// } - -// #[async_trait] -// impl LamellarMultiRequest for LamellarMultiRequestHandle { -// type Output = T; -// //#[tracing::instrument(skip_all)] -// async fn into_future(mut self: Box) -> Vec { -// while self.inner.cnt.load(Ordering::SeqCst) > 0 { -// async_std::task::yield_now().await; -// } -// let mut res = vec![]; -// let mut data = self.inner.data.lock(); -// // println!("data len{:?}", data.len()); -// for pe in 0..data.len() { -// res.push(self.process_result(data.remove(&pe).expect("result should exist"))); -// } -// res -// } -// //#[tracing::instrument(skip_all)] -// fn blocking_wait(&self) -> Vec { -// while self.inner.cnt.load(Ordering::SeqCst) > 0 { -// // std::thread::yield_now(); -// self.inner.scheduler.exec_task(); -// } -// let mut res = vec![]; -// let mut data = self.inner.data.lock(); -// for pe in 0..data.len() { -// res.push(self.process_result(data.remove(&pe).expect("result should exist"))); -// } -// res -// } -// } - -// pub(crate) struct LamellarLocalRequestHandleInner { -// // pub(crate) ready: AtomicBool, -// pub(crate) ready: (Mutex, Condvar), -// pub(crate) waker: Mutex>, -// pub(crate) data: Cell>, //we only issue a single request, which the runtime will update, but the user also has a handle so we need a way to mutate -// pub(crate) team_outstanding_reqs: Arc, -// pub(crate) world_outstanding_reqs: Arc, -// pub(crate) tg_outstanding_reqs: Option>, -// pub(crate) scheduler: Arc, -// pub(crate) user_handle: AtomicBool, //we can use this flag to optimize what happens when the request returns -// } - -// impl std::fmt::Debug for LamellarLocalRequestHandleInner { -// fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { -// write!(f, "LamellarLocalRequestHandleInner {{ ready: {:?}, team_outstanding_reqs {:?}, world_outstanding_reqs {:?}, tg_outstanding_reqs{:?}, user_handle {:?}}}", self.ready.0.lock(), self.team_outstanding_reqs.load(Ordering::SeqCst), self.world_outstanding_reqs.load(Ordering::SeqCst), self.tg_outstanding_reqs.as_ref().map(|x| x.load(Ordering::SeqCst)), self.user_handle.load(Ordering::SeqCst)) -// } -// } - -// // we use the ready bool to protect access to the data field -// unsafe impl Sync for LamellarLocalRequestHandleInner {} - -// //#[doc(hidden)] -// #[derive(Debug)] -// pub struct LamellarLocalRequestHandle { -// pub(crate) inner: Arc, -// pub(crate) _phantom: std::marker::PhantomData, -// } - -// impl Drop for LamellarLocalRequestHandle { -// //#[tracing::instrument(skip_all)] -// fn drop(&mut self) { -// self.inner.user_handle.store(false, Ordering::SeqCst); -// } -// } - -// impl LamellarRequestAddResult for LamellarLocalRequestHandleInner { -// //#[tracing::instrument(skip_all)] -// fn 
user_held(&self) -> bool { -// self.user_handle.load(Ordering::SeqCst) -// } -// //#[tracing::instrument(skip_all)] -// fn add_result(&self, _pe: usize, _sub_id: usize, data: InternalResult) { -// // for a single request this is only called one time by a single runtime thread so use of the cell is safe -// match data { -// InternalResult::Local(x) => self.data.set(Some(x)), -// InternalResult::Remote(_, _) => panic!("unexpected local result of type "), -// InternalResult::Unit => self.data.set(Some(Box::new(()) as LamellarAny)), -// } - -// // self.ready.store(true, Ordering::SeqCst); -// *self.ready.0.lock() = true; -// self.ready.1.notify_one(); -// if let Some(waker) = self.waker.lock().take() { -// waker.wake(); -// } -// } -// //#[tracing::instrument(skip_all)] -// fn update_counters(&self) { -// let _team_reqs = self.team_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); -// let _world_req = self.world_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); -// // println!( -// // "[{:?}] local update counter team {} world {}", -// // std::thread::current().id(), -// // _team_reqs - 1, -// // _world_req - 1 -// // ); -// if let Some(tg_outstanding_reqs) = self.tg_outstanding_reqs.clone() { -// tg_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); -// } -// } -// } - -// impl LamellarLocalRequestHandle { -// //#[tracing::instrument(skip_all)] -// fn process_result(&self, data: LamellarAny) -> T { -// if let Ok(result) = data.downcast::() { -// *result -// } else { -// panic!("unexpected local result of type "); -// } -// } -// } - -// #[async_trait] -// impl LamellarRequest for LamellarLocalRequestHandle { -// type Output = T; -// //#[tracing::instrument(skip_all)] -// async fn into_future(mut self: Box) -> Self::Output { -// while !*self.inner.ready.0.lock() { -// async_std::task::yield_now().await; -// } -// self.process_result(self.inner.data.replace(None).expect("result should exist")) -// } -// //#[tracing::instrument(skip_all)] -// fn blocking_wait(&self) -> T { -// // let mut ready_lock = self.inner.ready.0.lock(); -// // while !*ready_lock { -// while !*self.inner.ready.0.lock() { -// // std::thread::yield_now(); -// // self.inner.ready.1.wait(&mut ready_lock); -// self.inner.scheduler.exec_task(); -// } -// self.process_result(self.inner.data.replace(None).expect("result should exist")) -// } - -// fn ready(&self) -> bool { -// let ready = *self.inner.ready.0.lock(); -// // println!("ready: {}", ready); -// ready -// } - -// fn set_waker(&mut self, waker: &Waker) { -// *self.inner.waker.lock() = Some(waker); -// } -// } diff --git a/src/lamellar_task_group.rs b/src/lamellar_task_group.rs index 40590b49..ae76e751 100644 --- a/src/lamellar_task_group.rs +++ b/src/lamellar_task_group.rs @@ -18,7 +18,8 @@ use futures_util::future::join_all; use futures_util::{Future, StreamExt}; use parking_lot::Mutex; use pin_project::{pin_project, pinned_drop}; -use std::collections::{BTreeMap, HashMap}; +use std::collections::{BTreeMap, HashMap, HashSet}; +use std::hash::Hash; use std::marker::PhantomData; use std::pin::Pin; use std::sync::atomic::{AtomicUsize, Ordering}; @@ -36,6 +37,7 @@ pub(crate) struct TaskGroupAmHandleInner { world_outstanding_reqs: Arc, tg_outstanding_reqs: Option>, pub(crate) scheduler: Arc, + // pending_reqs: Arc>>, } //#[doc(hidden)] @@ -61,16 +63,18 @@ impl LamellarRequestAddResult for TaskGroupAmHandleInner { fn add_result(&self, _pe: usize, sub_id: usize, data: InternalResult) { self.data.lock().insert(sub_id, data); if let Some(waker) = self.wakers.lock().remove(&sub_id) { 
+ // println!("waker found for sub_id {}", sub_id); waker.wake(); } } - fn update_counters(&self) { + fn update_counters(&self, _sub_id: usize) { let _team_reqs = self.team_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); let _world_req = self.world_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); // println!("tg update counter team {} world {}",_team_reqs-1,_world_req-1); if let Some(tg_outstanding_reqs) = self.tg_outstanding_reqs.clone() { tg_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); } + // self.pending_reqs.lock().remove(&sub_id); } } @@ -204,6 +208,7 @@ pub(crate) struct TaskGroupMultiAmHandleInner { world_outstanding_reqs: Arc, tg_outstanding_reqs: Option>, pub(crate) scheduler: Arc, + // pending_reqs: Arc>>, } //#[doc(hidden)] @@ -237,13 +242,14 @@ impl LamellarRequestAddResult for TaskGroupMultiAmHandleInner { waker.wake(); } } - fn update_counters(&self) { + fn update_counters(&self, _sub_id: usize) { let _team_reqs = self.team_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); let _world_req = self.world_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); // println!("tg update counter team {} world {}",_team_reqs-1,_world_req-1); if let Some(tg_outstanding_reqs) = self.tg_outstanding_reqs.clone() { tg_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); } + // self.pending_reqs.lock().remove(&sub_id); } } @@ -466,8 +472,10 @@ impl LamellarRequest for TaskGroupLocalAmHandle { fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { let data = self.inner.data.lock(); if data.contains_key(&self.sub_id) { + // println!("request ready {:?}", self.sub_id); true } else { + // println!("request not ready setting waker {:?}", self.sub_id); //this can probably be optimized similar to set_waker of MultiAmHandle // where we check if the waker already exists and if it wakes to same task self.inner.wakers.lock().insert(self.sub_id, waker.clone()); @@ -571,6 +579,8 @@ pub struct LamellarTaskGroup { rt_req: Arc, //for exec_pe requests rt_multi_req: Arc, //for exec_all requests rt_local_req: Arc, //for exec_local requests + + // pub(crate) pending_reqs: Arc>>, } impl ActiveMessaging for LamellarTaskGroup { @@ -664,6 +674,7 @@ impl LamellarTaskGroup { let team = team.into().team.clone(); let counters = AMCounters::new(); let cnt = Arc::new(AtomicUsize::new(1)); //this lamellarTaskGroup instance represents 1 handle (even though we maintain a single and multi req handle) + // let pending_reqs = Arc::new(Mutex::new(HashSet::new())); let req = Arc::new(TaskGroupAmHandleInner { cnt: cnt.clone(), data: Mutex::new(HashMap::new()), @@ -672,6 +683,7 @@ impl LamellarTaskGroup { world_outstanding_reqs: team.world_counters.outstanding_reqs.clone(), tg_outstanding_reqs: Some(counters.outstanding_reqs.clone()), scheduler: team.scheduler.clone(), + // pending_reqs: pending_reqs.clone(), }); let rt_req = Arc::new(LamellarRequestResult::TgAm(req.clone())); let multi_req = Arc::new(TaskGroupMultiAmHandleInner { @@ -683,6 +695,7 @@ impl LamellarTaskGroup { world_outstanding_reqs: team.world_counters.outstanding_reqs.clone(), tg_outstanding_reqs: Some(counters.outstanding_reqs.clone()), scheduler: team.scheduler.clone(), + // pending_reqs: pending_reqs.clone(), }); let rt_multi_req = Arc::new(LamellarRequestResult::TgMultiAm(multi_req.clone())); let local_req = Arc::new(TaskGroupAmHandleInner { @@ -693,6 +706,7 @@ impl LamellarTaskGroup { world_outstanding_reqs: team.world_counters.outstanding_reqs.clone(), tg_outstanding_reqs: Some(counters.outstanding_reqs.clone()), scheduler: team.scheduler.clone(), + // 
pending_reqs: pending_reqs.clone(), }); let rt_local_req = Arc::new(LamellarRequestResult::TgAm(local_req.clone())); LamellarTaskGroup { @@ -709,6 +723,7 @@ impl LamellarTaskGroup { rt_req: rt_req, rt_multi_req: rt_multi_req, rt_local_req: rt_local_req, + // pending_reqs: pending_reqs, } } @@ -734,14 +749,18 @@ impl LamellarTaskGroup { } if temp_now.elapsed().as_secs_f64() > config().deadlock_timeout { println!( - "in task group wait_all mype: {:?} cnt: {:?} {:?}", + "in task group wait_all mype: {:?} cnt: team {:?} team {:?} tg {:?} tg {:?}", self.team.world_pe, self.team.team_counters.send_req_cnt.load(Ordering::SeqCst), self.team .team_counters .outstanding_reqs .load(Ordering::SeqCst), + self.counters.send_req_cnt.load(Ordering::SeqCst), + self.counters.outstanding_reqs.load(Ordering::SeqCst), + // self.pending_reqs.lock() ); + self.team.scheduler.print_status(); temp_now = Instant::now(); } } @@ -793,6 +812,7 @@ impl LamellarTaskGroup { id: self.multi_id, sub_id: self.sub_id_counter.fetch_add(1, Ordering::SeqCst), }; + // self.pending_reqs.lock().insert(req_id.sub_id); let req_data = ReqMetaData { src: self.team.world_pe, @@ -834,6 +854,7 @@ impl LamellarTaskGroup { id: self.id, sub_id: self.sub_id_counter.fetch_add(1, Ordering::SeqCst), }; + // self.pending_reqs.lock().insert(req_id.sub_id); let req_data = ReqMetaData { src: self.team.world_pe, dst: Some(self.team.arch.world_pe(pe).expect("pe not member of team")), @@ -880,6 +901,7 @@ impl LamellarTaskGroup { id: self.local_id, sub_id: self.sub_id_counter.fetch_add(1, Ordering::SeqCst), }; + // self.pending_reqs.lock().insert(req_id.sub_id); let req_data = ReqMetaData { src: self.team.world_pe, dst: Some(self.team.world_pe), diff --git a/src/scheduler.rs b/src/scheduler.rs index fbc1cd36..00794cfd 100755 --- a/src/scheduler.rs +++ b/src/scheduler.rs @@ -510,6 +510,17 @@ impl Scheduler { self.executor.clone() } + pub(crate) fn print_status(&self) { + println!( + "status: {:?} num tasks: {:?} max tasks: {:?} num ams {:?} max ams {:?}", + self.status.load(Ordering::SeqCst), + self.num_tasks.load(Ordering::SeqCst), + self.max_tasks.load(Ordering::SeqCst), + self.num_ams.load(Ordering::SeqCst), + self.max_ams.load(Ordering::SeqCst) + ); + } + pub(crate) fn active(&self) -> bool { // if self.status.load(Ordering::SeqCst) == SchedulerStatus::Finished as u8 { // println!( From 31ccf09def9be4d25e07b70dec040f4366e52f3b Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Wed, 31 Jul 2024 16:27:12 -0700 Subject: [PATCH 066/116] fix for including generic trait bounds in derived active message return types --- Cargo.toml | 6 ++--- impl/src/gen_am.rs | 6 ++--- impl/src/replace.rs | 58 ++++++++++++++++++++++----------------------- 3 files changed, 35 insertions(+), 35 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 5e715c1e..0a7b53a6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "lamellar" -version = "0.6.1" +version = "0.7" authors = ["Ryan D. Friese ", "Roberto Gioiosa ", "Joseph Cottam ","Greg Roek ","Erdal Mutlu "] edition = "2021" description = "Lamellar is an asynchronous tasking runtime for HPC systems developed in RUST." 
@@ -14,8 +14,8 @@ categories = ["asynchronous","concurrency", "network-programming","science"] [dependencies] lamellar-impl = { version = "0.6.0", path = "impl" } #rofisys = { version ="0.3", optional = true } -#rofisys = {git = "https://github.com/pnnl/rofi-sys.git", branch = "master", optional = true} -rofisys = { path = "../rofi-sys-junction", optional = true} +rofisys = {git = "https://github.com/pnnl/rofi-sys.git", branch = "master", optional = true} +#rofisys = { path = "../rofi-sys-junction", optional = true} inventory = "0.3" serde = { version = "1.0.147", features = ["derive"] } serde_bytes = "0.11.7" diff --git a/impl/src/gen_am.rs b/impl/src/gen_am.rs index 25baf8aa..7b1129b1 100644 --- a/impl/src/gen_am.rs +++ b/impl/src/gen_am.rs @@ -351,7 +351,7 @@ pub(crate) fn impl_return_struct( bytes_buf: bool, local: bool, ) -> proc_macro2::TokenStream { - let (_impl_generics, ty_generics, where_clause) = generics.split_for_impl(); + let (impl_generics, _ty_generics, where_clause) = generics.split_for_impl(); let generic_phantoms = generics.type_params().fold(quote! {}, |acc, t| { let name = quote::format_ident!("_phantom_{}", t.ident.to_string().to_lowercase()); @@ -363,7 +363,7 @@ pub(crate) fn impl_return_struct( let mut the_ret_struct = if bytes_buf { quote! { #am_data_header - struct #ret_struct_name #ty_generics #where_clause{ + struct #ret_struct_name #impl_generics #where_clause{ val: serde_bytes::ByteBuf, #generic_phantoms } @@ -371,7 +371,7 @@ pub(crate) fn impl_return_struct( } else { quote! { #am_data_header - struct #ret_struct_name #ty_generics #where_clause{ + struct #ret_struct_name #impl_generics #where_clause{ val: #ret_type, #generic_phantoms } diff --git a/impl/src/replace.rs b/impl/src/replace.rs index 5403218e..b147a8b5 100644 --- a/impl/src/replace.rs +++ b/impl/src/replace.rs @@ -2,7 +2,7 @@ use quote::{format_ident, quote, quote_spanned, ToTokens}; use syn::fold::Fold; use syn::parse::Result; use syn::spanned::Spanned; -use syn::visit_mut::VisitMut; +// use syn::visit_mut::VisitMut; use syn::{parse_quote, parse_quote_spanned}; use crate::parse::{FormatArgs, VecArgs}; @@ -105,7 +105,7 @@ impl Fold for ReplaceSelf { } pub(crate) struct LamellarDSLReplace; -pub(crate) struct DarcReplace; +// pub(crate) struct DarcReplace; impl Fold for LamellarDSLReplace { fn fold_expr_path(&mut self, path: syn::ExprPath) -> syn::ExprPath { @@ -186,32 +186,32 @@ impl Fold for LamellarDSLReplace { } } -impl VisitMut for DarcReplace { - fn visit_ident_mut(&mut self, i: &mut syn::Ident) { - let span = i.span(); - // println!("ident: {:?}",i); - if i.to_string() == "Darc<" { - *i = syn::Ident::new("__AmDarc", span); - } - // println!("ident: {:?}",i); - syn::visit_mut::visit_ident_mut(self, i); - } +// impl VisitMut for DarcReplace { +// fn visit_ident_mut(&mut self, i: &mut syn::Ident) { +// let span = i.span(); +// // println!("ident: {:?}",i); +// if i.to_string() == "Darc<" { +// *i = syn::Ident::new("__AmDarc", span); +// } +// // println!("ident: {:?}",i); +// syn::visit_mut::visit_ident_mut(self, i); +// } - fn visit_macro_mut(&mut self, i: &mut syn::Macro) { - let args: Result = i.parse_body(); +// fn visit_macro_mut(&mut self, i: &mut syn::Macro) { +// let args: Result = i.parse_body(); - if args.is_ok() { - let tok_str = i.tokens.to_string(); - let tok_str = tok_str.split(",").collect::>(); - let mut new_tok_str: String = tok_str[0].to_string(); - for i in 1..tok_str.len() { - new_tok_str += - &(",".to_owned() + &tok_str[i].to_string().replace("self", "__lamellar_data")); - } 
- i.tokens = new_tok_str.parse().unwrap(); - } else { - // println!("warning unrecognized macro {:?} in lamellar::am expansion can currently only handle format like macros", i); - } - syn::visit_mut::visit_macro_mut(self, i); - } -} +// if args.is_ok() { +// let tok_str = i.tokens.to_string(); +// let tok_str = tok_str.split(",").collect::>(); +// let mut new_tok_str: String = tok_str[0].to_string(); +// for i in 1..tok_str.len() { +// new_tok_str += +// &(",".to_owned() + &tok_str[i].to_string().replace("self", "__lamellar_data")); +// } +// i.tokens = new_tok_str.parse().unwrap(); +// } else { +// // println!("warning unrecognized macro {:?} in lamellar::am expansion can currently only handle format like macros", i); +// } +// syn::visit_mut::visit_macro_mut(self, i); +// } +// } From 3b844fd14a2b70881e33e7d58fe763a54dbdcebf Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Wed, 31 Jul 2024 16:28:30 -0700 Subject: [PATCH 067/116] fix version number --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 0a7b53a6..c36a6d8d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "lamellar" -version = "0.7" +version = "0.7.0" authors = ["Ryan D. Friese ", "Roberto Gioiosa ", "Joseph Cottam ","Greg Roek ","Erdal Mutlu "] edition = "2021" description = "Lamellar is an asynchronous tasking runtime for HPC systems developed in RUST." From 49c59766bfdfc895321739427a6846f0f464bd04 Mon Sep 17 00:00:00 2001 From: "ryan.friese@pnnl.gov" Date: Wed, 7 Aug 2024 15:50:26 -0700 Subject: [PATCH 068/116] fix sub array block distribution calculations --- lamellar_run.sh | 2 +- src/array.rs | 8 +- src/array/unsafe.rs | 99 +++++++++++-------- tests/add.rs | 1 + tests/array/arithmetic_ops/add_test.rs | 132 ++++++++++++------------- 5 files changed, 132 insertions(+), 110 deletions(-) diff --git a/lamellar_run.sh b/lamellar_run.sh index 5f3af138..b099d862 100755 --- a/lamellar_run.sh +++ b/lamellar_run.sh @@ -34,7 +34,7 @@ for pe in $(seq 0 $ENDPE); do echo "more threads ${E_CORE} than cores ${NPROC} " exit fi - LAMELLAR_BACKEND="shmem" LAMELLAR_MEM_SIZE=$((1*1024*1024*1024)) LAMELLAR_THREADS=$((THREADS)) LAMELLAR_NUM_PES=$NUMPES LAMELLAR_PE_ID=$pe LAMELLAR_JOB_ID=$JOBID $bin "${@:2}" &>> ${pe}_out.txt & + LAMELLAR_BACKEND="shmem" LAMELLAR_MEM_SIZE=$((1*1024*1024*1024)) LAMELLAR_THREADS=$((THREADS)) LAMELLAR_NUM_PES=$NUMPES LAMELLAR_PE_ID=$pe LAMELLAR_JOB_ID=$JOBID $bin "${@:2}" & S_CORE=$(($E_CORE )) E_CORE=$(($S_CORE + $THREADS)) done diff --git a/src/array.rs b/src/array.rs index 8add8161..48662d97 100644 --- a/src/array.rs +++ b/src/array.rs @@ -194,10 +194,10 @@ crate::inventory::collect!(ReduceKey); // lamellar_impl::generate_reductions_for_type_rt!(true, u8, usize); // lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, usize); -// lamellar_impl::generate_reductions_for_type_rt!(false, f32); -// lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32); -// lamellar_impl::generate_reductions_for_type_rt!(false, u128); -// lamellar_impl::generate_ops_for_type_rt!(true, false, true, u128); +// lamellar_impl::generate_reductions_for_type_rt!(false, f64); +// lamellar_impl::generate_ops_for_type_rt!(false, false, false, f64); +lamellar_impl::generate_reductions_for_type_rt!(false, u128); +lamellar_impl::generate_ops_for_type_rt!(true, false, true, u128); lamellar_impl::generate_reductions_for_type_rt!(true, u8, u16, u32, u64, usize); lamellar_impl::generate_reductions_for_type_rt!(false, u128); diff --git 
a/src/array/unsafe.rs b/src/array/unsafe.rs index a87a407f..16ac52bb 100644 --- a/src/array/unsafe.rs +++ b/src/array/unsafe.rs @@ -116,10 +116,10 @@ pub(crate) struct UnsafeArrayInnerWeak { pub(crate) data: WeakDarc, pub(crate) distribution: Distribution, orig_elem_per_pe: usize, - orig_remaining_elems: usize, - elem_size: usize, //for bytes array will be size of T, for T array will be 1 - offset: usize, //relative to size of T - size: usize, //relative to size of T + orig_remaining_elems: usize, // the number of elements that can't be evenly divided amongst all PES + elem_size: usize, //for bytes array will be size of T, for T array will be 1 + offset: usize, //relative to size of T + size: usize, //relative to size of T sub: bool, } @@ -1161,8 +1161,16 @@ impl LamellarArray for UnsafeArray { //#[tracing::instrument(skip_all)] fn pe_and_offset_for_global_index(&self, index: usize) -> Option<(usize, usize)> { if self.inner.sub { + // println!("sub array {index}"); let pe = self.inner.pe_for_dist_index(index)?; + // println!("pe: {pe}"); let offset = self.inner.pe_sub_offset_for_dist_index(pe, index)?; + // println!( + // "sub array index {index} pe {pe} offset {offset} size {} {} {}", + // self.inner.size, + // self.inner.num_elems_pe(0), + // self.inner.num_elems_pe(1) + // ); Some((pe, offset)) } else { self.inner.full_pe_and_offset_for_global_index(index) @@ -1222,7 +1230,14 @@ impl SubArray for UnsafeArray { start, end, self.inner.size ); } - // println!("new inner {:?} {:?} {:?} {:?}",start,end,end-start,self.sub_array_offset + start); + // println!( + // "new inner start {:?} end {:?} size {:?} cur offset {:?} cur size {:?}", + // start, + // end, + // end - start, + // self.inner.offset, + // self.inner.size + // ); let mut inner = self.inner.clone(); inner.offset += start; inner.size = end - start; @@ -1534,22 +1549,21 @@ impl UnsafeArrayInner { match self.distribution { Distribution::Block => { let rem_index = self.orig_remaining_elems * (self.orig_elem_per_pe + 1); - let mut elem_per_pe = self.orig_elem_per_pe; - if rem_index < self.size { - elem_per_pe += 1; - } else { - global_index = global_index - rem_index; - } + let (pe, offset) = if global_index < rem_index { - (global_index / elem_per_pe, global_index % elem_per_pe) + //index is on a pe with extra elems + let pe = global_index / (self.orig_elem_per_pe + 1); // accounts for the reamining elems + let offset = global_index - (pe * (self.orig_elem_per_pe + 1)); + (pe, offset) } else { - ( - rem_index / elem_per_pe - + (global_index - rem_index) / self.orig_elem_per_pe, - global_index % self.orig_elem_per_pe, - ) + //index is on a pe without extra elems + let temp_index = global_index - rem_index; //get the remainin index after accounter for PEs with extra elements + let temp_pe = temp_index / self.orig_elem_per_pe; //the pe after accounting for PEs with extra elements + let pe = self.orig_remaining_elems // N pes that have extra elements + + temp_pe; + let offset = temp_index - (temp_pe * self.orig_elem_per_pe); + (pe, offset) }; - Some((pe, offset)) } Distribution::Cyclic => { @@ -1568,21 +1582,19 @@ impl UnsafeArrayInner { //index is relative to (sub)array (i.e. 
index=0 doesnt necessarily live on pe=0) // //#[tracing::instrument(skip_all)] pub(crate) fn pe_for_dist_index(&self, index: usize) -> Option { + // println!("pe_for_dist_index {index} {}", self.size); if self.size > index { let mut global_index = index + self.offset; + match self.distribution { Distribution::Block => { let rem_index = self.orig_remaining_elems * (self.orig_elem_per_pe + 1); - let mut elem_per_pe = self.orig_elem_per_pe; - if rem_index < self.size { - elem_per_pe += 1; - } else { - global_index = global_index - rem_index; - } let pe = if global_index < rem_index { - global_index / elem_per_pe + global_index / (self.orig_elem_per_pe + 1) // accounts for the reamining elems } else { - rem_index / elem_per_pe + (global_index - rem_index) / self.orig_elem_per_pe + self.orig_remaining_elems // N pes that have extra elements + + ((global_index - rem_index) //get the remainin index after accounter for PEs with extra elements + / self.orig_elem_per_pe) }; Some(pe) } @@ -1596,21 +1608,21 @@ impl UnsafeArrayInner { //index relative to subarray, return offset relative to subarray // //#[tracing::instrument(skip_all)] pub(crate) fn pe_full_offset_for_dist_index(&self, pe: usize, index: usize) -> Option { + // println!("pe_full_offset_for_dist_index pe {pe} index {index}"); let mut global_index = self.offset + index; - match self.distribution { Distribution::Block => { let rem_index = self.orig_remaining_elems * (self.orig_elem_per_pe + 1); - let mut elem_per_pe = self.orig_elem_per_pe; - if rem_index < self.size { - elem_per_pe += 1; - } else { - global_index = global_index - rem_index; - } + // println!("\tindex: {index} offset {} size {} global_index {global_index} rem_index {rem_index}",self.offset, self.size); let offset = if global_index < rem_index { - global_index % elem_per_pe + //index is on a pe with extra elems + global_index - (pe * (self.orig_elem_per_pe + 1)) } else { - global_index % self.orig_elem_per_pe + //index is on a pe without extra elems + let temp_index = global_index - rem_index; //get the remainin index after accounter for PEs with extra elements + let temp_pe = temp_index / self.orig_elem_per_pe; //the pe after accounting for PEs with extra elements + + temp_index - (temp_pe * self.orig_elem_per_pe) }; Some(offset) } @@ -1627,13 +1639,22 @@ impl UnsafeArrayInner { //index relative to subarray, return offset relative to subarray pub(crate) fn pe_sub_offset_for_dist_index(&self, pe: usize, index: usize) -> Option { - let offset = self.pe_full_offset_for_dist_index(pe, index)?; + // println!( + // "pe_sub_offset_for_dist_index index {index} pe {pe} offset {}", + // self.offset + // ); + let start_pe = self.pe_for_dist_index(0)?; + match self.distribution { Distribution::Block => { - if self.offset <= offset { - Some(offset - self.offset) + if start_pe == pe { + if index < self.size { + Some(index) + } else { + None + } } else { - None + self.pe_full_offset_for_dist_index(pe, index) } } Distribution::Cyclic => { diff --git a/tests/add.rs b/tests/add.rs index f185c5cd..598b916b 100644 --- a/tests/add.rs +++ b/tests/add.rs @@ -89,6 +89,7 @@ macro_rules! 
create_add_tests { create_add_tests!( (UnsafeArray, LocalLockArray, AtomicArray), // (UnsafeArray, AtomicArray, GenericAtomicArray, LocalLockArray), ("Block", "Cyclic"), + // (u8, f64), (u8, u16, u32, u128, usize, i8, i16, i32, i128, isize, f32, f64), (2, 3, 4), (4, 19, 128) diff --git a/tests/array/arithmetic_ops/add_test.rs b/tests/array/arithmetic_ops/add_test.rs index 64c016e2..c237d8e2 100644 --- a/tests/array/arithmetic_ops/add_test.rs +++ b/tests/array/arithmetic_ops/add_test.rs @@ -487,78 +487,78 @@ fn main() { }; match array.as_str() { - "UnsafeArray" => match elem.as_str() { - "u8" => add_test!(UnsafeArray, u8, len, dist_type), - "u16" => add_test!(UnsafeArray, u16, len, dist_type), - "u32" => add_test!(UnsafeArray, u32, len, dist_type), - "u64" => add_test!(UnsafeArray, u64, len, dist_type), - "u128" => add_test!(UnsafeArray, u128, len, dist_type), - "usize" => add_test!(UnsafeArray, usize, len, dist_type), - "i8" => add_test!(UnsafeArray, i8, len, dist_type), - "i16" => add_test!(UnsafeArray, i16, len, dist_type), - "i32" => add_test!(UnsafeArray, i32, len, dist_type), - "i64" => add_test!(UnsafeArray, i64, len, dist_type), - "i128" => add_test!(UnsafeArray, i128, len, dist_type), - "isize" => add_test!(UnsafeArray, isize, len, dist_type), - "f32" => add_test!(UnsafeArray, f32, len, dist_type), - "f64" => add_test!(UnsafeArray, f64, len, dist_type), - "input" => input_test!(UnsafeArray, len, dist_type), - _ => eprintln!("unsupported element type"), - }, + // "UnsafeArray" => match elem.as_str() { + // "u8" => add_test!(UnsafeArray, u8, len, dist_type), + // "u16" => add_test!(UnsafeArray, u16, len, dist_type), + // "u32" => add_test!(UnsafeArray, u32, len, dist_type), + // "u64" => add_test!(UnsafeArray, u64, len, dist_type), + // "u128" => add_test!(UnsafeArray, u128, len, dist_type), + // "usize" => add_test!(UnsafeArray, usize, len, dist_type), + // "i8" => add_test!(UnsafeArray, i8, len, dist_type), + // "i16" => add_test!(UnsafeArray, i16, len, dist_type), + // "i32" => add_test!(UnsafeArray, i32, len, dist_type), + // "i64" => add_test!(UnsafeArray, i64, len, dist_type), + // "i128" => add_test!(UnsafeArray, i128, len, dist_type), + // "isize" => add_test!(UnsafeArray, isize, len, dist_type), + // "f32" => add_test!(UnsafeArray, f32, len, dist_type), + // "f64" => add_test!(UnsafeArray, f64, len, dist_type), + // "input" => input_test!(UnsafeArray, len, dist_type), + // _ => eprintln!("unsupported element type"), + // }, "AtomicArray" => match elem.as_str() { "u8" => add_test!(AtomicArray, u8, len, dist_type), - "u16" => add_test!(AtomicArray, u16, len, dist_type), - "u32" => add_test!(AtomicArray, u32, len, dist_type), - "u64" => add_test!(AtomicArray, u64, len, dist_type), - "u128" => add_test!(AtomicArray, u128, len, dist_type), - "usize" => add_test!(AtomicArray, usize, len, dist_type), - "i8" => add_test!(AtomicArray, i8, len, dist_type), - "i16" => add_test!(AtomicArray, i16, len, dist_type), - "i32" => add_test!(AtomicArray, i32, len, dist_type), - "i64" => add_test!(AtomicArray, i64, len, dist_type), - "i128" => add_test!(AtomicArray, i128, len, dist_type), - "isize" => add_test!(AtomicArray, isize, len, dist_type), - "f32" => add_test!(AtomicArray, f32, len, dist_type), + // "u16" => add_test!(AtomicArray, u16, len, dist_type), + // "u32" => add_test!(AtomicArray, u32, len, dist_type), + // "u64" => add_test!(AtomicArray, u64, len, dist_type), + // "u128" => add_test!(AtomicArray, u128, len, dist_type), + // "usize" => add_test!(AtomicArray, usize, len, 
dist_type), + // "i8" => add_test!(AtomicArray, i8, len, dist_type), + // "i16" => add_test!(AtomicArray, i16, len, dist_type), + // "i32" => add_test!(AtomicArray, i32, len, dist_type), + // "i64" => add_test!(AtomicArray, i64, len, dist_type), + // "i128" => add_test!(AtomicArray, i128, len, dist_type), + // "isize" => add_test!(AtomicArray, isize, len, dist_type), + // "f32" => add_test!(AtomicArray, f32, len, dist_type), "f64" => add_test!(AtomicArray, f64, len, dist_type), "input" => input_test!(AtomicArray, len, dist_type), _ => eprintln!("unsupported element type"), }, - "LocalLockArray" => match elem.as_str() { - "u8" => add_test!(LocalLockArray, u8, len, dist_type), - "u16" => add_test!(LocalLockArray, u16, len, dist_type), - "u32" => add_test!(LocalLockArray, u32, len, dist_type), - "u64" => add_test!(LocalLockArray, u64, len, dist_type), - "u128" => add_test!(LocalLockArray, u128, len, dist_type), - "usize" => add_test!(LocalLockArray, usize, len, dist_type), - "i8" => add_test!(LocalLockArray, i8, len, dist_type), - "i16" => add_test!(LocalLockArray, i16, len, dist_type), - "i32" => add_test!(LocalLockArray, i32, len, dist_type), - "i64" => add_test!(LocalLockArray, i64, len, dist_type), - "i128" => add_test!(LocalLockArray, i128, len, dist_type), - "isize" => add_test!(LocalLockArray, isize, len, dist_type), - "f32" => add_test!(LocalLockArray, f32, len, dist_type), - "f64" => add_test!(LocalLockArray, f64, len, dist_type), - "input" => input_test!(LocalLockArray, len, dist_type), - _ => eprintln!("unsupported element type"), - }, - "GlobalLockArray" => match elem.as_str() { - "u8" => add_test!(GlobalLockArray, u8, len, dist_type), - "u16" => add_test!(GlobalLockArray, u16, len, dist_type), - "u32" => add_test!(GlobalLockArray, u32, len, dist_type), - "u64" => add_test!(GlobalLockArray, u64, len, dist_type), - "u128" => add_test!(GlobalLockArray, u128, len, dist_type), - "usize" => add_test!(GlobalLockArray, usize, len, dist_type), - "i8" => add_test!(GlobalLockArray, i8, len, dist_type), - "i16" => add_test!(GlobalLockArray, i16, len, dist_type), - "i32" => add_test!(GlobalLockArray, i32, len, dist_type), - "i64" => add_test!(GlobalLockArray, i64, len, dist_type), - "i128" => add_test!(GlobalLockArray, i128, len, dist_type), - "isize" => add_test!(GlobalLockArray, isize, len, dist_type), - "f32" => add_test!(GlobalLockArray, f32, len, dist_type), - "f64" => add_test!(GlobalLockArray, f64, len, dist_type), - "input" => input_test!(GlobalLockArray, len, dist_type), - _ => {} //eprintln!("unsupported element type"), - }, + // "LocalLockArray" => match elem.as_str() { + // "u8" => add_test!(LocalLockArray, u8, len, dist_type), + // "u16" => add_test!(LocalLockArray, u16, len, dist_type), + // "u32" => add_test!(LocalLockArray, u32, len, dist_type), + // "u64" => add_test!(LocalLockArray, u64, len, dist_type), + // "u128" => add_test!(LocalLockArray, u128, len, dist_type), + // "usize" => add_test!(LocalLockArray, usize, len, dist_type), + // "i8" => add_test!(LocalLockArray, i8, len, dist_type), + // "i16" => add_test!(LocalLockArray, i16, len, dist_type), + // "i32" => add_test!(LocalLockArray, i32, len, dist_type), + // "i64" => add_test!(LocalLockArray, i64, len, dist_type), + // "i128" => add_test!(LocalLockArray, i128, len, dist_type), + // "isize" => add_test!(LocalLockArray, isize, len, dist_type), + // "f32" => add_test!(LocalLockArray, f32, len, dist_type), + // "f64" => add_test!(LocalLockArray, f64, len, dist_type), + // "input" => input_test!(LocalLockArray, len, 
dist_type), + // _ => eprintln!("unsupported element type"), + // }, + // "GlobalLockArray" => match elem.as_str() { + // "u8" => add_test!(GlobalLockArray, u8, len, dist_type), + // "u16" => add_test!(GlobalLockArray, u16, len, dist_type), + // "u32" => add_test!(GlobalLockArray, u32, len, dist_type), + // "u64" => add_test!(GlobalLockArray, u64, len, dist_type), + // "u128" => add_test!(GlobalLockArray, u128, len, dist_type), + // "usize" => add_test!(GlobalLockArray, usize, len, dist_type), + // "i8" => add_test!(GlobalLockArray, i8, len, dist_type), + // "i16" => add_test!(GlobalLockArray, i16, len, dist_type), + // "i32" => add_test!(GlobalLockArray, i32, len, dist_type), + // "i64" => add_test!(GlobalLockArray, i64, len, dist_type), + // "i128" => add_test!(GlobalLockArray, i128, len, dist_type), + // "isize" => add_test!(GlobalLockArray, isize, len, dist_type), + // "f32" => add_test!(GlobalLockArray, f32, len, dist_type), + // "f64" => add_test!(GlobalLockArray, f64, len, dist_type), + // "input" => input_test!(GlobalLockArray, len, dist_type), + // _ => {} //eprintln!("unsupported element type"), + // }, _ => eprintln!("unsupported array type"), } } From e6694c81e4af01fa785e8937e4f6e27738023b11 Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Fri, 9 Aug 2024 12:55:02 -0700 Subject: [PATCH 069/116] clean up some debugging leftovers --- examples/misc/dist_hashmap.rs | 117 ++++++++++++++++++++++ src/active_messaging/prelude.rs | 4 +- src/array.rs | 4 +- tests/array/arithmetic_ops/add_test.rs | 132 ++++++++++++------------- 4 files changed, 187 insertions(+), 70 deletions(-) create mode 100644 examples/misc/dist_hashmap.rs diff --git a/examples/misc/dist_hashmap.rs b/examples/misc/dist_hashmap.rs new file mode 100644 index 00000000..e04d93c8 --- /dev/null +++ b/examples/misc/dist_hashmap.rs @@ -0,0 +1,117 @@ +use lamellar::active_messaging::prelude::*; +use lamellar::darc::prelude::*; +use serde::{Deserialize, Serialize}; + +use std::collections::HashMap; +use std::future::Future; +use std::sync::Arc; + +#[derive(Clone)] +struct DistHashMap { + num_pes: usize, + team: Arc, + data: LocalRwDarc>, //unforunately we can't use generics here due to constraints imposed by ActiveMessages +} + +impl DistHashMap { + fn new(world: &LamellarWorld, num_pes: usize) -> Self { + let team = world.team(); + DistHashMap { + num_pes, + team: team.clone(), + data: LocalRwDarc::new(team, HashMap::new()).unwrap(), + } + } + + fn get_key_pe(&self, k: i32) -> usize { + k as usize % self.num_pes + } + fn add(&self, k: i32, v: i32) -> impl Future { + let dest_pe = self.get_key_pe(k); + self.team.exec_am_pe( + dest_pe, + DistHashMapOp { + data: self.data.clone(), + cmd: DistCmd::Add(k, v), + }, + ) + } + + fn get(&self, k: i32) -> impl Future { + let dest_pe = self.get_key_pe(k); + self.team.exec_am_pe( + dest_pe, + DistHashMapOp { + data: self.data.clone(), + cmd: DistCmd::Get(k), + }, + ) + } +} + +// this is one way we can implement commands for the distributed hashmap +// a maybe more efficient way to do this would be to create and individual +// active message for each command +// #[AmData(Debug, Clone)] eventually we will be able to do this... 
instead derive serialize and deserialize directly with serde +#[derive(Debug, Clone, Serialize, Deserialize)] +enum DistCmd { + Add(i32, i32), + Get(i32), +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +enum DistCmdResult { + Add, + Get(i32), +} + +#[AmData(Debug, Clone)] +struct DistHashMapOp { + data: LocalRwDarc>, //unforunately we can't use generics here due to constraints imposed by ActiveMessages + cmd: DistCmd, +} + +#[am] +impl LamellarAM for DistHashMapOp { + async fn exec(self) -> DistCmdResult { + match self.cmd { + DistCmd::Add(k, v) => { + self.data.write().await.insert(k, v); + DistCmdResult::Add + } + DistCmd::Get(k) => { + let data = self.data.read().await; + let v = data.get(&k); + println!("{}", v.unwrap()); + DistCmdResult::Get(k) + } + } + } +} + +fn main() { + let world = LamellarWorldBuilder::new().build(); + let my_pe = world.my_pe(); + let num_pes = world.num_pes(); + world.barrier(); + let distributed_map = DistHashMap::new(&world, num_pes); + + for i in 0..10 { + // we can ignore the 'unused' result here because we call 'wait_all' below, otherwise to ensure each request completed we could use 'block_on' + world.block_on(distributed_map.add(i, i)); + } + world.wait_all(); + world.barrier(); + let map_clone = distributed_map.clone(); + world.block_on(async move { + for i in 0..10 { + println!("{}: {:?}", i, map_clone.get(i).await); + } + }); + + world.barrier(); + println!( + "[{my_pe}] local data: {:?}", + distributed_map.data.blocking_read() + ); +} diff --git a/src/active_messaging/prelude.rs b/src/active_messaging/prelude.rs index 09fdffe1..0dcab32a 100644 --- a/src/active_messaging/prelude.rs +++ b/src/active_messaging/prelude.rs @@ -5,8 +5,8 @@ // }; // pub use crate::active_messaging::{ActiveMessaging, LamellarAM, LocalAM}; pub use crate::active_messaging::{ - am, local_am, typed_am_group, ActiveMessaging, AmData, AmGroupData, AmHandle, AmLocalData, - LamellarSerde, + am, local_am, typed_am_group, ActiveMessaging, AmData, AmDist, AmGroupData, AmHandle, + AmLocalData, LamellarAM, LamellarSerde, RemoteActiveMessage, Serde, }; pub use crate::async_trait; diff --git a/src/array.rs b/src/array.rs index 48662d97..aac1d98f 100644 --- a/src/array.rs +++ b/src/array.rs @@ -196,8 +196,8 @@ crate::inventory::collect!(ReduceKey); // lamellar_impl::generate_reductions_for_type_rt!(false, f64); // lamellar_impl::generate_ops_for_type_rt!(false, false, false, f64); -lamellar_impl::generate_reductions_for_type_rt!(false, u128); -lamellar_impl::generate_ops_for_type_rt!(true, false, true, u128); +// lamellar_impl::generate_reductions_for_type_rt!(false, u128); +// lamellar_impl::generate_ops_for_type_rt!(true, false, true, u128); lamellar_impl::generate_reductions_for_type_rt!(true, u8, u16, u32, u64, usize); lamellar_impl::generate_reductions_for_type_rt!(false, u128); diff --git a/tests/array/arithmetic_ops/add_test.rs b/tests/array/arithmetic_ops/add_test.rs index c237d8e2..64c016e2 100644 --- a/tests/array/arithmetic_ops/add_test.rs +++ b/tests/array/arithmetic_ops/add_test.rs @@ -487,78 +487,78 @@ fn main() { }; match array.as_str() { - // "UnsafeArray" => match elem.as_str() { - // "u8" => add_test!(UnsafeArray, u8, len, dist_type), - // "u16" => add_test!(UnsafeArray, u16, len, dist_type), - // "u32" => add_test!(UnsafeArray, u32, len, dist_type), - // "u64" => add_test!(UnsafeArray, u64, len, dist_type), - // "u128" => add_test!(UnsafeArray, u128, len, dist_type), - // "usize" => add_test!(UnsafeArray, usize, len, dist_type), - // "i8" => add_test!(UnsafeArray, 
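The dist_hashmap.rs example above notes that dispatching on a single command enum is only one way to implement the map's operations, and that a dedicated active message per command may be more efficient. A minimal sketch of that alternative follows; it reuses the `LocalRwDarc<HashMap<i32, i32>>` layout from the example, while the `AddAm`/`GetAm` names and the `Option<i32>` return type are assumptions made for illustration and are not part of this patch.

```rust
// Hypothetical sketch: one active message per hashmap command instead of an
// enum-dispatched DistHashMapOp. Types mirror the dist_hashmap.rs example.
use lamellar::active_messaging::prelude::*;
use lamellar::darc::prelude::*;
use std::collections::HashMap;

#[AmData(Debug, Clone)]
struct AddAm {
    data: LocalRwDarc<HashMap<i32, i32>>, // same non-generic layout as the example
    k: i32,
    v: i32,
}

#[am]
impl LamellarAM for AddAm {
    async fn exec(self) {
        // only the insert path, no runtime match on a command enum
        self.data.write().await.insert(self.k, self.v);
    }
}

#[AmData(Debug, Clone)]
struct GetAm {
    data: LocalRwDarc<HashMap<i32, i32>>,
    k: i32,
}

#[am]
impl LamellarAM for GetAm {
    async fn exec(self) -> Option<i32> {
        // returns the looked-up value directly instead of a DistCmdResult wrapper
        self.data.read().await.get(&self.k).copied()
    }
}
```

A caller would then use `team.exec_am_pe(dest_pe, GetAm { data, k })` just as the enum-based version does, trading a little more boilerplate per command for a typed result and no match on the command at execution time.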
i8, len, dist_type), - // "i16" => add_test!(UnsafeArray, i16, len, dist_type), - // "i32" => add_test!(UnsafeArray, i32, len, dist_type), - // "i64" => add_test!(UnsafeArray, i64, len, dist_type), - // "i128" => add_test!(UnsafeArray, i128, len, dist_type), - // "isize" => add_test!(UnsafeArray, isize, len, dist_type), - // "f32" => add_test!(UnsafeArray, f32, len, dist_type), - // "f64" => add_test!(UnsafeArray, f64, len, dist_type), - // "input" => input_test!(UnsafeArray, len, dist_type), - // _ => eprintln!("unsupported element type"), - // }, + "UnsafeArray" => match elem.as_str() { + "u8" => add_test!(UnsafeArray, u8, len, dist_type), + "u16" => add_test!(UnsafeArray, u16, len, dist_type), + "u32" => add_test!(UnsafeArray, u32, len, dist_type), + "u64" => add_test!(UnsafeArray, u64, len, dist_type), + "u128" => add_test!(UnsafeArray, u128, len, dist_type), + "usize" => add_test!(UnsafeArray, usize, len, dist_type), + "i8" => add_test!(UnsafeArray, i8, len, dist_type), + "i16" => add_test!(UnsafeArray, i16, len, dist_type), + "i32" => add_test!(UnsafeArray, i32, len, dist_type), + "i64" => add_test!(UnsafeArray, i64, len, dist_type), + "i128" => add_test!(UnsafeArray, i128, len, dist_type), + "isize" => add_test!(UnsafeArray, isize, len, dist_type), + "f32" => add_test!(UnsafeArray, f32, len, dist_type), + "f64" => add_test!(UnsafeArray, f64, len, dist_type), + "input" => input_test!(UnsafeArray, len, dist_type), + _ => eprintln!("unsupported element type"), + }, "AtomicArray" => match elem.as_str() { "u8" => add_test!(AtomicArray, u8, len, dist_type), - // "u16" => add_test!(AtomicArray, u16, len, dist_type), - // "u32" => add_test!(AtomicArray, u32, len, dist_type), - // "u64" => add_test!(AtomicArray, u64, len, dist_type), - // "u128" => add_test!(AtomicArray, u128, len, dist_type), - // "usize" => add_test!(AtomicArray, usize, len, dist_type), - // "i8" => add_test!(AtomicArray, i8, len, dist_type), - // "i16" => add_test!(AtomicArray, i16, len, dist_type), - // "i32" => add_test!(AtomicArray, i32, len, dist_type), - // "i64" => add_test!(AtomicArray, i64, len, dist_type), - // "i128" => add_test!(AtomicArray, i128, len, dist_type), - // "isize" => add_test!(AtomicArray, isize, len, dist_type), - // "f32" => add_test!(AtomicArray, f32, len, dist_type), + "u16" => add_test!(AtomicArray, u16, len, dist_type), + "u32" => add_test!(AtomicArray, u32, len, dist_type), + "u64" => add_test!(AtomicArray, u64, len, dist_type), + "u128" => add_test!(AtomicArray, u128, len, dist_type), + "usize" => add_test!(AtomicArray, usize, len, dist_type), + "i8" => add_test!(AtomicArray, i8, len, dist_type), + "i16" => add_test!(AtomicArray, i16, len, dist_type), + "i32" => add_test!(AtomicArray, i32, len, dist_type), + "i64" => add_test!(AtomicArray, i64, len, dist_type), + "i128" => add_test!(AtomicArray, i128, len, dist_type), + "isize" => add_test!(AtomicArray, isize, len, dist_type), + "f32" => add_test!(AtomicArray, f32, len, dist_type), "f64" => add_test!(AtomicArray, f64, len, dist_type), "input" => input_test!(AtomicArray, len, dist_type), _ => eprintln!("unsupported element type"), }, - // "LocalLockArray" => match elem.as_str() { - // "u8" => add_test!(LocalLockArray, u8, len, dist_type), - // "u16" => add_test!(LocalLockArray, u16, len, dist_type), - // "u32" => add_test!(LocalLockArray, u32, len, dist_type), - // "u64" => add_test!(LocalLockArray, u64, len, dist_type), - // "u128" => add_test!(LocalLockArray, u128, len, dist_type), - // "usize" => add_test!(LocalLockArray, usize, len, 
dist_type), - // "i8" => add_test!(LocalLockArray, i8, len, dist_type), - // "i16" => add_test!(LocalLockArray, i16, len, dist_type), - // "i32" => add_test!(LocalLockArray, i32, len, dist_type), - // "i64" => add_test!(LocalLockArray, i64, len, dist_type), - // "i128" => add_test!(LocalLockArray, i128, len, dist_type), - // "isize" => add_test!(LocalLockArray, isize, len, dist_type), - // "f32" => add_test!(LocalLockArray, f32, len, dist_type), - // "f64" => add_test!(LocalLockArray, f64, len, dist_type), - // "input" => input_test!(LocalLockArray, len, dist_type), - // _ => eprintln!("unsupported element type"), - // }, - // "GlobalLockArray" => match elem.as_str() { - // "u8" => add_test!(GlobalLockArray, u8, len, dist_type), - // "u16" => add_test!(GlobalLockArray, u16, len, dist_type), - // "u32" => add_test!(GlobalLockArray, u32, len, dist_type), - // "u64" => add_test!(GlobalLockArray, u64, len, dist_type), - // "u128" => add_test!(GlobalLockArray, u128, len, dist_type), - // "usize" => add_test!(GlobalLockArray, usize, len, dist_type), - // "i8" => add_test!(GlobalLockArray, i8, len, dist_type), - // "i16" => add_test!(GlobalLockArray, i16, len, dist_type), - // "i32" => add_test!(GlobalLockArray, i32, len, dist_type), - // "i64" => add_test!(GlobalLockArray, i64, len, dist_type), - // "i128" => add_test!(GlobalLockArray, i128, len, dist_type), - // "isize" => add_test!(GlobalLockArray, isize, len, dist_type), - // "f32" => add_test!(GlobalLockArray, f32, len, dist_type), - // "f64" => add_test!(GlobalLockArray, f64, len, dist_type), - // "input" => input_test!(GlobalLockArray, len, dist_type), - // _ => {} //eprintln!("unsupported element type"), - // }, + "LocalLockArray" => match elem.as_str() { + "u8" => add_test!(LocalLockArray, u8, len, dist_type), + "u16" => add_test!(LocalLockArray, u16, len, dist_type), + "u32" => add_test!(LocalLockArray, u32, len, dist_type), + "u64" => add_test!(LocalLockArray, u64, len, dist_type), + "u128" => add_test!(LocalLockArray, u128, len, dist_type), + "usize" => add_test!(LocalLockArray, usize, len, dist_type), + "i8" => add_test!(LocalLockArray, i8, len, dist_type), + "i16" => add_test!(LocalLockArray, i16, len, dist_type), + "i32" => add_test!(LocalLockArray, i32, len, dist_type), + "i64" => add_test!(LocalLockArray, i64, len, dist_type), + "i128" => add_test!(LocalLockArray, i128, len, dist_type), + "isize" => add_test!(LocalLockArray, isize, len, dist_type), + "f32" => add_test!(LocalLockArray, f32, len, dist_type), + "f64" => add_test!(LocalLockArray, f64, len, dist_type), + "input" => input_test!(LocalLockArray, len, dist_type), + _ => eprintln!("unsupported element type"), + }, + "GlobalLockArray" => match elem.as_str() { + "u8" => add_test!(GlobalLockArray, u8, len, dist_type), + "u16" => add_test!(GlobalLockArray, u16, len, dist_type), + "u32" => add_test!(GlobalLockArray, u32, len, dist_type), + "u64" => add_test!(GlobalLockArray, u64, len, dist_type), + "u128" => add_test!(GlobalLockArray, u128, len, dist_type), + "usize" => add_test!(GlobalLockArray, usize, len, dist_type), + "i8" => add_test!(GlobalLockArray, i8, len, dist_type), + "i16" => add_test!(GlobalLockArray, i16, len, dist_type), + "i32" => add_test!(GlobalLockArray, i32, len, dist_type), + "i64" => add_test!(GlobalLockArray, i64, len, dist_type), + "i128" => add_test!(GlobalLockArray, i128, len, dist_type), + "isize" => add_test!(GlobalLockArray, isize, len, dist_type), + "f32" => add_test!(GlobalLockArray, f32, len, dist_type), + "f64" => add_test!(GlobalLockArray, f64, 
len, dist_type), + "input" => input_test!(GlobalLockArray, len, dist_type), + _ => {} //eprintln!("unsupported element type"), + }, _ => eprintln!("unsupported array type"), } } From 815c081d0ae07526376523909d82ca90aae4382b Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Thu, 19 Sep 2024 10:10:18 -0700 Subject: [PATCH 070/116] fix allocation of arrays on sub teams --- src/array/unsafe.rs | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/src/array/unsafe.rs b/src/array/unsafe.rs index 16ac52bb..e9129c70 100644 --- a/src/array/unsafe.rs +++ b/src/array/unsafe.rs @@ -169,8 +169,15 @@ impl UnsafeArray { per_pe_size += 1 } // println!("new unsafe array {:?} {:?}", elem_per_pe, per_pe_size); - let rmr_t: MemoryRegion = - MemoryRegion::new(per_pe_size, team.lamellae.clone(), AllocationType::Global); + let rmr_t: MemoryRegion = if team.num_world_pes == team.num_pes { + MemoryRegion::new(per_pe_size, team.lamellae.clone(), AllocationType::Global) + } else { + MemoryRegion::new( + per_pe_size, + team.lamellae.clone(), + AllocationType::Sub(team.get_pes()), + ) + }; // let rmr = MemoryRegion::new( // per_pe_size * std::mem::size_of::(), // team.lamellae.clone(), @@ -268,8 +275,15 @@ impl UnsafeArray { if remaining_elems > 0 { per_pe_size += 1 } - let rmr_t: MemoryRegion = - MemoryRegion::new(per_pe_size, team.lamellae.clone(), AllocationType::Global); + let rmr_t: MemoryRegion = if team.num_world_pes == team.num_pes { + MemoryRegion::new(per_pe_size, team.lamellae.clone(), AllocationType::Global) + } else { + MemoryRegion::new( + per_pe_size, + team.lamellae.clone(), + AllocationType::Sub(team.get_pes()), + ) + }; // let rmr = MemoryRegion::new( // per_pe_size * std::mem::size_of::(), // team.lamellae.clone(), @@ -1545,7 +1559,7 @@ impl UnsafeArrayInner { index: usize, ) -> Option<(usize, usize)> { if self.size > index { - let mut global_index = index; + let global_index = index; match self.distribution { Distribution::Block => { let rem_index = self.orig_remaining_elems * (self.orig_elem_per_pe + 1); @@ -1584,7 +1598,7 @@ impl UnsafeArrayInner { pub(crate) fn pe_for_dist_index(&self, index: usize) -> Option { // println!("pe_for_dist_index {index} {}", self.size); if self.size > index { - let mut global_index = index + self.offset; + let global_index = index + self.offset; match self.distribution { Distribution::Block => { @@ -1609,7 +1623,7 @@ impl UnsafeArrayInner { // //#[tracing::instrument(skip_all)] pub(crate) fn pe_full_offset_for_dist_index(&self, pe: usize, index: usize) -> Option { // println!("pe_full_offset_for_dist_index pe {pe} index {index}"); - let mut global_index = self.offset + index; + let global_index = self.offset + index; match self.distribution { Distribution::Block => { let rem_index = self.orig_remaining_elems * (self.orig_elem_per_pe + 1); From 5030d3601315e400c1cca9bd9a62913abf5d7adc Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Thu, 19 Sep 2024 10:10:45 -0700 Subject: [PATCH 071/116] cleanup debug output and warnings --- Cargo.toml | 6 +++++- examples/array_examples/array_am.rs | 1 - examples/darc_examples/darc.rs | 13 ------------- examples/kernels/dft_proxy.rs | 2 -- examples/kernels/parallel_blocked_array_gemm.rs | 1 - impl/Cargo.toml | 2 +- impl/src/lib.rs | 1 - .../distributed_iterator/consumer/for_each.rs | 2 +- src/array/operations/arithmetic.rs | 1 - src/array/unsafe/operations.rs | 1 - src/lamellae/comm.rs | 1 + src/lamellar_task_group.rs | 7 +++---- src/lamellar_team.rs | 6 +++--- tests/array/arithmetic_ops/fetch_add_test.rs | 2 -- tests/array_into.rs | 2 +- 15 files changed, 15 insertions(+), 33 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index c36a6d8d..e52508dd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,7 +12,7 @@ keywords = ["hpc","runtime","pgas","distributed","asynchronous"] categories = ["asynchronous","concurrency", "network-programming","science"] [dependencies] -lamellar-impl = { version = "0.6.0", path = "impl" } +lamellar-impl = { version = "0.7.0", path = "impl" } #rofisys = { version ="0.3", optional = true } rofisys = {git = "https://github.com/pnnl/rofi-sys.git", branch = "master", optional = true} #rofisys = { path = "../rofi-sys-junction", optional = true} @@ -365,6 +365,10 @@ path="examples/array_examples/global_lock_array.rs" name="histo" path="examples/array_examples/histo.rs" +[[example]] +name="single_pe_array" +path="examples/array_examples/single_pe_array.rs" + ##------------ RDMA Examples -----------------## [[example]] name="rdma_put" diff --git a/examples/array_examples/array_am.rs b/examples/array_examples/array_am.rs index c0a92fac..72d29de3 100644 --- a/examples/array_examples/array_am.rs +++ b/examples/array_examples/array_am.rs @@ -79,7 +79,6 @@ fn main() { world.block_on(unsafe { array.put(0, &local_mem_region) }); } - println!("here!!! {:?}", my_pe); array.print(); for i in unsafe { array.local_as_slice() } { while *i != 255_u8 { diff --git a/examples/darc_examples/darc.rs b/examples/darc_examples/darc.rs index 725e1b8a..08beb185 100644 --- a/examples/darc_examples/darc.rs +++ b/examples/darc_examples/darc.rs @@ -49,18 +49,13 @@ fn main() { let my_pe = world.my_pe(); let num_pes = world.num_pes(); - println!("here 0"); - let even_team = world.create_team_from_arch(StridedArch::new( 0, // start pe 2, // stride (num_pes as f64 / 2.0).ceil() as usize, //num pes in team )); - println!("here 1"); - let global_darc = GlobalRwDarc::new(world.team(), 0).unwrap(); - println!("here 2"); let read_lock = global_darc.blocking_read(); println!("I have the read lock!!!! {:?}", my_pe); drop(read_lock); @@ -68,7 +63,6 @@ fn main() { println!("I have the write lock!!!! 
{:?}", my_pe); std::thread::sleep(std::time::Duration::from_secs(1)); drop(write_lock); - println!("here3"); //---- let local_darc = LocalRwDarc::new(world.team(), 10).unwrap(); println!("created new local rw"); @@ -81,11 +75,8 @@ fn main() { }, }, }; - // println!("here 4"); let darc1 = Darc::new(world.team(), 10).unwrap(); - // println!("here 5"); let darc2 = Darc::new(world.team(), 20).unwrap(); - // println!("here 6"); if let Some(team) = even_team { let team_darc = Darc::new(team.clone(), AtomicUsize::new(10)); let mut tg = typed_am_group!(DarcAm, team.clone()); @@ -103,20 +94,16 @@ fn main() { darc_tuple: (darc1.clone(), darc2.clone()), my_arc: Darc::new(team.clone(), Arc::new(0)).unwrap(), }; - println!("here 7"); let _ = team.exec_am_pe(0, darc_am.clone()).spawn(); let _ = team.exec_am_all(darc_am.clone()).spawn(); tg.add_am_pe(0, darc_am.clone()); tg.add_am_all(darc_am); team.block_on(tg.exec()); - println!("here 8"); } else { - // println!("here"); *local_darc.blocking_write() += 1; } } // -------- - println!("here 9"); // drop(darc1); // drop(darc2); diff --git a/examples/kernels/dft_proxy.rs b/examples/kernels/dft_proxy.rs index 4f98e80d..e6b56040 100644 --- a/examples/kernels/dft_proxy.rs +++ b/examples/kernels/dft_proxy.rs @@ -842,9 +842,7 @@ fn main() { .for_each(|elem| *elem = 0.0) .block(); } - println!("here 0"); full_spectrum_array.wait_all(); - println!("here 1"); full_spectrum_array.barrier(); times[ti].push(dft_lamellar_array_opt_test( full_signal_array.clone(), diff --git a/examples/kernels/parallel_blocked_array_gemm.rs b/examples/kernels/parallel_blocked_array_gemm.rs index cd345e23..94f5f9dd 100644 --- a/examples/kernels/parallel_blocked_array_gemm.rs +++ b/examples/kernels/parallel_blocked_array_gemm.rs @@ -114,7 +114,6 @@ fn main() { .take(blocksize) // we only need to take blocksize columns .collect::>() .await; //gather local memory regions containing each columns data - // println!("here"); //need to store the submatrix in a contiguous memory segment for use with the MatrixMultiply library let mut b_block_vec = vec![0.0; blocksize * blocksize]; for (j, col) in b_block.iter().enumerate() { diff --git a/impl/Cargo.toml b/impl/Cargo.toml index 7e4bce4c..35f66007 100644 --- a/impl/Cargo.toml +++ b/impl/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "lamellar-impl" -version = "0.6.1" +version = "0.7.0" authors = ["Ryan D. Friese ", "Roberto Gioiosa ", "Joseph Cottam ","Greg Roek ","Erdal Mutlu "] edition = "2021" description = "Lamellar is an asynchronous tasking runtime for HPC systems developed in RUST." diff --git a/impl/src/lib.rs b/impl/src/lib.rs index 0c7b55f5..4d37d669 100644 --- a/impl/src/lib.rs +++ b/impl/src/lib.rs @@ -222,7 +222,6 @@ fn check_for_am_group(args: &Punctuated) -> bool { pub fn AmData(args: TokenStream, input: TokenStream) -> TokenStream { let args = parse_macro_input!(args with Punctuated::parse_terminated); - // println!("here"); derive_am_data(input, args, quote! 
{__lamellar}, false, false, false) } diff --git a/src/array/iterator/distributed_iterator/consumer/for_each.rs b/src/array/iterator/distributed_iterator/consumer/for_each.rs index 17a2b6e8..24c3d15e 100644 --- a/src/array/iterator/distributed_iterator/consumer/for_each.rs +++ b/src/array/iterator/distributed_iterator/consumer/for_each.rs @@ -276,7 +276,7 @@ impl Future for DistIterForEachHandle { } } } - StateProj::Reqs(inner, barrier_id) => { + StateProj::Reqs(inner, _barrier_id) => { // println!( // "reqs remaining {:?} barrier_id {:?}", // inner.reqs.len(), diff --git a/src/array/operations/arithmetic.rs b/src/array/operations/arithmetic.rs index 720725d7..93dda91d 100644 --- a/src/array/operations/arithmetic.rs +++ b/src/array/operations/arithmetic.rs @@ -238,7 +238,6 @@ pub trait ArithmeticOps: private::LamellarArrayP index: impl OpInput<'a, usize>, val: impl OpInput<'a, T>, ) -> ArrayFetchBatchOpHandle { - // println!("here in batch_fetch_add"); self.inner_array().initiate_batch_fetch_op_2( val, index, diff --git a/src/array/unsafe/operations.rs b/src/array/unsafe/operations.rs index 60cf85fe..ef724b8c 100644 --- a/src/array/unsafe/operations.rs +++ b/src/array/unsafe/operations.rs @@ -360,7 +360,6 @@ impl UnsafeArray { op: ArrayOpCmd, byte_array: LamellarByteArray, ) -> ArrayFetchBatchOpHandle { - // println!("here in batch fetch op 2"); let (indices, i_len) = index.as_op_input(); let (vals, v_len) = val.as_op_input(); let max_local_size = (0..self.num_pes()) diff --git a/src/lamellae/comm.rs b/src/lamellae/comm.rs index 70bbdb84..3dd6f404 100644 --- a/src/lamellae/comm.rs +++ b/src/lamellae/comm.rs @@ -120,6 +120,7 @@ pub(crate) trait CommOps { fn iget(&self, pe: usize, src_addr: usize, dst_addr: &mut [T]); // fn iget_relative(&self, pe: usize, src_addr: usize, dst_addr: &mut [T]); #[allow(non_snake_case)] + #[allow(dead_code)] fn MB_sent(&self) -> f64; fn force_shutdown(&self); } diff --git a/src/lamellar_task_group.rs b/src/lamellar_task_group.rs index ae76e751..65f9cd98 100644 --- a/src/lamellar_task_group.rs +++ b/src/lamellar_task_group.rs @@ -18,8 +18,7 @@ use futures_util::future::join_all; use futures_util::{Future, StreamExt}; use parking_lot::Mutex; use pin_project::{pin_project, pinned_drop}; -use std::collections::{BTreeMap, HashMap, HashSet}; -use std::hash::Hash; +use std::collections::{BTreeMap, HashMap}; use std::marker::PhantomData; use std::pin::Pin; use std::sync::atomic::{AtomicUsize, Ordering}; @@ -580,7 +579,7 @@ pub struct LamellarTaskGroup { rt_multi_req: Arc, //for exec_all requests rt_local_req: Arc, //for exec_local requests - // pub(crate) pending_reqs: Arc>>, + // pub(crate) pending_reqs: Arc>>, } impl ActiveMessaging for LamellarTaskGroup { @@ -674,7 +673,7 @@ impl LamellarTaskGroup { let team = team.into().team.clone(); let counters = AMCounters::new(); let cnt = Arc::new(AtomicUsize::new(1)); //this lamellarTaskGroup instance represents 1 handle (even though we maintain a single and multi req handle) - // let pending_reqs = Arc::new(Mutex::new(HashSet::new())); + // let pending_reqs = Arc::new(Mutex::new(HashSet::new())); let req = Arc::new(TaskGroupAmHandleInner { cnt: cnt.clone(), data: Mutex::new(HashMap::new()), diff --git a/src/lamellar_team.rs b/src/lamellar_team.rs index 920e70c1..28a2a189 100644 --- a/src/lamellar_team.rs +++ b/src/lamellar_team.rs @@ -721,9 +721,9 @@ impl From>> for LamellarTeamRemotePtr { } } -// Internal Runtime handle to a lamellar team -// used by proc macros -// users should never need to use this +/// Internal Runtime 
handle to a lamellar team +/// this is typically used by proc macros (hence why it is public) +/// end users should never use this directly and should instead use the [LamellarTeam] and/or [LamellarWorld] structs #[doc(hidden)] pub struct LamellarTeamRT { #[allow(dead_code)] diff --git a/tests/array/arithmetic_ops/fetch_add_test.rs b/tests/array/arithmetic_ops/fetch_add_test.rs index 489c7026..8e348c40 100644 --- a/tests/array/arithmetic_ops/fetch_add_test.rs +++ b/tests/array/arithmetic_ops/fetch_add_test.rs @@ -387,7 +387,6 @@ macro_rules! check_results { req_cnt+=1; } } - // println!("here"); #[allow(unused_unsafe)] for (i, elem) in unsafe { buffered_onesided_iter!($array_ty,$array).into_iter().enumerate() }{ let val = *elem; @@ -410,7 +409,6 @@ macro_rules! check_results { break; } } - // println!("here2"); $array.barrier(); // let init_val = 0; initialize_array2!($array_ty, $array, init_val); diff --git a/tests/array_into.rs b/tests/array_into.rs index 72d7bf20..9a47c4c6 100644 --- a/tests/array_into.rs +++ b/tests/array_into.rs @@ -48,7 +48,7 @@ macro_rules! create_into_tests { create_into_tests!(( UnsafeArray, - LocalOnlyArray, + // LocalOnlyArray, ReadOnlyArray, AtomicArray, LocalLockArray, From 313c13ef67ef696df738a0780d1bdd47c5c15e18 Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Thu, 19 Sep 2024 16:29:28 -0700 Subject: [PATCH 072/116] fix off by one error when getting a single 'int' sized message, fix a sanitizer use after scope warning in native atomic --- src/array/native_atomic.rs | 260 ++++++++++++++++++--------------- src/lamellae/rofi/rofi_comm.rs | 12 +- src/memregion.rs | 11 +- 3 files changed, 163 insertions(+), 120 deletions(-) diff --git a/src/array/native_atomic.rs b/src/array/native_atomic.rs index ff664ea4..b0cc6481 100644 --- a/src/array/native_atomic.rs +++ b/src/array/native_atomic.rs @@ -244,39 +244,45 @@ macro_rules!
impl_shift { ($self:ident,$op:tt,$val:ident) => { // mul, div unsafe { - *match $self.array.orig_t { + match $self.array.orig_t { //deref to the original type NativeAtomicType::I8 => { - &compare_exchange_op!(i8, AtomicI8, $self, $val, $op) as *const i8 as *mut T + *(&compare_exchange_op!(i8, AtomicI8, $self, $val, $op) as *const i8 as *mut T) } NativeAtomicType::I16 => { - &compare_exchange_op!(i16, AtomicI16, $self, $val, $op) as *const i16 as *mut T + *(&compare_exchange_op!(i16, AtomicI16, $self, $val, $op) as *const i16 + as *mut T) } NativeAtomicType::I32 => { - &compare_exchange_op!(i32, AtomicI32, $self, $val, $op) as *const i32 as *mut T + *(&compare_exchange_op!(i32, AtomicI32, $self, $val, $op) as *const i32 + as *mut T) } NativeAtomicType::I64 => { - &compare_exchange_op!(i64, AtomicI64, $self, $val, $op) as *const i64 as *mut T + *(&compare_exchange_op!(i64, AtomicI64, $self, $val, $op) as *const i64 + as *mut T) } NativeAtomicType::Isize => { - &compare_exchange_op!(isize, AtomicIsize, $self, $val, $op) as *const isize - as *mut T + *(&compare_exchange_op!(isize, AtomicIsize, $self, $val, $op) as *const isize + as *mut T) } NativeAtomicType::U8 => { - &compare_exchange_op!(u8, AtomicU8, $self, $val, $op) as *const u8 as *mut T + *(&compare_exchange_op!(u8, AtomicU8, $self, $val, $op) as *const u8 as *mut T) } NativeAtomicType::U16 => { - &compare_exchange_op!(u16, AtomicU16, $self, $val, $op) as *const u16 as *mut T + *(&compare_exchange_op!(u16, AtomicU16, $self, $val, $op) as *const u16 + as *mut T) } NativeAtomicType::U32 => { - &compare_exchange_op!(u32, AtomicU32, $self, $val, $op) as *const u32 as *mut T + *(&compare_exchange_op!(u32, AtomicU32, $self, $val, $op) as *const u32 + as *mut T) } NativeAtomicType::U64 => { - &compare_exchange_op!(u64, AtomicU64, $self, $val, $op) as *const u64 as *mut T + *(&compare_exchange_op!(u64, AtomicU64, $self, $val, $op) as *const u64 + as *mut T) } NativeAtomicType::Usize => { - &compare_exchange_op!(usize, AtomicUsize, $self, $val, $op) as *const usize - as *mut T + *(&compare_exchange_op!(usize, AtomicUsize, $self, $val, $op) as *const usize + as *mut T) } } } @@ -286,39 +292,45 @@ macro_rules! 
impl_mul_div { ($self:ident,$op:tt,$val:ident) => { // mul, div unsafe { - *match $self.array.orig_t { + match $self.array.orig_t { //deref to the original type NativeAtomicType::I8 => { - &compare_exchange_op!(i8, AtomicI8, $self, $val, $op) as *const i8 as *mut T + *(&compare_exchange_op!(i8, AtomicI8, $self, $val, $op) as *const i8 as *mut T) } NativeAtomicType::I16 => { - &compare_exchange_op!(i16, AtomicI16, $self, $val, $op) as *const i16 as *mut T + *(&compare_exchange_op!(i16, AtomicI16, $self, $val, $op) as *const i16 + as *mut T) } NativeAtomicType::I32 => { - &compare_exchange_op!(i32, AtomicI32, $self, $val, $op) as *const i32 as *mut T + *(&compare_exchange_op!(i32, AtomicI32, $self, $val, $op) as *const i32 + as *mut T) } NativeAtomicType::I64 => { - &compare_exchange_op!(i64, AtomicI64, $self, $val, $op) as *const i64 as *mut T + *(&compare_exchange_op!(i64, AtomicI64, $self, $val, $op) as *const i64 + as *mut T) } NativeAtomicType::Isize => { - &compare_exchange_op!(isize, AtomicIsize, $self, $val, $op) as *const isize - as *mut T + *(&compare_exchange_op!(isize, AtomicIsize, $self, $val, $op) as *const isize + as *mut T) } NativeAtomicType::U8 => { - &compare_exchange_op!(u8, AtomicU8, $self, $val, $op) as *const u8 as *mut T + *(&compare_exchange_op!(u8, AtomicU8, $self, $val, $op) as *const u8 as *mut T) } NativeAtomicType::U16 => { - &compare_exchange_op!(u16, AtomicU16, $self, $val, $op) as *const u16 as *mut T + *(&compare_exchange_op!(u16, AtomicU16, $self, $val, $op) as *const u16 + as *mut T) } NativeAtomicType::U32 => { - &compare_exchange_op!(u32, AtomicU32, $self, $val, $op) as *const u32 as *mut T + *(&compare_exchange_op!(u32, AtomicU32, $self, $val, $op) as *const u32 + as *mut T) } NativeAtomicType::U64 => { - &compare_exchange_op!(u64, AtomicU64, $self, $val, $op) as *const u64 as *mut T + *(&compare_exchange_op!(u64, AtomicU64, $self, $val, $op) as *const u64 + as *mut T) } NativeAtomicType::Usize => { - &compare_exchange_op!(usize, AtomicUsize, $self, $val, $op) as *const usize - as *mut T + *(&compare_exchange_op!(usize, AtomicUsize, $self, $val, $op) as *const usize + as *mut T) } } } @@ -329,52 +341,58 @@ macro_rules! 
impl_add_sub_and_or_xor { //add,sub,and,or (returns value) unsafe { let slice = $self.array.__local_as_mut_slice(); - *match $self.array.orig_t { + match $self.array.orig_t { //deref to the original type - NativeAtomicType::I8 => &slice_as_atomic!(i8, AtomicI8, slice)[$self.local_index] - .$op(as_type!($val, i8), Ordering::SeqCst) - as *const i8 as *mut T, + NativeAtomicType::I8 => { + *(&slice_as_atomic!(i8, AtomicI8, slice)[$self.local_index] + .$op(as_type!($val, i8), Ordering::SeqCst) as *const i8 + as *mut T) + } NativeAtomicType::I16 => { - &slice_as_atomic!(i16, AtomicI16, slice)[$self.local_index] + *(&slice_as_atomic!(i16, AtomicI16, slice)[$self.local_index] .$op(as_type!($val, i16), Ordering::SeqCst) as *const i16 - as *mut T + as *mut T) } NativeAtomicType::I32 => { - &slice_as_atomic!(i32, AtomicI32, slice)[$self.local_index] + *(&slice_as_atomic!(i32, AtomicI32, slice)[$self.local_index] .$op(as_type!($val, i32), Ordering::SeqCst) as *const i32 - as *mut T + as *mut T) } NativeAtomicType::I64 => { - &slice_as_atomic!(i64, AtomicI64, slice)[$self.local_index] + *(&slice_as_atomic!(i64, AtomicI64, slice)[$self.local_index] .$op(as_type!($val, i64), Ordering::SeqCst) as *const i64 - as *mut T - } - NativeAtomicType::Isize => &slice_as_atomic!(isize, AtomicIsize, slice) - [$self.local_index] - .$op(as_type!($val, isize), Ordering::SeqCst) - as *const isize as *mut T, - NativeAtomicType::U8 => &slice_as_atomic!(u8, AtomicU8, slice)[$self.local_index] - .$op(as_type!($val, u8), Ordering::SeqCst) - as *const u8 as *mut T, + as *mut T) + } + NativeAtomicType::Isize => { + *(&slice_as_atomic!(isize, AtomicIsize, slice)[$self.local_index] + .$op(as_type!($val, isize), Ordering::SeqCst) + as *const isize as *mut T) + } + NativeAtomicType::U8 => { + *(&slice_as_atomic!(u8, AtomicU8, slice)[$self.local_index] + .$op(as_type!($val, u8), Ordering::SeqCst) as *const u8 + as *mut T) + } NativeAtomicType::U16 => { - &slice_as_atomic!(u16, AtomicU16, slice)[$self.local_index] + *(&slice_as_atomic!(u16, AtomicU16, slice)[$self.local_index] .$op(as_type!($val, u16), Ordering::SeqCst) as *const u16 - as *mut T + as *mut T) } NativeAtomicType::U32 => { - &slice_as_atomic!(u32, AtomicU32, slice)[$self.local_index] + *(&slice_as_atomic!(u32, AtomicU32, slice)[$self.local_index] .$op(as_type!($val, u32), Ordering::SeqCst) as *const u32 - as *mut T + as *mut T) } NativeAtomicType::U64 => { - &slice_as_atomic!(u64, AtomicU64, slice)[$self.local_index] + *(&slice_as_atomic!(u64, AtomicU64, slice)[$self.local_index] .$op(as_type!($val, u64), Ordering::SeqCst) as *const u64 - as *mut T + as *mut T) + } + NativeAtomicType::Usize => { + *(&slice_as_atomic!(usize, AtomicUsize, slice)[$self.local_index] + .$op(as_type!($val, usize), Ordering::SeqCst) + as *const usize as *mut T) } - NativeAtomicType::Usize => &slice_as_atomic!(usize, AtomicUsize, slice) - [$self.local_index] - .$op(as_type!($val, usize), Ordering::SeqCst) - as *const usize as *mut T, } } }; @@ -486,37 +504,39 @@ macro_rules! 
impl_swap { //swap unsafe { - *match $self.array.orig_t { + match $self.array.orig_t { //deref to the original type NativeAtomicType::I8 => { - &compare_exchange_op!(i8, AtomicI8, $self, $val) as *const i8 as *mut T + *(&compare_exchange_op!(i8, AtomicI8, $self, $val) as *const i8 as *mut T) } NativeAtomicType::I16 => { - &compare_exchange_op!(i16, AtomicI16, $self, $val) as *const i16 as *mut T + *(&compare_exchange_op!(i16, AtomicI16, $self, $val) as *const i16 as *mut T) } NativeAtomicType::I32 => { - &compare_exchange_op!(i32, AtomicI32, $self, $val) as *const i32 as *mut T + *(&compare_exchange_op!(i32, AtomicI32, $self, $val) as *const i32 as *mut T) } NativeAtomicType::I64 => { - &compare_exchange_op!(i64, AtomicI64, $self, $val) as *const i64 as *mut T + *(&compare_exchange_op!(i64, AtomicI64, $self, $val) as *const i64 as *mut T) } NativeAtomicType::Isize => { - &compare_exchange_op!(isize, AtomicIsize, $self, $val) as *const isize as *mut T + *(&compare_exchange_op!(isize, AtomicIsize, $self, $val) as *const isize + as *mut T) } NativeAtomicType::U8 => { - &compare_exchange_op!(u8, AtomicU8, $self, $val) as *const u8 as *mut T + *(&compare_exchange_op!(u8, AtomicU8, $self, $val) as *const u8 as *mut T) } NativeAtomicType::U16 => { - &compare_exchange_op!(u16, AtomicU16, $self, $val) as *const u16 as *mut T + *(&compare_exchange_op!(u16, AtomicU16, $self, $val) as *const u16 as *mut T) } NativeAtomicType::U32 => { - &compare_exchange_op!(u32, AtomicU32, $self, $val) as *const u32 as *mut T + *(&compare_exchange_op!(u32, AtomicU32, $self, $val) as *const u32 as *mut T) } NativeAtomicType::U64 => { - &compare_exchange_op!(u64, AtomicU64, $self, $val) as *const u64 as *mut T + *(&compare_exchange_op!(u64, AtomicU64, $self, $val) as *const u64 as *mut T) } NativeAtomicType::Usize => { - &compare_exchange_op!(usize, AtomicUsize, $self, $val) as *const usize as *mut T + *(&compare_exchange_op!(usize, AtomicUsize, $self, $val) as *const usize + as *mut T) } } } @@ -526,39 +546,47 @@ macro_rules! impl_swap { macro_rules! 
impl_compare_exchange { ($self:ident,$old:ident,$val:ident) => { unsafe { - *match $self.array.orig_t { + match $self.array.orig_t { //deref to the original type - NativeAtomicType::I8 => &compare_exchange_op!(i8, AtomicI8, $self, $old, $val) - as *const Result - as *mut Result, - NativeAtomicType::I16 => &compare_exchange_op!(i16, AtomicI16, $self, $old, $val) - as *const Result - as *mut Result, - NativeAtomicType::I32 => &compare_exchange_op!(i32, AtomicI32, $self, $old, $val) - as *const Result - as *mut Result, - NativeAtomicType::I64 => &compare_exchange_op!(i64, AtomicI64, $self, $old, $val) - as *const Result - as *mut Result, + NativeAtomicType::I8 => { + *(&compare_exchange_op!(i8, AtomicI8, $self, $old, $val) + as *const Result as *mut Result) + } + NativeAtomicType::I16 => { + *(&compare_exchange_op!(i16, AtomicI16, $self, $old, $val) + as *const Result as *mut Result) + } + NativeAtomicType::I32 => { + *(&compare_exchange_op!(i32, AtomicI32, $self, $old, $val) + as *const Result as *mut Result) + } + NativeAtomicType::I64 => { + *(&compare_exchange_op!(i64, AtomicI64, $self, $old, $val) + as *const Result as *mut Result) + } NativeAtomicType::Isize => { - &compare_exchange_op!(isize, AtomicIsize, $self, $old, $val) - as *const Result as *mut Result - } - NativeAtomicType::U8 => &compare_exchange_op!(u8, AtomicU8, $self, $old, $val) - as *const Result - as *mut Result, - NativeAtomicType::U16 => &compare_exchange_op!(u16, AtomicU16, $self, $old, $val) - as *const Result - as *mut Result, - NativeAtomicType::U32 => &compare_exchange_op!(u32, AtomicU32, $self, $old, $val) - as *const Result - as *mut Result, - NativeAtomicType::U64 => &compare_exchange_op!(u64, AtomicU64, $self, $old, $val) - as *const Result - as *mut Result, + *(&compare_exchange_op!(isize, AtomicIsize, $self, $old, $val) + as *const Result as *mut Result) + } + NativeAtomicType::U8 => { + *(&compare_exchange_op!(u8, AtomicU8, $self, $old, $val) + as *const Result as *mut Result) + } + NativeAtomicType::U16 => { + *(&compare_exchange_op!(u16, AtomicU16, $self, $old, $val) + as *const Result as *mut Result) + } + NativeAtomicType::U32 => { + *(&compare_exchange_op!(u32, AtomicU32, $self, $old, $val) + as *const Result as *mut Result) + } + NativeAtomicType::U64 => { + *(&compare_exchange_op!(u64, AtomicU64, $self, $old, $val) + as *const Result as *mut Result) + } NativeAtomicType::Usize => { - &compare_exchange_op!(usize, AtomicUsize, $self, $old, $val) - as *const Result as *mut Result + *(&compare_exchange_op!(usize, AtomicUsize, $self, $old, $val) + as *const Result as *mut Result) } } } @@ -568,47 +596,47 @@ macro_rules! impl_compare_exchange { macro_rules! 
impl_compare_exchange_eps { ($self:ident,$old:ident,$val:ident,$eps:ident) => { unsafe { - *match $self.array.orig_t { + match $self.array.orig_t { //deref to the original type NativeAtomicType::I8 => { - &compare_exchange_op!(i8, AtomicI8, $self, $old, $val, $eps) - as *const Result as *mut Result + *(&compare_exchange_op!(i8, AtomicI8, $self, $old, $val, $eps) + as *const Result as *mut Result) } NativeAtomicType::I16 => { - &compare_exchange_op!(i16, AtomicI16, $self, $old, $val, $eps) - as *const Result as *mut Result + *(&compare_exchange_op!(i16, AtomicI16, $self, $old, $val, $eps) + as *const Result as *mut Result) } NativeAtomicType::I32 => { - &compare_exchange_op!(i32, AtomicI32, $self, $old, $val, $eps) - as *const Result as *mut Result + *(&compare_exchange_op!(i32, AtomicI32, $self, $old, $val, $eps) + as *const Result as *mut Result) } NativeAtomicType::I64 => { - &compare_exchange_op!(i64, AtomicI64, $self, $old, $val, $eps) - as *const Result as *mut Result + *(&compare_exchange_op!(i64, AtomicI64, $self, $old, $val, $eps) + as *const Result as *mut Result) } NativeAtomicType::Isize => { - &compare_exchange_op!(isize, AtomicIsize, $self, $old, $val, $eps) - as *const Result as *mut Result + *(&compare_exchange_op!(isize, AtomicIsize, $self, $old, $val, $eps) + as *const Result as *mut Result) } NativeAtomicType::U8 => { - &compare_exchange_op!(u8, AtomicU8, $self, $old, $val, $eps) - as *const Result as *mut Result + *(&compare_exchange_op!(u8, AtomicU8, $self, $old, $val, $eps) + as *const Result as *mut Result) } NativeAtomicType::U16 => { - &compare_exchange_op!(u16, AtomicU16, $self, $old, $val, $eps) - as *const Result as *mut Result + *(&compare_exchange_op!(u16, AtomicU16, $self, $old, $val, $eps) + as *const Result as *mut Result) } NativeAtomicType::U32 => { - &compare_exchange_op!(u32, AtomicU32, $self, $old, $val, $eps) - as *const Result as *mut Result + *(&compare_exchange_op!(u32, AtomicU32, $self, $old, $val, $eps) + as *const Result as *mut Result) } NativeAtomicType::U64 => { - &compare_exchange_op!(u64, AtomicU64, $self, $old, $val, $eps) - as *const Result as *mut Result + *(&compare_exchange_op!(u64, AtomicU64, $self, $old, $val, $eps) + as *const Result as *mut Result) } NativeAtomicType::Usize => { - &compare_exchange_op!(usize, AtomicUsize, $self, $old, $val, $eps) - as *const Result as *mut Result + *(&compare_exchange_op!(usize, AtomicUsize, $self, $old, $val, $eps) + as *const Result as *mut Result) } } } diff --git a/src/lamellae/rofi/rofi_comm.rs b/src/lamellae/rofi/rofi_comm.rs index 94473cf7..b6025ebc 100644 --- a/src/lamellae/rofi/rofi_comm.rs +++ b/src/lamellae/rofi/rofi_comm.rs @@ -125,6 +125,8 @@ impl RofiComm { let num_r = (dst_addr.len() * std::mem::size_of::()) / std::mem::size_of::(); let r_ptr = dst_addr.as_ptr() as *mut T as *mut R; + // println!("num_r {:?}", num_r); + let mut timer = std::time::Instant::now(); for i in 0..num_r - 2 { while r_ptr.offset(i as isize).read_unaligned() == val @@ -160,12 +162,16 @@ impl RofiComm { //#[tracing::instrument(skip_all)] fn check_buffer(&self, dst_addr: &mut [T]) -> TxResult<()> { let bytes_len = dst_addr.len() * std::mem::size_of::(); + // if bytes_len > 0 { + + // find largest int size that evenly divides bytes_len + // we multiply by 2 because we check two ints at a time unsafe { - if bytes_len % std::mem::size_of::() == 0 { + if bytes_len % (2 * std::mem::size_of::()) == 0 { self.check_buffer_elems(dst_addr, ROFI_MAGIC_8)?; - } else if bytes_len % std::mem::size_of::() == 0 { + } else if 
bytes_len % (2 * std::mem::size_of::()) == 0 { self.check_buffer_elems(dst_addr, ROFI_MAGIC_4)?; - } else if bytes_len % std::mem::size_of::() == 0 { + } else if bytes_len % (2 * std::mem::size_of::()) == 0 { self.check_buffer_elems(dst_addr, ROFI_MAGIC_2)?; } else { self.check_buffer_elems(dst_addr, ROFI_MAGIC_1)?; diff --git a/src/memregion.rs b/src/memregion.rs index a5bda9fe..af499e5e 100644 --- a/src/memregion.rs +++ b/src/memregion.rs @@ -983,7 +983,16 @@ impl MemoryRegion { let num_bytes = data.len() * std::mem::size_of::(); if let Ok(ptr) = data.as_mut_ptr() { let bytes = std::slice::from_raw_parts_mut(ptr as *mut u8, num_bytes); - // println!("getting {:?} {:?} {:?} {:?} {:?} {:?} {:?}",pe,index,std::mem::size_of::(),data.len(), num_bytes,self.size, self.num_bytes); + // println!( + // "getting {:?} {:?} {:?} {:?} {:?} {:?} {:?}", + // pe, + // index, + // std::mem::size_of::(), + // data.len(), + // num_bytes, + // self.size, + // self.num_bytes + // ); self.rdma .iget(pe, self.addr + index * std::mem::size_of::(), bytes); //(remote pe, src, dst) From 9fe96a5a993149995b58b9679629efe541d7c548 Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Fri, 27 Sep 2024 13:31:12 -0700 Subject: [PATCH 073/116] update cargo to allow passing through building shared library versions on rofi/libfabric --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index e52508dd..ced9279e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,7 +15,6 @@ categories = ["asynchronous","concurrency", "network-programming","science"] lamellar-impl = { version = "0.7.0", path = "impl" } #rofisys = { version ="0.3", optional = true } rofisys = {git = "https://github.com/pnnl/rofi-sys.git", branch = "master", optional = true} -#rofisys = { path = "../rofi-sys-junction", optional = true} inventory = "0.3" serde = { version = "1.0.147", features = ["derive"] } serde_bytes = "0.11.7" @@ -72,6 +71,7 @@ opt-level = 3 #features are strictly additive.... can't have mutual exclusitivity [features] enable-rofi=["rofisys", "libc"] +enable-rofi-shared=["rofisys/shared","libc"] tokio-executor=["tokio"] slurm-test=[] default=[] From 606d088b825b988da6b5b3ec364fc053cb44cd5b Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Fri, 27 Sep 2024 14:49:08 -0700 Subject: [PATCH 074/116] add build.rs so that we can get the shared library path from rofisys if its built as shareable --- Cargo.toml | 5 +++-- src/env_var.rs | 4 ++-- src/lamellae.rs | 30 +++++++++++++++--------------- src/lamellae/comm.rs | 14 +++++++------- src/lamellae/command_queues.rs | 2 +- src/lib.rs | 2 +- 6 files changed, 29 insertions(+), 28 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index ced9279e..52c206dd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -70,8 +70,9 @@ opt-level = 3 #features are strictly additive.... 
can't have mutual exclusitivity [features] -enable-rofi=["rofisys", "libc"] -enable-rofi-shared=["rofisys/shared","libc"] +enable-rofi=["rofi","rofisys", "libc"] +enable-rofi-shared=["rofi","rofisys/shared","libc"] +rofi=[] tokio-executor=["tokio"] slurm-test=[] default=[] diff --git a/src/env_var.rs b/src/env_var.rs index 66519df8..ada4d519 100644 --- a/src/env_var.rs +++ b/src/env_var.rs @@ -56,9 +56,9 @@ fn default_dissemination_factor() -> usize { } fn default_backend() -> String { - #[cfg(feature = "enable-rofi")] + #[cfg(feature = "rofi")] return "rofi".to_owned(); - #[cfg(not(feature = "enable-rofi"))] + #[cfg(not(feature = "rofi"))] return "local".to_owned(); } diff --git a/src/lamellae.rs b/src/lamellae.rs index 8e810524..6621106e 100755 --- a/src/lamellae.rs +++ b/src/lamellae.rs @@ -14,14 +14,14 @@ use comm::Comm; pub(crate) mod local_lamellae; use local_lamellae::{Local, LocalData}; -#[cfg(feature = "enable-rofi")] +#[cfg(feature = "rofi")] mod rofi; -#[cfg(feature = "enable-rofi")] +#[cfg(feature = "rofi")] pub(crate) mod rofi_lamellae; -#[cfg(feature = "enable-rofi")] +#[cfg(feature = "rofi")] use rofi::rofi_comm::RofiData; -#[cfg(feature = "enable-rofi")] +#[cfg(feature = "rofi")] use rofi_lamellae::{Rofi, RofiBuilder}; pub(crate) mod shmem_lamellae; @@ -39,7 +39,7 @@ lazy_static! { serde::Serialize, serde::Deserialize, Debug, PartialEq, Eq, Ord, PartialOrd, Hash, Clone, Copy, )] pub enum Backend { - #[cfg(feature = "enable-rofi")] + #[cfg(feature = "rofi")] /// The Rofi (Rust-OFI) backend -- intended for multi process and distributed environments Rofi, /// The Local backend -- intended for single process environments @@ -59,9 +59,9 @@ impl Default for Backend { fn default() -> Self { match config().backend.as_str() { "rofi" => { - #[cfg(feature = "enable-rofi")] + #[cfg(feature = "rofi")] return Backend::Rofi; - #[cfg(not(feature = "enable-rofi"))] + #[cfg(not(feature = "rofi"))] panic!("unable to set rofi backend, recompile with 'enable-rofi' feature") } "shmem" => { @@ -77,9 +77,9 @@ impl Default for Backend { // match std::env::var("LAMELLAE_BACKEND") { // Ok(p) => match p.as_str() { // "rofi" => { -// #[cfg(feature = "enable-rofi")] +// #[cfg(feature = "rofi")] // return Backend::Rofi; -// #[cfg(not(feature = "enable-rofi"))] +// #[cfg(not(feature = "rofi"))] // panic!("unable to set rofi backend, recompile with 'enable-rofi' feature") // } // "shmem" => { @@ -90,9 +90,9 @@ impl Default for Backend { // } // }, // Err(_) => { -// #[cfg(feature = "enable-rofi")] +// #[cfg(feature = "rofi")] // return Backend::Rofi; -// #[cfg(not(feature = "enable-rofi"))] +// #[cfg(not(feature = "rofi"))] // return Backend::Local; // } // }; @@ -106,7 +106,7 @@ pub(crate) struct SerializeHeader { #[enum_dispatch(Des, SubData, SerializedDataOps)] #[derive(Clone, Debug)] pub(crate) enum SerializedData { - #[cfg(feature = "enable-rofi")] + #[cfg(feature = "rofi")] RofiData, ShmemData, LocalData, @@ -135,7 +135,7 @@ pub(crate) trait SubData { #[enum_dispatch(LamellaeInit)] pub(crate) enum LamellaeBuilder { - #[cfg(feature = "enable-rofi")] + #[cfg(feature = "rofi")] RofiBuilder, ShmemBuilder, Local, @@ -166,7 +166,7 @@ pub(crate) trait Ser { #[enum_dispatch(LamellaeComm, LamellaeAM, LamellaeRDMA, Ser)] #[derive(Debug)] pub(crate) enum Lamellae { - #[cfg(feature = "enable-rofi")] + #[cfg(feature = "rofi")] Rofi, Shmem, Local, @@ -224,7 +224,7 @@ pub(crate) trait LamellaeRDMA: Send + Sync { pub(crate) fn create_lamellae(backend: Backend) -> LamellaeBuilder { match backend { - #[cfg(feature = 
"enable-rofi")] + #[cfg(feature = "rofi")] Backend::Rofi => { let provider = config().rofi_provider.clone(); let domain = config().rofi_domain.clone(); diff --git a/src/lamellae/comm.rs b/src/lamellae/comm.rs index 3dd6f404..6704b949 100644 --- a/src/lamellae/comm.rs +++ b/src/lamellae/comm.rs @@ -1,4 +1,4 @@ -#[cfg(feature = "enable-rofi")] +#[cfg(feature = "rofi")] use crate::lamellae::rofi::rofi_comm::*; use crate::lamellae::shmem::shmem_comm::*; use crate::lamellae::{AllocationType, SerializedData}; @@ -50,12 +50,12 @@ impl std::error::Error for AllocError {} pub(crate) type AllocResult = Result; -#[cfg(feature = "enable-rofi")] +#[cfg(feature = "rofi")] #[derive(Debug, Clone, Copy)] pub(crate) enum TxError { GetError, } -#[cfg(feature = "enable-rofi")] +#[cfg(feature = "rofi")] impl std::fmt::Display for TxError { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { match self { @@ -65,9 +65,9 @@ impl std::fmt::Display for TxError { } } } -#[cfg(feature = "enable-rofi")] +#[cfg(feature = "rofi")] impl std::error::Error for TxError {} -#[cfg(feature = "enable-rofi")] +#[cfg(feature = "rofi")] pub(crate) type TxResult = Result; pub(crate) trait Remote: Copy {} @@ -76,7 +76,7 @@ impl Remote for T {} #[enum_dispatch(CommOps)] #[derive(Debug)] pub(crate) enum Comm { - #[cfg(feature = "enable-rofi")] + #[cfg(feature = "rofi")] Rofi(RofiComm), Shmem(ShmemComm), } @@ -87,7 +87,7 @@ impl Comm { size: usize, ) -> Result { match self.as_ref() { - #[cfg(feature = "enable-rofi")] + #[cfg(feature = "rofi")] Comm::Rofi(_) => Ok(RofiData::new(self.clone(), size)?.into()), Comm::Shmem(_) => Ok(ShmemData::new(self.clone(), size)?.into()), } diff --git a/src/lamellae/command_queues.rs b/src/lamellae/command_queues.rs index 0ce02f17..a93f0297 100644 --- a/src/lamellae/command_queues.rs +++ b/src/lamellae/command_queues.rs @@ -1331,7 +1331,7 @@ impl CommandQueue { //#[tracing::instrument(skip_all)] pub(crate) async fn send_data(&self, data: SerializedData, dst: usize) { match data { - #[cfg(feature = "enable-rofi")] + #[cfg(feature = "rofi")] SerializedData::RofiData(ref data) => { // println!("sending: {:?} {:?}",data.relative_addr,data.len); // let hash = calc_hash(data.relative_addr + self.comm.base_addr(), data.len); diff --git a/src/lib.rs b/src/lib.rs index eac1a680..3315fc8a 100755 --- a/src/lib.rs +++ b/src/lib.rs @@ -54,7 +54,7 @@ //! - `shmem` - used for multi-PE (single system, multi-process) development, useful for emulating distributed environments (communicates through shared memory) //! - `rofi` - used for multi-PE (multi system, multi-process) distributed development, based on the Rust OpenFabrics Interface Transport Layer (ROFI) (). //! - By default support for Rofi is disabled as using it relies on both the Rofi C-library and the libfabrics library, which may not be installed on your system. -//! - It can be enabled by adding ```features = ["enable-rofi"]``` to the lamellar entry in your `Cargo.toml` file +//! - It can be enabled by adding ```features = ["enable-rofi"] or `features = ["enable-rofi-shared"]``` to the lamellar entry in your `Cargo.toml` file //! //! The long term goal for lamellar is that you can develop using the `local` backend and then when you are ready to run distributed switch to the `rofi` backend with no changes to your code. //! Currently the inverse is true, if it compiles and runs using `rofi` it will compile and run when using `local` and `shmem` with no changes. From 02d0443d1c58c8e0d0467728487df9bd5b64eee2 Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Thu, 3 Oct 2024 12:44:37 -0700 Subject: [PATCH 075/116] change DEADLOCK_TIMEOUT to DEADLOCK_WARNING_TIMEOUT --- src/env_var.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/env_var.rs b/src/env_var.rs index ada4d519..ccb96489 100644 --- a/src/env_var.rs +++ b/src/env_var.rs @@ -27,7 +27,7 @@ //! This can be a fairly expensive operation (as the operation is synchronous across all PEs) so the runtime //! will print a message at the end of execution with how many additional pools were allocated. //! - if you find you are dynamically allocating new memory pools, try setting `LAMELLAR_HEAP_SIZE` to a larger value -//! - `LAMELLAR_DEADLOCK_TIMEOUT` - the timeout in seconds before a deadlock warning is printed. Defaults to 600 +//! - `LAMELLAR_DEADLOCK_WARNING_TIMEOUT` - the timeout in seconds before a deadlock warning is printed. Defaults to 600. Note this does not cause your application to terminate //! - `LAMELLAR_AM_GROUP_BATCH_SIZE` - The maximum number of sub messages that will be sent in a single AMGroup Active Message, default: 10000 //! - `LAMELLAR_BLOCKING_CALL_WARNING` - flag used to print warnings when users call barriers on worker threads. Default: true //! - `LAMELLAR_BARRIER_DISSEMINATION_FACTOR` - (Experimental) The dissemination factor for the n-way barrier, default: 2 @@ -139,7 +139,7 @@ fn default_rofi_domain() -> String { #[derive(Deserialize, Debug)] pub struct Config { /// A general timeout in seconds for various operations which may indicate a deadlock, default: 600.0 seconds - #[serde(default = "default_deadlock_timeout")] + #[serde(default = "default_deadlock_warning_timeout")] pub deadlock_timeout: f64, /// The maximum number of sub messages that will be sent in a single AMGroup Active Message, default: 10000 From 9cf575f37c83557acdaacd90377ccc891bf2f0c9 Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Thu, 3 Oct 2024 15:18:34 -0700 Subject: [PATCH 076/116] ensure operations on unsafe arrays are marked unsafe --- src/array/iterator/distributed_iterator.rs | 14 +- src/array/iterator/local_iterator.rs | 12 +- src/array/operations.rs | 13 +- src/array/operations/access.rs | 206 +++++ src/array/operations/arithmetic.rs | 813 ++++++++++++++++++ src/array/operations/bitwise.rs | 513 +++++++++++ src/array/operations/compare_exchange.rs | 334 +++++++ src/array/operations/read_only.rs | 121 +++ src/array/operations/shift.rs | 300 +++++++ src/array/prelude.rs | 2 +- src/array/unsafe/local_chunks.rs | 4 +- src/array/unsafe/operations.rs | 17 +- src/barrier.rs | 2 +- src/darc.rs | 6 +- src/env_var.rs | 2 +- src/lamellar_alloc.rs | 2 +- src/lamellar_team.rs | 16 +- tests/array/arithmetic_ops/add_test.rs | 57 +- tests/array/arithmetic_ops/div_test.rs | 9 +- tests/array/arithmetic_ops/fetch_add_test.rs | 63 +- tests/array/arithmetic_ops/fetch_div_test.rs | 9 +- tests/array/arithmetic_ops/fetch_mul_test.rs | 9 +- tests/array/arithmetic_ops/fetch_rem_test.rs | 9 +- tests/array/arithmetic_ops/fetch_sub_test.rs | 18 +- tests/array/arithmetic_ops/mul_test.rs | 9 +- tests/array/arithmetic_ops/rem_test.rs | 9 +- tests/array/arithmetic_ops/sub_test.rs | 18 +- .../array/atomic_ops/compare_exchange_test.rs | 36 +- tests/array/atomic_ops/load_store_test.rs | 18 +- tests/array/atomic_ops/swap_test.rs | 19 +- tests/array/bitwise_ops/and_test.rs | 9 +- tests/array/bitwise_ops/fetch_and_test.rs | 9 +- tests/array/bitwise_ops/fetch_or_test.rs | 9 +- tests/array/bitwise_ops/fetch_xor_test.rs | 9 +- tests/array/bitwise_ops/or_test.rs | 9 +- 35 files changed, 2553 insertions(+), 152 deletions(-) diff --git a/src/array/iterator/distributed_iterator.rs b/src/array/iterator/distributed_iterator.rs index 41cbb23f..91b4a6fe 100644 --- a/src/array/iterator/distributed_iterator.rs +++ b/src/array/iterator/distributed_iterator.rs @@ -381,7 +381,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { self.array().for_each_async(self, op) } - /// Calls a closure on each element of a Distributed Iterator in parallel and distributed on each PE (which owns data of the iterated array) using the specififed [Schedule][crate::array::iterator::Schedule] policy. + /// Calls a closure on each element of a Distributed Iterator in parallel and distributed on each PE (which owns data of the iterated array) using the specififed [Schedule] policy. /// /// Calling this function invokes an implicit barrier across all PEs in the Array /// @@ -405,7 +405,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { self.array().for_each_with_schedule(sched, self, op) } - /// Calls a closure and immediately awaits the result on each element of a Distributed Iterator in parallel and distributed on each PE (which owns data of the iterated array) using the specififed [Schedule][crate::array::iterator::Schedule] policy. + /// Calls a closure and immediately awaits the result on each element of a Distributed Iterator in parallel and distributed on each PE (which owns data of the iterated array) using the specififed [Schedule] policy. 
/// /// Calling this function invokes an implicit barrier across all PEs in the Array, after this barrier no further communication is performed /// as each PE will only process elements local to itself @@ -467,7 +467,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { self.array().reduce(self, op) } - /// Reduces the elements of the dist iterator using the provided closure and [Schedule][crate::array::iterator::Schedule] policy + /// Reduces the elements of the dist iterator using the provided closure and [Schedule] policy /// /// This function returns a future which needs to be driven to completion to retrieve the reduced value. /// # Note @@ -530,7 +530,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { self.array().collect(self, d) } - /// Collects the elements of the distributed iterator into a new LamellarArray, using the provided [Schedule][crate::array::iterator::Schedule] policy + /// Collects the elements of the distributed iterator into a new LamellarArray, using the provided [Schedule] policy /// /// Calling this function invokes an implicit barrier across all PEs in the Array. /// @@ -617,7 +617,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { self.array().collect_async(self, d) } - /// Collects the awaited elements of the distributed iterator into a new LamellarArray, using the provided [Schedule][crate::array::iterator::Schedule] policy + /// Collects the awaited elements of the distributed iterator into a new LamellarArray, using the provided [Schedule] policy /// /// Calling this function invokes an implicit barrier across all PEs in the Array. /// @@ -691,7 +691,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { self.array().count(self) } - /// Counts the number of the elements of the distriubted iterator, using the provided [Schedule][crate::array::iterator::Schedule] policy + /// Counts the number of the elements of the distriubted iterator, using the provided [Schedule] policy /// /// Calling this function invokes an implicit barrier and distributed reduction across all PEs in the Array. /// @@ -741,7 +741,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { self.array().sum(self) } - /// Sums the elements of the distributed iterator, using the specified [Schedule][crate::array::iterator::Schedule] policy + /// Sums the elements of the distributed iterator, using the specified [Schedule] policy /// /// Takes each element, adds them together, and returns the result. /// diff --git a/src/array/iterator/local_iterator.rs b/src/array/iterator/local_iterator.rs index 7c30d0d7..9c53df04 100644 --- a/src/array/iterator/local_iterator.rs +++ b/src/array/iterator/local_iterator.rs @@ -399,7 +399,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { self.array().for_each_async(self, op) } - /// Calls a closure on each element of a Local Iterator in parallel on the calling PE (the PE must have some local data of the array) using the specififed [Schedule][crate::array::iterator::Schedule] policy. + /// Calls a closure on each element of a Local Iterator in parallel on the calling PE (the PE must have some local data of the array) using the specififed [Schedule] policy. /// /// The supplied closure must return a future. 
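For readers unfamiliar with the `Schedule` policies referenced throughout these doc comments, a minimal sketch of the `*_with_schedule` variants is shown below. It assumes a `ReadOnlyArray<usize>` named `array`, that the `Schedule` enum at `lamellar::array::iterator::Schedule` exposes `Dynamic` and `Guided` variants as in earlier releases, and that the returned handles are driven with `.block()` like the `for_each` examples elsewhere in this patch series; treat those details as assumptions rather than something this commit changes.

```rust
use lamellar::array::iterator::Schedule;
use lamellar::array::prelude::*;

// Sketch: the same distributed traversal under the default (static) schedule
// and under an explicitly requested dynamic schedule.
fn traverse(array: &ReadOnlyArray<usize>) {
    // default work distribution across the threads of each PE
    array.dist_iter().for_each(|e| println!("{e}")).block();

    // identical traversal, but local work items are handed out dynamically
    array
        .dist_iter()
        .for_each_with_schedule(Schedule::Dynamic, |e| println!("{e}"))
        .block();

    // consumers such as count accept a schedule in the same way; this one
    // performs the implicit barrier and distributed reduction described above
    let count = array
        .dist_iter()
        .count_with_schedule(Schedule::Guided)
        .block();
    println!("elements seen across PEs: {count}");
}
```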
/// @@ -456,7 +456,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { self.array().reduce(self, op) } - /// Reduces the elements of the local iterator using the provided closure and specififed [Schedule][crate::array::iterator::Schedule] policy + /// Reduces the elements of the local iterator using the provided closure and specififed [Schedule] policy /// /// This function returns a future which needs to be driven to completion to retrieve the reduced value. /// # Note @@ -511,7 +511,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { self.array().collect(self, d) } - /// Collects the elements of the local iterator into the specified container type using the specified [Schedule][crate::array::iterator::Schedule] policy + /// Collects the elements of the local iterator into the specified container type using the specified [Schedule] policy /// /// This function returns a future which needs to be driven to completion to retrieve the new container. /// # Note @@ -589,7 +589,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { self.array().collect_async(self, d) } - /// Collects the awaited elements of the local iterator into a new LamellarArray, using the provided [Schedule][crate::array::iterator::Schedule] policy + /// Collects the awaited elements of the local iterator into a new LamellarArray, using the provided [Schedule] policy /// /// Calling this function invokes an implicit barrier across all PEs in the Array. /// @@ -661,7 +661,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { self.array().count(self) } - /// Counts the number of the elements of the local iterator using the provided [Schedule][crate::array::iterator::Schedule] policy + /// Counts the number of the elements of the local iterator using the provided [Schedule] policy /// /// This function returns a future which needs to be driven to completion to retrieve the number of elements in the local iterator /// # Note @@ -708,7 +708,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { self.array().sum(self) } - /// Sums the elements of the local iterator, using the specified [Schedule][crate::array::iterator::Schedule] policy + /// Sums the elements of the local iterator, using the specified [Schedule] policy /// /// Takes each element, adds them together, and returns the result. 
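The iterator doc changes above repeatedly reference the [Schedule] policy. A minimal sketch (not part of the patch) of what selecting a non-default schedule looks like at a call site, assuming the `Schedule::Dynamic` variant and the `for_each_with_schedule` signature described in these doc comments:

```rust
use lamellar::array::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();
    let array = ReadOnlyArray::<usize>::new(&world, 100, Distribution::Block);

    // Each PE iterates only its local portion of the array; Schedule::Dynamic
    // (assumed from the Schedule enum referenced above) hands elements out to
    // worker threads on demand instead of in fixed static chunks.
    let req = array
        .dist_iter()
        .for_each_with_schedule(Schedule::Dynamic, |elem| {
            let _ = *elem; // do something with the element
        });
    array.block_on(req);
}
```

Dynamic scheduling mainly pays off when per-element work is uneven; for uniform work the default static chunking avoids the extra coordination.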
/// diff --git a/src/array/operations.rs b/src/array/operations.rs index 585a3f6e..b39a5d80 100644 --- a/src/array/operations.rs +++ b/src/array/operations.rs @@ -16,19 +16,22 @@ pub use handle::{ ArrayBatchOpHandle, ArrayFetchBatchOpHandle, ArrayOpHandle, ArrayResultBatchOpHandle, }; pub(crate) mod access; -pub use access::{AccessOps, LocalAtomicOps}; +pub use access::{AccessOps, LocalAtomicOps, UnsafeAccessOps}; pub(crate) mod arithmetic; -pub use arithmetic::{ArithmeticOps, ElementArithmeticOps, LocalArithmeticOps}; +pub use arithmetic::{ + ArithmeticOps, ElementArithmeticOps, LocalArithmeticOps, UnsafeArithmeticOps, +}; pub(crate) mod bitwise; -pub use bitwise::{BitWiseOps, ElementBitWiseOps, LocalBitWiseOps}; +pub use bitwise::{BitWiseOps, ElementBitWiseOps, LocalBitWiseOps, UnsafeBitWiseOps}; pub(crate) mod compare_exchange; pub use compare_exchange::{ CompareExchangeEpsilonOps, CompareExchangeOps, ElementCompareEqOps, ElementComparePartialEqOps, + UnsafeCompareExchangeEpsilonOps, UnsafeCompareExchangeOps, }; pub(crate) mod read_only; -pub use read_only::ReadOnlyOps; +pub use read_only::{ReadOnlyOps, UnsafeReadOnlyOps}; pub(crate) mod shift; -pub use shift::{ElementShiftOps, LocalShiftOps, ShiftOps}; +pub use shift::{ElementShiftOps, LocalShiftOps, ShiftOps, UnsafeShiftOps}; // use async_trait::async_trait; // use parking_lot::Mutex; diff --git a/src/array/operations/access.rs b/src/array/operations/access.rs index 1aac1b00..8a189fd6 100644 --- a/src/array/operations/access.rs +++ b/src/array/operations/access.rs @@ -210,6 +210,212 @@ pub trait AccessOps: private::LamellarArrayPrivate { } } +#[doc(alias("One-sided", "onesided"))] +/// The interface for remotely writing elements +/// +/// These operations can be performed using any [LamellarWriteArray] type +/// +/// Both single element operations and batched element operations are provided +/// +/// Generally if you are performing a large number of operations it will be better to +/// use a batched version instead of multiple single element opertations. While the +/// Runtime internally performs message aggregation for both single element and batched +/// operations, single element operates have to be treated as individual requests, resulting +/// in allocation and bookkeeping overheads. A single batched call on the other hand is treated +/// as a single request by the runtime. (See [ReadOnlyOps] for an example comparing single vs batched load operations of a list of indices) +/// +/// The results of a batched operation are returned to the user in the same order as the input indices. +/// +/// # One-sided Operation +/// performing either single or batched operations are both one-sided, with the calling PE performing any necessary work to +/// initate and execute active messages that are sent to remote PEs. +/// For Ops that return results, the result will only be available on the calling PE. +/// +/// # Note +/// For both single index and batched operations there are no guarantees to the order in which individual operations occur (an individal operation is guaranteed to be atomic though). 
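To make the intent of this patch concrete at a call site, a minimal sketch (not part of the patch) contrasting the safe trait implementations that remain on `AtomicArray` with the new `Unsafe*` variants that `UnsafeArray` now provides:

```rust
use lamellar::array::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();

    // AtomicArray (and the other safe array types) keep the existing safe
    // AccessOps/ArithmeticOps interface.
    let atomic = AtomicArray::<usize>::new(&world, 100, Distribution::Block);
    atomic.block_on(atomic.store(53, 10));

    // UnsafeArray now only implements the Unsafe* variants, so the same call
    // must be wrapped in `unsafe`, making the caller acknowledge that nothing
    // guards against concurrent access to the underlying elements.
    let array = UnsafeArray::<usize>::new(&world, 100, Distribution::Block);
    let req = unsafe { array.store(53, 10) };
    array.block_on(req);
}
```

The element operation itself is unchanged; only the obligation to avoid data races moves to the caller via the `unsafe` block.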
+/// +/// # Batched Types +/// Three types of batched operations can be performed +/// ## One Value - Many Indicies +/// In this type, the same value will be applied to the provided indices +///``` +/// use lamellar::array::prelude::*; +/// +/// let world = LamellarWorldBuilder::new().build(); +/// let array = UnsafeArray::::new(&world,100,Distribution::Block); +/// +/// let indices = vec![3,54,12,88,29,68]; +/// let val = 10; +/// array.block_on(unsafe{array.batch_store(indices,val)}); +///``` +/// ## Many Values - One Index +/// In this type, multiple values will be applied to the given index +///``` +/// use lamellar::array::prelude::*; +/// +/// let world = LamellarWorldBuilder::new().build(); +/// let array = UnsafeArray::::new(&world,100,Distribution::Block); +/// +/// let vals = vec![3,54,12,88,29,68]; +/// let index = 10; +/// array.block_on(unsafe{array.batch_store(index,vals)}); +///``` +/// ## Many Values - Many Indicies +/// In this type, values and indices have a one-to-one correspondance. +/// +/// If the two lists are unequal in length, the longer of the two will be truncated so that it matches the length of the shorter +///``` +/// use lamellar::array::prelude::*; +/// +/// let world = LamellarWorldBuilder::new().build(); +/// let array = UnsafeArray::::new(&world,100,Distribution::Block); +/// +/// let indices = vec![3,54,12,88,29,68]; +/// let vals = vec![12,2,1,10000,12,13]; +/// array.block_on(unsafe{array.batch_store(indices,vals)}); +///``` +pub trait UnsafeAccessOps: private::LamellarArrayPrivate { + /// This call stores the supplied `val` into the element specified by `index` + /// + /// A future is returned as the result of this call, which is used to detect when the operation has completed + /// + /// # Note + /// This future is only lazy with respect to checking for completion, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. + /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let idx = 53; + /// let val = 10; + /// let req = unsafe{array.store(idx,val)}; + /// array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn store<'a>(&self, index: usize, val: T) -> ArrayOpHandle { + self.inner_array() + .initiate_batch_op(val, index, ArrayOpCmd::Store, self.as_lamellar_byte_array()) + .into() + } + + /// This call performs a batched vesion of the [store][AccessOps::store] function, + /// + /// Instead of a single value and index this function expects a list of `vals`, or a list of `indices` or both. + /// Please see the general [AccessOps] documentation for more information on batch operation input + /// + /// A future is returned as the result of this call, which is used to detect when the operation has completed + /// + /// # Note + /// This future is only lazy with respect to checking for completion, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. 
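The "fire and forget" note above applies to every operation in these traits. A minimal sketch of that model (not part of the patch), assuming `wait_all`/`barrier` behave as elsewhere in the Lamellar API; the indices and values are illustrative only:

```rust
use lamellar::array::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();
    let array = UnsafeArray::<usize>::new(&world, 100, Distribution::Block);

    // The batched stores below are issued as soon as they are called; the
    // returned handles are dropped rather than awaited ("fire and forget").
    for val in 0..4 {
        let indices = vec![3, 54, 12, 88, 29, 68];
        let _ = unsafe { array.batch_store(indices, val) };
    }

    // Before relying on the results, quiesce the operations issued by this PE
    // and then synchronize with the other PEs.
    array.wait_all();
    array.barrier();
}
```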
+ /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let indices = vec![3,54,12,88,29,68]; + /// let req = unsafe{array.batch_store(indices,10)}; + /// array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn batch_store<'a>( + &self, + index: impl OpInput<'a, usize>, + val: impl OpInput<'a, T>, + ) -> ArrayBatchOpHandle { + self.inner_array().initiate_batch_op( + val, + index, + ArrayOpCmd::Store, + self.as_lamellar_byte_array(), + ) + } + + /// This call swaps the supplied `val` into the element specified by `index`, returning the old value + /// + /// A future is returned as the result of this call, which is used to retrieve + /// the results after the (possibly remote) operations have finished. + /// + /// # Note + /// This future is only lazy with respect to retrieving the result, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. + /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let idx = 53; + /// let new = 10; + /// let req = unsafe{array.swap(idx,new)}; + /// let old = req.block(); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn swap<'a>(&self, index: usize, val: T) -> ArrayFetchOpHandle { + self.inner_array() + .initiate_batch_fetch_op_2(val, index, ArrayOpCmd::Swap, self.as_lamellar_byte_array()) + .into() + } + + /// This call performs a batched vesion of the [swap][AccessOps::swap] function, + /// + /// Instead of a single value and index this function expects a list of `vals`, or a list of `indices` or both. + /// Please see the general [AccessOps] documentation for more information on batch operation input + /// + /// A future is returned as the result of this call, which is used to retrieve + /// the results after the (possibly remote) operations have finished. + /// + /// # Note + /// This future is only lazy with respect to checking for completion, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. 
+ /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let indices = vec![3,54,12,88,29,68]; + /// let req = unsafe{array.batch_swap(indices,10)}; + /// let old_vals = req.block(); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn batch_swap<'a>( + &self, + index: impl OpInput<'a, usize>, + val: impl OpInput<'a, T>, + ) -> ArrayFetchBatchOpHandle { + self.inner_array().initiate_batch_fetch_op_2( + val, + index, + ArrayOpCmd::Swap, + self.as_lamellar_byte_array(), + ) + } +} + #[doc(hidden)] pub trait LocalAtomicOps { fn local_load(&self, index: usize, val: T) -> T; diff --git a/src/array/operations/arithmetic.rs b/src/array/operations/arithmetic.rs index 93dda91d..1db86f5a 100644 --- a/src/array/operations/arithmetic.rs +++ b/src/array/operations/arithmetic.rs @@ -843,6 +843,819 @@ pub trait ArithmeticOps: private::LamellarArrayP } } +#[doc(alias("One-sided", "onesided"))] +/// The interface for performing remote arithmetic operations on array elements +/// +/// These operations can be performed using any [LamellarWriteArray] type +/// +/// Both single element operations and batched element operations are provided +/// +/// Generally if you are performing a large number of operations it will be better to +/// use a batched version instead of multiple single element opertations. While the +/// Runtime internally performs message aggregation for both single element and batched +/// operations, single element operates have to be treated as individual requests, resulting +/// in allocation and bookkeeping overheads. A single batched call on the other hand is treated +/// as a single request by the runtime. (See [ReadOnlyOps] for an example comparing single vs batched load operations of a list of indices) +/// +/// The results of a batched operation are returned to the user in the same order as the input indices. +/// +/// # One-sided Operation +/// performing either single or batched operations are both one-sided, with the calling PE performing any necessary work to +/// initate and execute active messages that are sent to remote PEs. +/// For Ops that return results, the result will only be available on the calling PE. +/// +/// # Note +/// For both single index and batched operations there are no guarantees to the order in which individual operations occur (an individal operation is guaranteed to be atomic though). +/// +/// # Batched Types +/// Three types of batched operations can be performed +/// ## One Value - Many Indicies +/// In this type, the same value will be applied to the provided indices +///``` +/// use lamellar::array::prelude::*; +/// +/// let world = LamellarWorldBuilder::new().build(); +/// let array = UnsafeArray::::new(&world,100,Distribution::Block); +/// +/// let indices = vec![3,54,12,88,29,68]; +/// let val = 10; +/// array.block_on(unsafe{array.batch_fetch_add(indices,val)}); +///``` +/// ## Many Values - One Index +/// In this type, multiple values will be applied to the given index +///``` +/// use lamellar::array::prelude::*; +/// +/// let world = LamellarWorldBuilder::new().build(); +/// let array = UnsafeArray::::new(&world,100,Distribution::Block); +/// +/// let vals = vec![3,54,12,88,29,68]; +/// let index = 10; +/// array.block_on(unsafe{array.batch_sub(index,vals)}); +///``` +/// ## Many Values - Many Indicies +/// In this type, values and indices have a one-to-one correspondance. 
+/// +/// If the two lists are unequal in length, the longer of the two will be truncated so that it matches the length of the shorter +///``` +/// use lamellar::array::prelude::*; +/// +/// let world = LamellarWorldBuilder::new().build(); +/// let array = UnsafeArray::::new(&world,100,Distribution::Block); +/// +/// let indices = vec![3,54,12,88,29,68]; +/// let vals = vec![12,2,1,10000,12,13]; +/// array.block_on(unsafe{array.batch_fetch_mul(indices,vals)}); +///``` +pub trait UnsafeArithmeticOps: + private::LamellarArrayPrivate +{ + /// This call adds the supplied `val` into the element specified by `index` + /// + /// A future is returned as the result of this call, which is used to detect when the operation has completed + /// + /// # Note + /// This future is only lazy with respect to checking for completion, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. + /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let idx = 53; + /// let val = 10; + /// let req = unsafe{ array.add(idx,val) }; + /// array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn add(&self, index: usize, val: T) -> ArrayOpHandle { + self.inner_array().initiate_batch_op( + val, + index, + ArrayOpCmd::Add, + self.as_lamellar_byte_array(), + ) + } + + /// This call performs a batched vesion of the [add][ArithmeticOps::add] function, + /// + /// Instead of a single value and index this function expects a list of `vals`, or a list of `indices` or both. + /// Please see the general [ArithmeticOps] documentation for more information on batch operation input + /// + /// A future is returned as the result of this call, which is used to detect when the operation has completed + /// + /// # Note + /// This future is only lazy with respect to checking for completion, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. + /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let indices = vec![3,54,12,88,29,68]; + /// let req = unsafe{ array.batch_add(indices,10) }; + /// array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn batch_add<'a>( + &self, + index: impl OpInput<'a, usize>, + val: impl OpInput<'a, T>, + ) -> ArrayBatchOpHandle { + // self.inner_array().initiate_op(val, index, ArrayOpCmd::Add) + self.inner_array().initiate_batch_op( + val, + index, + ArrayOpCmd::Add, + self.as_lamellar_byte_array(), + ) + } + + /// This call adds the supplied `val` into the element specified by `index`, returning the old value + /// + /// A future is returned as the result of this call, which is used to retrieve + /// the result after the (possibly remote) operation has finished. + /// + /// # Note + /// This future is only lazy with respect to retrieving the result, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. 
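The one-value/many-indices form of `batch_add` maps naturally onto counting patterns such as a distributed histogram. A minimal sketch under that assumption (not part of the patch; the bin count and key generation are illustrative only):

```rust
use lamellar::array::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();
    // One counter per bin, spread across the PEs.
    let histo = UnsafeArray::<usize>::new(&world, 256, Distribution::Cyclic);

    // Locally computed bin indices (hypothetical data for the sketch).
    let bins: Vec<usize> = (0..1000).map(|i| (i * 31) % 256).collect();

    // A single batched call increments every named bin by 1; the runtime
    // aggregates the updates destined for each remote PE.
    let req = unsafe { histo.batch_add(bins, 1) };
    histo.block_on(req);
    histo.barrier();
}
```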
+ /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let idx = 53; + /// let val = 10; + /// let req = unsafe{ array.fetch_add(idx,val) }; + /// let old = array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn fetch_add(&self, index: usize, val: T) -> ArrayFetchOpHandle { + self.inner_array() + .initiate_batch_fetch_op_2( + val, + index, + ArrayOpCmd::FetchAdd, + self.as_lamellar_byte_array(), + ) + .into() + } + + /// This call performs a batched vesion of the [fetch_add][ArithmeticOps::fetch_add] function, + /// + /// Instead of a single value and index this function expects a list of `vals`, or a list of `indices` or both. + /// Please see the general [ArithmeticOps] documentation for more information on batch operation input + /// + /// A future is returned as the result of this call, which is used to retrieve + /// the results after the (possibly remote) operations have finished. + /// + /// # Note + /// This future is only lazy with respect to checking for completion, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. + /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let indices = vec![3,54,12,88,29,68]; + /// let req = unsafe{ array.batch_fetch_add(indices,10) }; + /// let old_vals = array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn batch_fetch_add<'a>( + &self, + index: impl OpInput<'a, usize>, + val: impl OpInput<'a, T>, + ) -> ArrayFetchBatchOpHandle { + self.inner_array().initiate_batch_fetch_op_2( + val, + index, + ArrayOpCmd::FetchAdd, + self.as_lamellar_byte_array(), + ) + } + + /// This call subtracts the supplied `val` from the element specified by `index` + /// + /// A future is returned as the result of this call, which is used to detect when the operation has completed + /// + /// # Note + /// This future is only lazy with respect to checking for completion, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. + /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let idx = 53; + /// let val = 10; + /// let req = unsafe{ array.sub(idx,val) }; + /// array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn sub<'a>(&self, index: usize, val: T) -> ArrayOpHandle { + self.inner_array().initiate_batch_op( + val, + index, + ArrayOpCmd::Sub, + self.as_lamellar_byte_array(), + ) + } + + /// This call performs a batched vesion of the [sub][ArithmeticOps::sub] function, + /// + /// Instead of a single value and index this function expects a list of `vals`, or a list of `indices` or both. 
+ /// Please see the general [ArithmeticOps] documentation for more information on batch operation input + /// + /// A future is returned as the result of this call, which is used to detect when the operation has completed + /// + /// # Note + /// This future is only lazy with respect to checking for completion, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. + /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let indices = vec![3,54,12,88,29,68]; + /// let req = unsafe{ array.batch_sub(indices,10) }; + /// array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn batch_sub<'a>( + &self, + index: impl OpInput<'a, usize>, + val: impl OpInput<'a, T>, + ) -> ArrayBatchOpHandle { + // self.inner_array().initiate_op(val, index, ArrayOpCmd::Sub) + self.inner_array().initiate_batch_op( + val, + index, + ArrayOpCmd::Sub, + self.as_lamellar_byte_array(), + ) + } + + /// This call subtracts the supplied `val` from the element specified by `index`, returning the old value + /// + /// A future is returned as the result of this call, which is used to retrieve + /// the result after the (possibly remote) operation has finished. + /// + /// # Note + /// This future is only lazy with respect to retrieving the result, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. + /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let idx = 53; + /// let val = 10; + /// let req = unsafe{ array.fetch_sub(idx,val) }; + /// let old = array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn fetch_sub<'a>(&self, index: usize, val: T) -> ArrayFetchOpHandle { + self.inner_array() + .initiate_batch_fetch_op_2( + val, + index, + ArrayOpCmd::FetchSub, + self.as_lamellar_byte_array(), + ) + .into() + } + + /// This call performs a batched vesion of the [fetch_sub][ArithmeticOps::fetch_sub] function, + /// + /// Instead of a single value and index this function expects a list of `vals`, or a list of `indices` or both. + /// Please see the general [ArithmeticOps] documentation for more information on batch operation input + /// + /// A future is returned as the result of this call, which is used to retrieve + /// the results after the (possibly remote) operations have finished. + /// + /// # Note + /// This future is only lazy with respect to checking for completion, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. 
+ /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let indices = vec![3,54,12,88,29,68]; + /// let req = unsafe{ array.batch_fetch_sub(indices,10) }; + /// let old_vals = array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn batch_fetch_sub<'a>( + &self, + index: impl OpInput<'a, usize>, + val: impl OpInput<'a, T>, + ) -> ArrayFetchBatchOpHandle { + self.inner_array().initiate_batch_fetch_op_2( + val, + index, + ArrayOpCmd::FetchSub, + self.as_lamellar_byte_array(), + ) + } + + /// This call multiplies the supplied `val` by the element specified by `index` and stores the result. + /// + /// A future is returned as the result of this call, which is used to detect when the operation has completed + /// + /// # Note + /// This future is only lazy with respect to checking for completion, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. + /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let idx = 53; + /// let val = 10; + /// let req = unsafe{ array.mul(idx,val) }; + /// array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn mul<'a>(&self, index: usize, val: T) -> ArrayOpHandle { + self.inner_array().initiate_batch_op( + val, + index, + ArrayOpCmd::Mul, + self.as_lamellar_byte_array(), + ) + } + + /// This call performs a batched vesion of the [mul][ArithmeticOps::mul] function, + /// + /// Instead of a single value and index this function expects a list of `vals`, or a list of `indices` or both. + /// Please see the general [ArithmeticOps] documentation for more information on batch operation input + /// + /// A future is returned as the result of this call, which is used to detect when the operation has completed + /// + /// # Note + /// This future is only lazy with respect to checking for completion, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. + /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let indices = vec![3,54,12,88,29,68]; + /// let req = unsafe{ array.batch_mul(indices,10) }; + /// array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn batch_mul<'a>( + &self, + index: impl OpInput<'a, usize>, + val: impl OpInput<'a, T>, + ) -> ArrayBatchOpHandle { + // self.inner_array().initiate_op(val, index, ArrayOpCmd::Mul) + self.inner_array().initiate_batch_op( + val, + index, + ArrayOpCmd::Mul, + self.as_lamellar_byte_array(), + ) + } + + /// This call multiplies the supplied `val` with the element specified by `index`, returning the old value + /// + /// A future is returned as the result of this call, which is used to retrieve + /// the result after the (possibly remote) operation has finished. 
+ /// + /// # Note + /// This future is only lazy with respect to retrieving the result, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. + /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let idx = 53; + /// let val = 10; + /// let req = unsafe{ array.fetch_mul(idx,val) }; + /// let old = array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn fetch_mul<'a>(&self, index: usize, val: T) -> ArrayFetchOpHandle { + self.inner_array() + .initiate_batch_fetch_op_2( + val, + index, + ArrayOpCmd::FetchMul, + self.as_lamellar_byte_array(), + ) + .into() + } + + /// This call performs a batched vesion of the [fetch_mul][ArithmeticOps::fetch_mul] function, + /// + /// Instead of a single value and index this function expects a list of `vals`, or a list of `indices` or both. + /// Please see the general [ArithmeticOps] documentation for more information on batch operation input + /// + /// A future is returned as the result of this call, which is used to retrieve + /// the results after the (possibly remote) operations have finished. + /// + /// # Note + /// This future is only lazy with respect to checking for completion, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. + /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let indices = vec![3,54,12,88,29,68]; + /// let req = unsafe{ array.batch_fetch_mul(indices,10) }; + /// let old_vals = array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn batch_fetch_mul<'a>( + &self, + index: impl OpInput<'a, usize>, + val: impl OpInput<'a, T>, + ) -> ArrayFetchBatchOpHandle { + self.inner_array().initiate_batch_fetch_op_2( + val, + index, + ArrayOpCmd::FetchMul, + self.as_lamellar_byte_array(), + ) + } + + /// This call divides the element specified by `index` with the supplied `val` and stores the result + /// + /// A future is returned as the result of this call, which is used to detect when the operation has completed + /// + /// # Note + /// This future is only lazy with respect to checking for completion, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. 
+ /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let idx = 53; + /// let val = 10; + /// let req = unsafe{ array.div(idx,val) }; + /// array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn div<'a>(&self, index: usize, val: T) -> ArrayOpHandle { + self.inner_array().initiate_batch_op( + val, + index, + ArrayOpCmd::Div, + self.as_lamellar_byte_array(), + ) + } + + /// This call performs a batched vesion of the [div][ArithmeticOps::div] function, + /// + /// Instead of a single value and index this function expects a list of `vals`, or a list of `indices` or both. + /// Please see the general [ArithmeticOps] documentation for more information on batch operation input + /// + /// A future is returned as the result of this call, which is used to detect when the operation has completed + /// + /// # Note + /// This future is only lazy with respect to checking for completion, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. + /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let indices = vec![3,54,12,88,29,68]; + /// let req = unsafe{ array.batch_div(indices,10) }; + /// array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn batch_div<'a>( + &self, + index: impl OpInput<'a, usize>, + val: impl OpInput<'a, T>, + ) -> ArrayBatchOpHandle { + // self.inner_array().initiate_op(val, index, ArrayOpCmd::Div) + self.inner_array().initiate_batch_op( + val, + index, + ArrayOpCmd::Div, + self.as_lamellar_byte_array(), + ) + } + + /// This call divides the element specified by `index` with the supplied `val`, returning the old value + /// + /// A future is returned as the result of this call, which is used to retrieve + /// the result after the (possibly remote) operation has finished. + /// + /// # Note + /// This future is only lazy with respect to retrieving the result, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. + /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let idx = 53; + /// let val = 10; + /// let req = unsafe{ array.fetch_div(idx,val) }; + /// let old = array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn fetch_div<'a>(&self, index: usize, val: T) -> ArrayFetchOpHandle { + self.inner_array() + .initiate_batch_fetch_op_2( + val, + index, + ArrayOpCmd::FetchDiv, + self.as_lamellar_byte_array(), + ) + .into() + } + + /// This call performs a batched vesion of the [fetch_div][ArithmeticOps::fetch_div] function, + /// + /// Instead of a single value and index this function expects a list of `vals`, or a list of `indices` or both. 
+ /// Please see the general [ArithmeticOps] documentation for more information on batch operation input + /// + /// A future is returned as the result of this call, which is used to retrieve + /// the results after the (possibly remote) operations have finished. + /// + /// # Note + /// This future is only lazy with respect to checking for completion, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. + /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let indices = vec![3,54,12,88,29,68]; + /// let req = unsafe{ array.batch_fetch_div(indices,10) }; + /// let old_vals = array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn batch_fetch_div<'a>( + &self, + index: impl OpInput<'a, usize>, + val: impl OpInput<'a, T>, + ) -> ArrayFetchBatchOpHandle { + self.inner_array().initiate_batch_fetch_op_2( + val, + index, + ArrayOpCmd::FetchDiv, + self.as_lamellar_byte_array(), + ) + } + + /// This call divides the element specified by `index` with the supplied `val` and stores the result + /// + /// A future is returned as the result of this call, which is used to detect when the operation has completed + /// + /// # Note + /// This future is only lazy with respect to checking for completion, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. + /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let idx = 53; + /// let val = 10; + /// let req = unsafe{ array.rem(idx,val) }; + /// array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn rem<'a>(&self, index: usize, val: T) -> ArrayOpHandle { + self.inner_array().initiate_batch_op( + val, + index, + ArrayOpCmd::Rem, + self.as_lamellar_byte_array(), + ) + } + + /// This call performs a batched vesion of the [rem][ArithmeticOps::rem] function, + /// + /// Instead of a single value and index this function expects a list of `vals`, or a list of `indices` or both. + /// Please see the general [ArithmeticOps] documentation for more information on batch operation input + /// + /// A future is returned as the result of this call, which is used to detect when the operation has completed + /// + /// # Note + /// This future is only lazy with respect to checking for completion, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. 
+ /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let indices = vec![3,54,12,88,29,68]; + /// let req = unsafe{ array.batch_rem(indices,10) }; + /// array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn batch_rem<'a>( + &self, + index: impl OpInput<'a, usize>, + val: impl OpInput<'a, T>, + ) -> ArrayBatchOpHandle { + // self.inner_array().initiate_op(val, index, ArrayOpCmd::Rem) + self.inner_array().initiate_batch_op( + val, + index, + ArrayOpCmd::Rem, + self.as_lamellar_byte_array(), + ) + } + + /// This call divides the element specified by `index` with the supplied `val`, returning the old value + /// + /// A future is returned as the result of this call, which is used to retrieve + /// the result after the (possibly remote) operation has finished. + /// + /// # Note + /// This future is only lazy with respect to retrieving the result, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. + /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let idx = 53; + /// let val = 10; + /// let req = unsafe{ array.fetch_rem(idx,val) }; + /// let old = array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn fetch_rem<'a>(&self, index: usize, val: T) -> ArrayFetchOpHandle { + self.inner_array() + .initiate_batch_fetch_op_2( + val, + index, + ArrayOpCmd::FetchRem, + self.as_lamellar_byte_array(), + ) + .into() + } + + /// This call performs a batched vesion of the [fetch_rem][ArithmeticOps::fetch_rem] function, + /// + /// Instead of a single value and index this function expects a list of `vals`, or a list of `indices` or both. + /// Please see the general [ArithmeticOps] documentation for more information on batch operation input + /// + /// A future is returned as the result of this call, which is used to retrieve + /// the results after the (possibly remote) operations have finished. + /// + /// # Note + /// This future is only lazy with respect to checking for completion, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. 
+ /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let indices = vec![3,54,12,88,29,68]; + /// let req = unsafe{ array.batch_fetch_rem(indices,10) }; + /// let old_vals = array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn batch_fetch_rem<'a>( + &self, + index: impl OpInput<'a, usize>, + val: impl OpInput<'a, T>, + ) -> ArrayFetchBatchOpHandle { + self.inner_array().initiate_batch_fetch_op_2( + val, + index, + ArrayOpCmd::FetchRem, + self.as_lamellar_byte_array(), + ) + } +} + #[doc(hidden)] pub trait LocalArithmeticOps { fn local_add(&self, index: usize, val: T) { diff --git a/src/array/operations/bitwise.rs b/src/array/operations/bitwise.rs index 8bae2eda..0c1b7f4d 100644 --- a/src/array/operations/bitwise.rs +++ b/src/array/operations/bitwise.rs @@ -532,6 +532,519 @@ pub trait BitWiseOps: private::LamellarArrayPrivate { } } +#[doc(alias("One-sided", "onesided"))] +/// The interface for performing remote bitwise operations on array elements +/// +/// These operations can be performed using any [LamellarWriteArray] type +/// +/// Both single element operations and batched element operations are provided +/// +/// Generally if you are performing a large number of operations it will be better to +/// use a batched version instead of multiple single element opertations. While the +/// Runtime internally performs message aggregation for both single element and batched +/// operations, single element operates have to be treated as individual requests, resulting +/// in allocation and bookkeeping overheads. A single batched call on the other hand is treated +/// as a single request by the runtime. (See [ReadOnlyOps] for an example comparing single vs batched load operations of a list of indices) +/// +/// The results of a batched operation are returned to the user in the same order as the input indices. +/// +/// # One-sided Operation +/// performing either single or batched operations are both one-sided, with the calling PE performing any necessary work to +/// initate and execute active messages that are sent to remote PEs. +/// For Ops that return results, the result will only be available on the calling PE. +/// +/// # Note +/// For both single index and batched operations there are no guarantees to the order in which individual operations occur (an individal operation is guaranteed to be atomic though). 
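A usage-level sketch of the bitwise interface below, treating each element as a small bitmask (not part of the patch; the `READY`/`DONE` flags and the one-slot-per-PE layout are illustrative assumptions):

```rust
use lamellar::array::prelude::*;

const READY: usize = 0b01;
const DONE: usize = 0b10;

fn main() {
    let world = LamellarWorldBuilder::new().build();
    let flags = UnsafeArray::<usize>::new(&world, world.num_pes(), Distribution::Block);
    let my_pe = world.my_pe();

    // Set the READY bit in this PE's slot and learn whether it was already set.
    let prev = flags.block_on(unsafe { flags.fetch_bit_or(my_pe, READY) });
    let already_ready = (prev & READY) != 0;

    // Later, mark DONE and clear READY by and-ing with its complement.
    flags.block_on(unsafe { flags.bit_or(my_pe, DONE) });
    flags.block_on(unsafe { flags.bit_and(my_pe, !READY) });
    let _ = already_ready;
}
```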
+/// +/// # Batched Types +/// Three types of batched operations can be performed +/// ## One Value - Many Indicies +/// In this type, the same value will be applied to the provided indices +///``` +/// use lamellar::array::prelude::*; +/// +/// let world = LamellarWorldBuilder::new().build(); +/// let array = UnsafeArray::::new(&world,100,Distribution::Block); +/// +/// let indices = vec![3,54,12,88,29,68]; +/// let val = 0b100101001; +/// array.block_on(unsafe{array.batch_fetch_bit_and(indices,val)}); +///``` +/// ## Many Values - One Index +/// In this type, multiple values will be applied to the given index +///``` +/// use lamellar::array::prelude::*; +/// +/// let world = LamellarWorldBuilder::new().build(); +/// let array = UnsafeArray::::new(&world,100,Distribution::Block); +/// +/// let vals = vec![0x3,0x54,0b11101,88,29,0x68]; +/// let index = 10; +/// array.block_on(unsafe{array.batch_bit_or(index,vals)}); +///``` +/// ## Many Values - Many Indicies +/// In this type, values and indices have a one-to-one correspondance. +/// +/// If the two lists are unequal in length, the longer of the two will be truncated so that it matches the length of the shorter +///``` +/// use lamellar::array::prelude::*; +/// +/// let world = LamellarWorldBuilder::new().build(); +/// let array = UnsafeArray::::new(&world,100,Distribution::Block); +/// +/// let indices = vec![3,54,12,88,29,68]; +/// let vals = vec![0x12,2,1,0b10000,12,0x13]; +/// array.block_on(unsafe{array.batch_fetch_bit_or(indices,vals)}); +///``` +pub trait UnsafeBitWiseOps: private::LamellarArrayPrivate { + /// This call performs a bitwise `and` with the element specified by `index` and the supplied `val`. + /// + /// A future is returned as the result of this call, which is used to detect when the operation has completed. + /// + /// # Note + /// This future is only lazy with respect to checking for completion, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. + /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let idx = 53; + /// let val = 0b100101001; + /// let req = unsafe{ array.bit_and(idx,val)}; + /// array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn bit_and<'a>(&self, index: usize, val: T) -> ArrayOpHandle { + self.inner_array().initiate_batch_op( + val, + index, + ArrayOpCmd::And, + self.as_lamellar_byte_array(), + ) + } + + /// This call performs a batched vesion of the [bit_and][BitWiseOps::bit_and] function, + /// + /// Instead of a single value and index this function expects a list of `vals`, or a list of `indices` or both. + /// Please see the general [BitWiseOps] documentation for more information on batch operation input + /// + /// A future is returned as the result of this call, which is used to detect when the operation has completed + /// + /// # Note + /// This future is only lazy with respect to checking for completion, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. 
+ /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let indices = vec![3,54,12,88,29,68]; + /// let req = unsafe{ array.batch_bit_and(indices,10)}; + /// array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn batch_bit_and<'a>( + &self, + index: impl OpInput<'a, usize>, + val: impl OpInput<'a, T>, + ) -> ArrayBatchOpHandle { + // self.inner_array().initiate_op(val, index, ArrayOpCmd::And) + self.inner_array().initiate_batch_op( + val, + index, + ArrayOpCmd::And, + self.as_lamellar_byte_array(), + ) + } + + /// This call performs a bitwise `and` with the element specified by `index` and the supplied `val`, returning the old value + /// + /// A future is returned as the result of this call, which is used to retrieve + /// the result after the (possibly remote) operation has finished. + /// + /// # Note + /// This future is only lazy with respect to retrieving the result, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. + /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let idx = 53; + /// let val = 10; + /// let req = unsafe{ array.fetch_bit_and(idx,val)}; + /// let old = array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn fetch_bit_and<'a>(&self, index: usize, val: T) -> ArrayFetchOpHandle { + self.inner_array() + .initiate_batch_fetch_op_2( + val, + index, + ArrayOpCmd::FetchAnd, + self.as_lamellar_byte_array(), + ) + .into() + } + + /// This call performs a batched vesion of the [fetch_bit_and][BitWiseOps::fetch_bit_and] function, + /// + /// Instead of a single value and index this function expects a list of `vals`, or a list of `indices` or both. + /// Please see the general [BitWiseOps] documentation for more information on batch operation input + /// + /// A future is returned as the result of this call, which is used to retrieve + /// the results after the (possibly remote) operations have finished. + /// + /// # Note + /// This future is only lazy with respect to checking for completion, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. + /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let indices = vec![3,54,12,88,29,68]; + /// let req = unsafe{ array.batch_fetch_bit_and(indices,10)}; + /// let old_vals = array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn batch_fetch_bit_and<'a>( + &self, + index: impl OpInput<'a, usize>, + val: impl OpInput<'a, T>, + ) -> ArrayFetchBatchOpHandle { + self.inner_array().initiate_batch_fetch_op_2( + val, + index, + ArrayOpCmd::FetchAnd, + self.as_lamellar_byte_array(), + ) + } + + /// This call performs a bitwise `or` with the element specified by `index` and the supplied `val`. 
+ /// + /// A future is returned as the result of this call, which is used to detect when the operation has completed. + /// + /// # Note + /// This future is only lazy with respect to checking for completion, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. + /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let idx = 53; + /// let val = 0b100101001; + /// let req = unsafe{ array.bit_or(idx,val)}; + /// array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn bit_or<'a>(&self, index: usize, val: T) -> ArrayOpHandle { + self.inner_array().initiate_batch_op( + val, + index, + ArrayOpCmd::Or, + self.as_lamellar_byte_array(), + ) + } + + /// This call performs a batched vesion of the [bit_or][BitWiseOps::bit_or] function, + /// + /// Instead of a single value and index this function expects a list of `vals`, or a list of `indices` or both. + /// Please see the general [BitWiseOps] documentation for more information on batch operation input + /// + /// A future is returned as the result of this call, which is used to detect when the operation has completed + /// + /// # Note + /// This future is only lazy with respect to checking for completion, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. + /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let indices = vec![3,54,12,88,29,68]; + /// let req = unsafe{ array.batch_bit_or(indices,10)}; + /// array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn batch_bit_or<'a>( + &self, + index: impl OpInput<'a, usize>, + val: impl OpInput<'a, T>, + ) -> ArrayBatchOpHandle { + // self.inner_array().initiate_op(val, index, ArrayOpCmd::Or) + self.inner_array().initiate_batch_op( + val, + index, + ArrayOpCmd::Or, + self.as_lamellar_byte_array(), + ) + } + + /// This call performs a bitwise `or` with the element specified by `index` and the supplied `val`, returning the old value + /// + /// A future is returned as the result of this call, which is used to retrieve + /// the result after the (possibly remote) operation has finished. + /// + /// # Note + /// This future is only lazy with respect to retrieving the result, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. 
+ /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let idx = 53; + /// let val = 10; + /// let req = unsafe{ array.fetch_bit_or(idx,val)}; + /// let old = array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn fetch_bit_or<'a>(&self, index: usize, val: T) -> ArrayFetchOpHandle { + self.inner_array() + .initiate_batch_fetch_op_2( + val, + index, + ArrayOpCmd::FetchOr, + self.as_lamellar_byte_array(), + ) + .into() + } + + /// This call performs a batched vesion of the [fetch_bit_or][BitWiseOps::fetch_bit_or] function, + /// + /// Instead of a single value and index this function expects a list of `vals`, or a list of `indices` or both. + /// Please see the general [BitWiseOps] documentation for more information on batch operation input + /// + /// A future is returned as the result of this call, which is used to retrieve + /// the results after the (possibly remote) operations have finished. + /// + /// # Note + /// This future is only lazy with respect to checking for completion, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. + /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let indices = vec![3,54,12,88,29,68]; + /// let req = unsafe{ array.batch_fetch_bit_or(indices,10)}; + /// let old_vals = array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn batch_fetch_bit_or<'a>( + &self, + index: impl OpInput<'a, usize>, + val: impl OpInput<'a, T>, + ) -> ArrayFetchBatchOpHandle { + self.inner_array().initiate_batch_fetch_op_2( + val, + index, + ArrayOpCmd::FetchOr, + self.as_lamellar_byte_array(), + ) + } + + /// This call performs a bitwise `xor` with the element specified by `index` and the supplied `val`. + /// + /// A future is returned as the result of this call, which is used to detect when the operation has completed. + /// + /// # Note + /// This future is only lazy with respect to checking for completion, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. + /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let idx = 53; + /// let val = 0b100101001; + /// let req = unsafe{ array.bit_xor(idx,val)}; + /// array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn bit_xor<'a>(&self, index: usize, val: T) -> ArrayOpHandle { + self.inner_array().initiate_batch_op( + val, + index, + ArrayOpCmd::Xor, + self.as_lamellar_byte_array(), + ) + } + + /// This call performs a batched vesion of the [bit_xor][BitWiseOps::bit_xor] function, + /// + /// Instead of a single value and index this function expects a list of `vals`, or a list of `indices` or both. 
+ /// Please see the general [BitWiseOps] documentation for more information on batch operation input + /// + /// A future is returned as the result of this call, which is used to detect when the operation has completed + /// + /// # Note + /// This future is only lazy with respect to checking for completion, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. + /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let indices = vec![3,54,12,88,29,68]; + /// let req = unsafe{ array.batch_bit_xor(indices,10)}; + /// array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn batch_bit_xor<'a>( + &self, + index: impl OpInput<'a, usize>, + val: impl OpInput<'a, T>, + ) -> ArrayBatchOpHandle { + // self.inner_array().initiate_op(val, index, ArrayOpCmd::Xor) + self.inner_array().initiate_batch_op( + val, + index, + ArrayOpCmd::Xor, + self.as_lamellar_byte_array(), + ) + } + + /// This call performs a bitwise `xor` with the element specified by `index` and the supplied `val`, returning the old value + /// + /// A future is returned as the result of this call, which is used to retrieve + /// the result after the (possibly remote) operation has finished. + /// + /// # Note + /// This future is only lazy with respect to retrieving the result, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. + /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let idx = 53; + /// let val = 10; + /// let req = unsafe{ array.fetch_bit_xor(idx,val)}; + /// let old = array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn fetch_bit_xor<'a>(&self, index: usize, val: T) -> ArrayFetchOpHandle { + self.inner_array() + .initiate_batch_fetch_op_2( + val, + index, + ArrayOpCmd::FetchXor, + self.as_lamellar_byte_array(), + ) + .into() + } + + /// This call performs a batched vesion of the [fetch_bit_xor][BitWiseOps::fetch_bit_xor] function, + /// + /// Instead of a single value and index this function expects a list of `vals`, or a list of `indices` or both. + /// Please see the general [BitWiseOps] documentation for more information on batch operation input + /// + /// A future is returned as the result of this call, which is used to retrieve + /// the results after the (possibly remote) operations have finished. + /// + /// # Note + /// This future is only lazy with respect to checking for completion, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. 
+ /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let indices = vec![3,54,12,88,29,68]; + /// let req = unsafe{ array.batch_fetch_bit_xor(indices,10)}; + /// let old_vals = array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn batch_fetch_bit_xor<'a>( + &self, + index: impl OpInput<'a, usize>, + val: impl OpInput<'a, T>, + ) -> ArrayFetchBatchOpHandle { + self.inner_array().initiate_batch_fetch_op_2( + val, + index, + ArrayOpCmd::FetchXor, + self.as_lamellar_byte_array(), + ) + } +} + #[doc(hidden)] pub trait LocalBitWiseOps { fn local_bit_and(&self, index: usize, val: T) { diff --git a/src/array/operations/compare_exchange.rs b/src/array/operations/compare_exchange.rs index ce3a5edb..93497a82 100644 --- a/src/array/operations/compare_exchange.rs +++ b/src/array/operations/compare_exchange.rs @@ -348,3 +348,337 @@ pub trait CompareExchangeEpsilonOps: ) } } + +#[doc(alias("One-sided", "onesided"))] +/// The interface for performing remote compare and exchange operations on array elements +/// +/// These operations can be performed using any [LamellarWriteArray] type +/// +/// Both single element operations and batched element operations are provided +/// +/// Generally if you are performing a large number of operations it will be better to +/// use a batched version instead of multiple single element opertations. While the +/// Runtime internally performs message aggregation for both single element and batched +/// operations, single element operates have to be treated as individual requests, resulting +/// in allocation and bookkeeping overheads. A single batched call on the other hand is treated +/// as a single request by the runtime. (See [ReadOnlyOps] for an example comparing single vs batched load operations of a list of indices) +/// +/// The results of a batched operation are returned to the user in the same order as the input indices. +/// +/// # One-sided Operation +/// performing either single or batched operations are both one-sided, with the calling PE performing any necessary work to +/// initate and execute active messages that are sent to remote PEs. +/// For Ops that return results, the result will only be available on the calling PE. +/// +/// # Note +/// For both single index and batched operations there are no guarantees to the order in which individual operations occur (an individal operation is guaranteed to be atomic though) +/// +/// # Batched Types +/// Three types of batched operations can be performed +/// +/// Currently only the indicies and new values can be batched, for all the batch types below you can only pass a single `current val` which will be used in each individual operation of the batch +/// We plan to support batched `current vals` in a future release. 
+/// ## One Value - Many Indicies +/// In this type, the same value will be applied to the provided indices +///``` +/// use lamellar::array::prelude::*; +/// +/// let world = LamellarWorldBuilder::new().build(); +/// let array = UnsafeArray::::new(&world,100,Distribution::Block); +/// +/// let indices = vec![3,54,12,88,29,68]; +/// let current = 0; +/// let new = 10; +/// array.block_on(unsafe{array.batch_compare_exchange(indices,current,new)}); +///``` +/// ## Many Values - One Index +/// In this type, multiple values will be applied to the given index +///``` +/// use lamellar::array::prelude::*; +/// +/// let world = LamellarWorldBuilder::new().build(); +/// let array = UnsafeArray::::new(&world,100,Distribution::Block); +/// +/// let new_vals = vec![3,54,11101,88,29,68]; +/// let current = 0; +/// let index = 10; +/// array.block_on(unsafe{array.batch_compare_exchange(index,current,new_vals)}); +///``` +/// ## Many Values - Many Indicies +/// In this type, values and indices have a one-to-one correspondance. +/// +/// If the two lists are unequal in length, the longer of the two will be truncated so that it matches the length of the shorter +///``` +/// use lamellar::array::prelude::*; +/// +/// let world = LamellarWorldBuilder::new().build(); +/// let array = UnsafeArray::::new(&world,100,Distribution::Block); +/// +/// let indices = vec![3,54,12,88,29,68]; +/// let new_vals = vec![12,2,1,10000,12,13]; +/// let current = 0; +/// array.block_on(unsafe{array.batch_compare_exchange(indices,current,new_vals)}); +///``` +pub trait UnsafeCompareExchangeOps: + private::LamellarArrayPrivate +{ + /// This call stores the `new` value into the element specified by `index` if the current value is the same as `current`. + /// + /// the return value is a result indicating whether the new value was written into the element and contains the previous value. + /// On success this previous value is gauranteed to be equal to `current` + /// + /// A future is returned as the result of this call, which is used to detect when the operation has completed and retrieve the returned value. + /// + /// # Note + /// This future is only lazy with respect to checking for completion, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. + /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let idx = 53; + /// let val = 10; + /// let current = 0; + /// let req = unsafe{ array.compare_exchange(idx,current,val)}; + /// let result = array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn compare_exchange<'a>( + &self, + index: usize, + current: T, + new: T, + ) -> ArrayResultOpHandle { + self.inner_array() + .initiate_batch_result_op_2( + new, + index, + ArrayOpCmd::CompareExchange(current), + self.as_lamellar_byte_array(), + ) + .into() + } + + /// This call performs a batched vesion of the [compare_exchange][CompareExchangeOps::compare_exchange] function, + /// + /// Instead of a single value and index this function expects a list of (new)`vals`, or a list of `indices` or both. + /// Note that presently only a single `current` value can be provided, and will be used for all operations in the batch. 
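Given the single-`current` restriction noted above, one workable pattern (sketched here, not part of the patch) is to group updates by the value each element is expected to hold and issue one batched call per group. The helper name `grouped_compare_exchange` and the `usize` element type are illustrative only; `batch_compare_exchange` and `block_on` are used exactly as in the surrounding examples. Whether this is worthwhile depends on how many distinct expected values there are, since it simply trades one batch for several.

```
use lamellar::array::prelude::*;
use std::collections::HashMap;

// Sketch: emulate per-element `current` values on top of the single-`current`
// batch API by issuing one batch_compare_exchange per distinct expected value.
fn grouped_compare_exchange(
    array: &UnsafeArray<usize>,
    expected: &[(usize, usize)], // (index, value we believe it currently holds)
    new: usize,
) {
    let mut groups: HashMap<usize, Vec<usize>> = HashMap::new();
    for &(idx, cur) in expected {
        groups.entry(cur).or_default().push(idx);
    }
    for (cur, indices) in groups {
        let req = unsafe { array.batch_compare_exchange(indices, cur, new) };
        let _results = array.block_on(req); // one result per index in this group
    }
}
```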
+ /// Please see the general [CompareExchangeOps] documentation for more information on batch operation input + /// + /// A future is returned as the result of this call, which is used to retrieve + /// the results after the (possibly remote) operations have finished. + /// + /// # Note + /// This future is only lazy with respect to checking for completion, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. + /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let indices = vec![3,54,12,88,29,68]; + /// let current = 0; + /// let req = unsafe{ array.batch_compare_exchange(indices,current,10)}; + /// let results = array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn batch_compare_exchange<'a>( + &self, + index: impl OpInput<'a, usize>, + current: T, + new: impl OpInput<'a, T>, + ) -> ArrayResultBatchOpHandle { + self.inner_array().initiate_batch_result_op_2( + new, + index, + ArrayOpCmd::CompareExchange(current), + self.as_lamellar_byte_array(), + ) + } +} + +#[doc(alias("One-sided", "onesided"))] +/// The interface for performing remote compare and exchange operations within a given epsilon on array elements +/// +/// Useful for element types that only impl [PartialEq][std::cmp::PartialEq] instead of [Eq][std::cmp::Eq] (e.g `f32`,`f64`). +/// +/// These operations can be performed using any [LamellarWriteArray] type +/// +/// Both single element operations and batched element operations are provided +/// +/// Generally if you are performing a large number of operations it will be better to +/// use a batched version instead of multiple single element opertations. While the +/// Runtime internally performs message aggregation for both single element and batched +/// operations, single element operates have to be treated as individual requests, resulting +/// in allocation and bookkeeping overheads. A single batched call on the other hand is treated +/// as a single request by the runtime. (See [ReadOnlyOps] for an example comparing single vs batched load operations of a list of indices) +/// +/// The results of a batched operation are returned to the user in the same order as the input indices. +/// +/// # One-sided Operation +/// performing either single or batched operations are both one-sided, with the calling PE performing any necessary work to +/// initate and execute active messages that are sent to remote PEs. +/// For Ops that return results, the result will only be available on the calling PE. +/// +/// # Note +/// For both single index and batched operations there are no guarantees to the order in which individual operations occur (an individal operation is guaranteed to be atomic though). +/// +/// # Batched Types +/// Three types of batched operations can be performed +/// +/// Currently only the indicies and new values can be batched, for all the batch types below you can only pass a single `current val` and a single `epsilon` which will be used in each individual operation of the batch +/// We plan to support batched `current vals` and `epsilons` in a future release. 
+///
+/// ## One Value - Many Indices
+/// In this type, the same value will be applied to the provided indices
+///```
+/// use lamellar::array::prelude::*;
+///
+/// let world = LamellarWorldBuilder::new().build();
+/// let array = UnsafeArray::::new(&world,100,Distribution::Block);
+///
+/// let indices = vec![3,54,11,88,29,68];
+/// let current = 0.0;
+/// let new = 10.5;
+/// let epsilon = 0.1;
+/// array.block_on(unsafe{array.batch_compare_exchange_epsilon(indices,current,new,epsilon)});
+///```
+/// ## Many Values - One Index
+/// In this type, multiple values will be applied to the given index
+///```
+/// use lamellar::array::prelude::*;
+///
+/// let world = LamellarWorldBuilder::new().build();
+/// let array = UnsafeArray::::new(&world,100,Distribution::Block);
+///
+/// let new_vals = vec![3.0,54.8,12.9,88.1,29.2,68.9];
+/// let current = 0.0;
+/// let index = 10;
+/// let epsilon = 0.1;
+/// array.block_on(unsafe{array.batch_compare_exchange_epsilon(index,current,new_vals,epsilon)});
+///```
+/// ## Many Values - Many Indices
+/// In this type, values and indices have a one-to-one correspondence.
+///
+/// If the two lists are unequal in length, the longer of the two will be truncated so that it matches the length of the shorter
+///```
+/// use lamellar::array::prelude::*;
+///
+/// let world = LamellarWorldBuilder::new().build();
+/// let array = UnsafeArray::::new(&world,100,Distribution::Block);
+///
+/// let indices = vec![3,54,12,88,29,68];
+/// let new_vals = vec![12.1,2.321,1.7,10000.0,12.4,13.7];
+/// let current = 0.0;
+/// let epsilon = 0.1;
+/// array.block_on(unsafe{array.batch_compare_exchange_epsilon(indices,current,new_vals,epsilon)});
+///```
+pub trait UnsafeCompareExchangeEpsilonOps:
+    private::LamellarArrayPrivate
+{
+    /// This call stores the `new` value into the element specified by `index` if the current value is the same as `current` plus or minus `epsilon`.
+    ///
+    /// e.g. ``` if current - epsilon < array[index] && array[index] < current + epsilon { array[index] = new }```
+    ///
+    /// The return value is a result indicating whether the new value was written into the element and contains the previous value.
+    /// On success this previous value is guaranteed to be within epsilon of `current`
+    ///
+    /// A future is returned as the result of this call, which is used to detect when the operation has completed and retrieve the returned value.
+    ///
+    /// # Note
+    /// This future is only lazy with respect to checking for completion, not
+    /// with respect to launching the operation. That is, the operation will
+    /// occur regardless of if the future is ever polled or not, Enabling
+    /// a "fire and forget" programming model.
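The acceptance window quoted in the doc comment above is an open interval around `current`; a minimal local restatement of that predicate (this is only an illustration of what the doc describes, not the library's implementation):

```
fn within_epsilon(stored: f32, current: f32, epsilon: f32) -> bool {
    // open interval: values exactly epsilon away are rejected
    current - epsilon < stored && stored < current + epsilon
}

fn main() {
    let epsilon = 0.1_f32;
    assert!(within_epsilon(0.05, 0.0, epsilon));  // close enough, the swap would proceed
    assert!(!within_epsilon(0.25, 0.0, epsilon)); // too far, the element is left untouched
}
```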
+ /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let idx = 53; + /// let val = 10.3; + /// let current = 0.0; + /// let epsilon = 0.1; + /// let req = unsafe{ array.compare_exchange_epsilon(idx,current,val,epsilon)}; + /// let result = array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn compare_exchange_epsilon<'a>( + &self, + index: usize, + current: T, + new: T, + eps: T, + ) -> ArrayResultOpHandle { + self.inner_array() + .initiate_batch_result_op_2( + new, + index, + ArrayOpCmd::CompareExchangeEps(current, eps), + self.as_lamellar_byte_array(), + ) + .into() + } + + /// This call performs a batched vesion of the [compare_exchange_epsilon][CompareExchangeEpsilonOps::compare_exchange_epsilon] function, + /// + /// Instead of a single value and index this function expects a list of (new)`vals`, or a list of `indices` or both. + /// Note that presently only a single `current` value and a single `epsilon` value can be provided, and they will be used for all operations in the batch. + /// Please see the general [CompareExchangeEpsilonOps] documentation for more information on batch operation input + /// + /// A future is returned as the result of this call, which is used to retrieve + /// the results after the (possibly remote) operations have finished. + /// + /// # Note + /// This future is only lazy with respect to checking for completion, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. + /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let indices = vec![3,54,12,88,29,68]; + /// let current = 0.0; + /// let epsilon = 0.001; + /// let req = unsafe{ array.batch_compare_exchange_epsilon(indices,current,10.321,epsilon)}; + /// let results = array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn batch_compare_exchange_epsilon<'a>( + &self, + index: impl OpInput<'a, usize>, + current: T, + new: impl OpInput<'a, T>, + eps: T, + ) -> ArrayResultBatchOpHandle { + self.inner_array().initiate_batch_result_op_2( + new, + index, + ArrayOpCmd::CompareExchangeEps(current, eps), + self.as_lamellar_byte_array(), + ) + } +} diff --git a/src/array/operations/read_only.rs b/src/array/operations/read_only.rs index ac2f7dc1..ee4c9c83 100644 --- a/src/array/operations/read_only.rs +++ b/src/array/operations/read_only.rs @@ -122,3 +122,124 @@ pub trait ReadOnlyOps: private::LamellarArrayPrivate { ) } } + +#[doc(alias("One-sided", "onesided"))] +/// The interface for remotely reading elements +/// +/// These operations can be performed using any LamellarArray type. +/// +/// Both single element operations and batched element operations are provided +/// +/// Generally if you are performing a large number of operations it will be better to +/// use a batched version instead of multiple single element opertations. While the +/// Runtime internally performs message aggregation for both single element and batched +/// operations, single element operations have to be treated as individual requests, resulting +/// in allocation and bookkeeping overheads. 
A single batched call on the other hand is treated +/// as a single request by the runtime. +/// +/// The results of a batched operation are returned to the user in the same order as the input indices. +/// +/// # One-sided Operation +/// performing either single or batched operations are both one-sided, with the calling PE performing any necessary work to +/// initate and execute active messages that are sent to remote PEs. +/// For Ops that return results, the result will only be available on the calling PE. +/// +/// # Note +/// For both single index and batched operations there are no guarantees to the order in which individual operations occur +/// +/// # Examples +///``` +/// use lamellar::array::prelude::*; +/// use futures_util::future::join_all; +/// +/// let world = LamellarWorldBuilder::new().build(); +/// let array = UnsafeArray::::new(&world,100,Distribution::Block); +/// +/// let indices = vec![3,54,12,88,29,68]; +/// let reqs = indices.iter().map(|i| unsafe{array.load(*i)}).collect::>(); +/// let vals_1 = array.block_on(async move { +/// // reqs.into_iter().map(|req| req.await).collect::>() +/// join_all(reqs).await +/// }); +/// let req = unsafe{array.batch_load(indices)}; +/// let vals_2 = array.block_on(req); +/// for (v1,v2) in vals_1.iter().zip(vals_2.iter()){ +/// assert_eq!(v1,v2); +/// } +///``` +pub trait UnsafeReadOnlyOps: private::LamellarArrayPrivate { + /// This call returns the value of the element at the specified index + /// + /// A future is returned as the result of this call, which is used to retrieve + /// the result after the (possibly remote) operation as finished. + /// + /// # Note + /// This future is only lazy with respect to retrieving the result, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. + /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let req = unsafe{ array.load(53)}; + /// let val = array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn load<'a>(&self, index: usize) -> ArrayFetchOpHandle { + let dummy_val = self.inner_array().dummy_val(); //we dont actually do anything with this except satisfy apis; + // let array = self.inner_array(); + self.inner_array() + .initiate_batch_fetch_op_2( + dummy_val, + index, + ArrayOpCmd::Load, + self.as_lamellar_byte_array(), + ) + .into() + } + + /// This call performs a batched vesion of the [load][ReadOnlyOps::load] function, + /// return a vector of values rather than a single value. + /// + /// Instead of a single index, this function expects a list of indicies to load + /// (See the [OpInput] documentation for a description of valid input containers) + /// + /// A future is returned as the result of this call, which is used to retrieve + /// the results after the (possibly remote) operations have finished. + /// + /// # Note + /// This future is only lazy with respect to retrieving the result, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. 
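Since the trait-level documentation above states that batched results come back in the same order as the input indices, pairing them back up is a simple zip. A sketch under that assumption (it also assumes the resolved handle yields a `Vec`, as the `len()` check in the doc example suggests), reusing the `batch_load` and `block_on` calls shown in the surrounding examples:

```
use lamellar::array::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();
    let array = UnsafeArray::<usize>::new(&world, 100, Distribution::Block);

    let indices = vec![3, 54, 12, 88, 29, 68];
    let req = unsafe { array.batch_load(indices.clone()) };
    let vals = array.block_on(req);
    // results arrive in input order, so zipping recovers (index, value) pairs
    for (i, v) in indices.iter().zip(vals.iter()) {
        println!("array[{i}] = {v}");
    }
}
```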
+ /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let indices = vec![3,54,12,88,29,68]; + /// let req = unsafe{ array.batch_load(indices.clone())}; + /// let vals = array.block_on(req); + /// assert_eq!(vals.len(),indices.len()); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn batch_load<'a>(&self, index: impl OpInput<'a, usize>) -> ArrayFetchBatchOpHandle { + let dummy_val = self.inner_array().dummy_val(); //we dont actually do anything with this except satisfy apis; + self.inner_array().initiate_batch_fetch_op_2( + dummy_val, + index, + ArrayOpCmd::Load, + self.as_lamellar_byte_array(), + ) + } +} diff --git a/src/array/operations/shift.rs b/src/array/operations/shift.rs index d60673c7..a477cb6c 100644 --- a/src/array/operations/shift.rs +++ b/src/array/operations/shift.rs @@ -350,6 +350,306 @@ pub trait ShiftOps: private::LamellarArrayPrivate { } } +pub trait UnsafeShiftOps: private::LamellarArrayPrivate { + /// This call performs an in place left shift of `val` bits on the element specified by `index`. + /// + /// A future is returned as the result of this call, which is used to detect when the operation has completed. + /// + /// # Note + /// This future is only lazy with respect to checking for completion, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. + /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,4,Distribution::Block); + /// + /// let idx = 53; + /// let val = 2; + /// let req = unsafe{ array.shl(idx,val) }; + /// array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn shl(&self, index: usize, val: T) -> ArrayOpHandle { + self.inner_array().initiate_batch_op( + val, + index, + ArrayOpCmd::Shl, + self.as_lamellar_byte_array(), + ) + } + + /// This call performs a batched vesion of the [shl][ShiftOps::shl] function, + /// + /// Instead of a single value and index this function expects a list of `vals`, or a list of `indices` or both. + /// Please see the general [ShiftOps] documentation for more information on batch operation input + /// + /// A future is returned as the result of this call, which is used to detect when the operation has completed + /// + /// # Note + /// This future is only lazy with respect to checking for completion, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. 
+ /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let indices = vec![3,54,12,88,29,68]; + /// let req = unsafe{ array.batch_shl(indices,3) }; + /// array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn batch_shl<'a>( + &self, + index: impl OpInput<'a, usize>, + val: impl OpInput<'a, T>, + ) -> ArrayBatchOpHandle { + // self.inner_array().initiate_batch_op(val, index, ArrayOpCmd::Shl) + self.inner_array().initiate_batch_op( + val, + index, + ArrayOpCmd::Shl, + self.as_lamellar_byte_array(), + ) + } + + /// This call performs an in place left shift of `val` bits on the element specified by `index`, returning the old value + /// + /// A future is returned as the result of this call, which is used to retrieve + /// the result after the (possibly remote) operation has finished. + /// + /// # Note + /// This future is only lazy with respect to retrieving the result, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. + /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let idx = 53; + /// let val = 2; + /// let req = unsafe{ array.fetch_shl(idx,val) }; + /// let old = array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn fetch_shl(&self, index: usize, val: T) -> ArrayFetchOpHandle { + self.inner_array() + .initiate_batch_fetch_op_2( + val, + index, + ArrayOpCmd::FetchShl, + self.as_lamellar_byte_array(), + ) + .into() + } + + /// This call performs a batched vesion of the [fetch_shl][ShiftOps::fetch_shl] function, + /// + /// Instead of a single value and index this function expects a list of `vals`, or a list of `indices` or both. + /// Please see the general [ShiftOps] documentation for more information on batch operation input + /// + /// A future is returned as the result of this call, which is used to retrieve + /// the results after the (possibly remote) operations have finished. + /// + /// # Note + /// This future is only lazy with respect to checking for completion, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. + /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let indices = vec![3,54,12,88,29,68]; + /// let req = unsafe{ array.batch_fetch_shl(indices,10) }; + /// let old_vals = array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn batch_fetch_shl<'a>( + &self, + index: impl OpInput<'a, usize>, + val: impl OpInput<'a, T>, + ) -> ArrayFetchBatchOpHandle { + self.inner_array().initiate_batch_fetch_op_2( + val, + index, + ArrayOpCmd::FetchShl, + self.as_lamellar_byte_array(), + ) + } + + /// This call performs an in place right shift of `val` bits on the element specified by `index`. + /// + /// A future is returned as the result of this call, which is used to detect when the operation has completed. 
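A minimal local illustration (plain integers, no Lamellar calls) of the element-level update behind `shl`/`shr` and the old value the `fetch_` variants report back:

```
fn main() {
    let mut elem: usize = 20; // 0b1_0100
    let old = elem;           // what fetch_shl would return
    elem <<= 2;               // what shl leaves behind
    assert_eq!((old, elem), (20, 80));

    let old = elem;           // what fetch_shr would return
    elem >>= 3;               // what shr leaves behind
    assert_eq!((old, elem), (80, 10));
}
```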
+    ///
+    /// # Note
+    /// This future is only lazy with respect to checking for completion, not
+    /// with respect to launching the operation. That is, the operation will
+    /// occur regardless of if the future is ever polled or not, Enabling
+    /// a "fire and forget" programming model.
+    ///
+    /// # Examples
+    ///
+    ///```
+    /// use lamellar::array::prelude::*;
+    ///
+    /// let world = LamellarWorldBuilder::new().build();
+    /// let array = UnsafeArray::::new(&world,4,Distribution::Block);
+    ///
+    /// let idx = 53;
+    /// let val = 2;
+    /// let req = unsafe{ array.shr(idx,val) };
+    /// array.block_on(req);
+    ///```
+    //#[tracing::instrument(skip_all)]
+    unsafe fn shr<'a>(&self, index: usize, val: T) -> ArrayOpHandle {
+        self.inner_array().initiate_batch_op(
+            val,
+            index,
+            ArrayOpCmd::Shr,
+            self.as_lamellar_byte_array(),
+        )
+    }
+
+    /// This call performs a batched version of the [shr][ShiftOps::shr] function,
+    ///
+    /// Instead of a single value and index this function expects a list of `vals`, or a list of `indices` or both.
+    /// Please see the general [ShiftOps] documentation for more information on batch operation input
+    ///
+    /// A future is returned as the result of this call, which is used to detect when the operation has completed
+    ///
+    /// # Note
+    /// This future is only lazy with respect to checking for completion, not
+    /// with respect to launching the operation. That is, the operation will
+    /// occur regardless of if the future is ever polled or not, Enabling
+    /// a "fire and forget" programming model.
+    ///
+    /// # Examples
+    ///
+    ///```
+    /// use lamellar::array::prelude::*;
+    ///
+    /// let world = LamellarWorldBuilder::new().build();
+    /// let array = UnsafeArray::::new(&world,100,Distribution::Block);
+    ///
+    /// let indices = vec![3,54,12,88,29,68];
+    /// let req = unsafe{ array.batch_shr(indices,3) };
+    /// array.block_on(req);
+    ///```
+    //#[tracing::instrument(skip_all)]
+    unsafe fn batch_shr<'a>(
+        &self,
+        index: impl OpInput<'a, usize>,
+        val: impl OpInput<'a, T>,
+    ) -> ArrayBatchOpHandle {
+        // self.inner_array().initiate_batch_op(val, index, ArrayOpCmd::Shr)
+        self.inner_array().initiate_batch_op(
+            val,
+            index,
+            ArrayOpCmd::Shr,
+            self.as_lamellar_byte_array(),
+        )
+    }
+
+    /// This call performs an in place right shift of `val` bits on the element specified by `index`, returning the old value
+    ///
+    /// A future is returned as the result of this call, which is used to retrieve
+    /// the result after the (possibly remote) operation has finished.
+    ///
+    /// # Note
+    /// This future is only lazy with respect to retrieving the result, not
+    /// with respect to launching the operation. That is, the operation will
+    /// occur regardless of if the future is ever polled or not, Enabling
+    /// a "fire and forget" programming model.
+ /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let idx = 53; + /// let val = 2; + /// let req = unsafe{ array.fetch_shl(idx,val) }; + /// let old = array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn fetch_shr<'a>(&self, index: usize, val: T) -> ArrayFetchOpHandle { + self.inner_array() + .initiate_batch_fetch_op_2( + val, + index, + ArrayOpCmd::FetchShr, + self.as_lamellar_byte_array(), + ) + .into() + } + + /// This call performs a batched vesion of the [fetch_shr][ShiftOps::fetch_shr] function, + /// + /// Instead of a single value and index this function expects a list of `vals`, or a list of `indices` or both. + /// Please see the general [ShiftOps] documentation for more information on batch operation input + /// + /// A future is returned as the result of this call, which is used to retrieve + /// the results after the (possibly remote) operations have finished. + /// + /// # Note + /// This future is only lazy with respect to checking for completion, not + /// with respect to launching the operation. That is, the operation will + /// occur regardless of if the future is ever polled or not, Enabling + /// a "fire and forget" programming model. + /// + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// + /// let indices = vec![3,54,12,88,29,68]; + /// let req = unsafe{ array.batch_fetch_shr(indices,10) }; + /// let old_vals = array.block_on(req); + ///``` + //#[tracing::instrument(skip_all)] + unsafe fn batch_fetch_shr<'a>( + &self, + index: impl OpInput<'a, usize>, + val: impl OpInput<'a, T>, + ) -> ArrayFetchBatchOpHandle { + self.inner_array().initiate_batch_fetch_op_2( + val, + index, + ArrayOpCmd::FetchShr, + self.as_lamellar_byte_array(), + ) + } +} + #[doc(hidden)] pub trait LocalShiftOps { fn local_shl(&self, index: usize, val: T) { diff --git a/src/array/prelude.rs b/src/array/prelude.rs index 13075df6..055c900e 100644 --- a/src/array/prelude.rs +++ b/src/array/prelude.rs @@ -32,7 +32,7 @@ pub use crate::array::operations::{ AccessOps, ArithmeticOps, ArrayOps as _ArrayOps, BitWiseOps, CompareExchangeEpsilonOps, CompareExchangeOps, ElementArithmeticOps, ElementBitWiseOps, ElementCompareEqOps, ElementComparePartialEqOps, ElementOps, ElementShiftOps, LocalArithmeticOps, LocalAtomicOps, - LocalBitWiseOps, LocalShiftOps, OpInput, ReadOnlyOps, ShiftOps, + LocalBitWiseOps, LocalShiftOps, OpInput, ReadOnlyOps, ShiftOps, UnsafeAccessOps, UnsafeArithmeticOps, UnsafeBitWiseOps, UnsafeCompareExchangeEpsilonOps, UnsafeCompareExchangeOps, UnsafeShiftOps, UnsafeReadOnlyOps }; // pub use crate::array::operations::*; diff --git a/src/array/unsafe/local_chunks.rs b/src/array/unsafe/local_chunks.rs index d65a8a4f..8bbbf6fe 100644 --- a/src/array/unsafe/local_chunks.rs +++ b/src/array/unsafe/local_chunks.rs @@ -190,7 +190,7 @@ impl UnsafeArray { /// array.wait_all(); /// /// ``` - pub fn local_chunks(&self, chunk_size: usize) -> UnsafeLocalChunks { + pub unsafe fn local_chunks(&self, chunk_size: usize) -> UnsafeLocalChunks { UnsafeLocalChunks { chunk_size, index: 0, @@ -216,7 +216,7 @@ impl UnsafeArray { /// array.wait_all(); /// /// ``` - pub fn local_chunks_mut(&self, chunk_size: usize) -> UnsafeLocalChunksMut { + pub unsafe fn 
local_chunks_mut(&self, chunk_size: usize) -> UnsafeLocalChunksMut { UnsafeLocalChunksMut { chunk_size, index: 0, diff --git a/src/array/unsafe/operations.rs b/src/array/unsafe/operations.rs index ef724b8c..7a67766f 100644 --- a/src/array/unsafe/operations.rs +++ b/src/array/unsafe/operations.rs @@ -976,16 +976,19 @@ impl MultiValMultiIndex { } } -impl ReadOnlyOps for UnsafeArray {} +impl UnsafeReadOnlyOps for UnsafeArray {} -impl AccessOps for UnsafeArray {} +impl UnsafeAccessOps for UnsafeArray {} -impl ArithmeticOps for UnsafeArray {} +impl UnsafeArithmeticOps for UnsafeArray {} -impl BitWiseOps for UnsafeArray {} +impl UnsafeBitWiseOps for UnsafeArray {} -impl ShiftOps for UnsafeArray {} +impl UnsafeShiftOps for UnsafeArray {} -impl CompareExchangeOps for UnsafeArray {} +impl UnsafeCompareExchangeOps for UnsafeArray {} -impl CompareExchangeEpsilonOps for UnsafeArray {} +impl UnsafeCompareExchangeEpsilonOps + for UnsafeArray +{ +} diff --git a/src/barrier.rs b/src/barrier.rs index c73f04e1..1b88b2f6 100644 --- a/src/barrier.rs +++ b/src/barrier.rs @@ -139,7 +139,7 @@ impl Barrier { Note that barriers are often called internally for many collective operations, including constructing new LamellarTeams, LamellarArrays, and Darcs, as well as distributed iteration\n\ You may be seeing this message if you have called barrier within an async context (meaning it was executed on a worker thread).\n\ A full list of collective operations is found at https://docs.rs/lamellar/latest/lamellar/index.html?search=collective\n\ - The deadlock timeout can be set via the LAMELLAR_DEADLOCK_TIMEOUT environment variable, the current timeout is {} seconds\n\ + The deadlock timeout can be set via the LAMELLAR_DEADLOCK_WARNING_TIMEOUT environment variable, the current timeout is {} seconds\n\ To view backtrace set RUST_LIB_BACKTRACE=1\n\ {}", std::thread::current().id() diff --git a/src/darc.rs b/src/darc.rs index 5efcbed7..2d0dcc67 100644 --- a/src/darc.rs +++ b/src/darc.rs @@ -491,7 +491,7 @@ impl DarcInner { The runtime is currently waiting for all remaining references to this distributed object to be dropped.\n\ The object is likely a {:?} with {:?} remaining local references and {:?} remaining remote references, ref cnts by pe {ref_cnts_slice:?}\n\ An example where this can occur can be found at https://docs.rs/lamellar/latest/lamellar/array/struct.ReadOnlyArray.html#method.into_local_lock\n\ - The deadlock timeout can be set via the LAMELLAR_DEADLOCK_TIMEOUT environment variable, the current timeout is {} seconds\n\ + The deadlock timeout can be set via the LAMELLAR_DEADLOCK_WARNING_TIMEOUT environment variable, the current timeout is {} seconds\n\ To view backtrace set RUST_LIB_BACKTRACE=1\n\ {}", inner.my_pe, @@ -1832,7 +1832,7 @@ impl LamellarAM for DroppedWaitAM { println!("[{:?}][WARNING] -- Potential deadlock detected when trying to free distributed object.\n\ The runtime is currently waiting for all remaining references to this distributed object to be dropped.\n\ The current status of the object on each pe is {:?} with {:?} remaining local references and {:?} remaining remote references, ref cnts by pe {ref_cnts_slice:?}\n\ - the deadlock timeout can be set via the LAMELLAR_DEADLOCK_TIMEOUT environment variable, the current timeout is {} seconds\n\ + the deadlock timeout can be set via the LAMELLAR_DEADLOCK_WARNING_TIMEOUT environment variable, the current timeout is {} seconds\n\ To view backtrace set RUST_LIB_BACKTRACE=1\n\ {}", std::thread::current().id(), @@ -1872,7 +1872,7 @@ impl LamellarAM 
for DroppedWaitAM { println!("[{:?}][WARNING] --- Potential deadlock detected when trying to free distributed object.\n\ The runtime is currently waiting for all remaining references to this distributed object to be dropped.\n\ The current status of the object on each pe is {:?} with {:?} remaining local references and {:?} remaining remote references, ref cnts by pe {ref_cnts_slice:?}\n\ - the deadlock timeout can be set via the LAMELLAR_DEADLOCK_TIMEOUT environment variable, the current timeout is {} seconds\n\ + the deadlock timeout can be set via the LAMELLAR_DEADLOCK_WARNING_TIMEOUT environment variable, the current timeout is {} seconds\n\ To view backtrace set RUST_LIB_BACKTRACE=1\n\ {}", std::thread::current().id(), diff --git a/src/env_var.rs b/src/env_var.rs index ccb96489..3553ee77 100644 --- a/src/env_var.rs +++ b/src/env_var.rs @@ -43,7 +43,7 @@ use serde::Deserialize; use std::sync::OnceLock; -fn default_deadlock_timeout() -> f64 { +fn default_deadlock_warning_timeout() -> f64 { 600.0 } diff --git a/src/lamellar_alloc.rs b/src/lamellar_alloc.rs index c1a5aa46..dd16c019 100644 --- a/src/lamellar_alloc.rs +++ b/src/lamellar_alloc.rs @@ -279,7 +279,7 @@ impl LamellarAlloc for BTreeAlloc { val = self.try_malloc(size, align); if timer.elapsed().as_secs_f64() > config().deadlock_timeout { println!("[WARNING] Potential deadlock detected when trying to allocate more memory.\n\ - The deadlock timeout can be set via the LAMELLAR_DEADLOCK_TIMEOUT environment variable, the current timeout is {} seconds\n\ + The deadlock timeout can be set via the LAMELLAR_DEADLOCK_WARNING_TIMEOUT environment variable, the current timeout is {} seconds\n\ To view backtrace set RUST_LIB_BACKTRACE=1\n\ {}",config().deadlock_timeout,std::backtrace::Backtrace::capture()); timer = std::time::Instant::now(); diff --git a/src/lamellar_team.rs b/src/lamellar_team.rs index 28a2a189..424701b0 100644 --- a/src/lamellar_team.rs +++ b/src/lamellar_team.rs @@ -650,8 +650,8 @@ impl From for IntoLamellarTeam { } } -// Intenal Runtime handle to a lamellar team -// users generally don't need to use this +/// Intenal Runtime handle to a lamellar team +/// users generally don't need to use this #[doc(hidden)] pub struct ArcLamellarTeam { pub team: Arc, @@ -722,9 +722,8 @@ impl From>> for LamellarTeamRemotePtr { } /// Internal Runtime handle to a lamellar team -/// this is typicallyused by proc macros (hence why it is public) +/// this is typically used by proc macros (hence why it is public) /// end users should never use this directly and should instead use the [LamellarTeam] and/or [LamellarWorld] struct -#[doc(hidden)] pub struct LamellarTeamRT { #[allow(dead_code)] pub(crate) world: Option>>, @@ -1017,15 +1016,14 @@ impl LamellarTeamRT { // println!("team destroyed") } #[allow(dead_code)] - //#[tracing::instrument(skip_all)] pub fn get_pes(&self) -> Vec { self.arch.team_iter().collect::>() } - //#[tracing::instrument(skip_all)] + pub fn world_pe_id(&self) -> usize { self.world_pe } - //#[tracing::instrument(skip_all)] + pub fn team_pe_id(&self) -> Result { self.arch.team_pe(self.world_pe) } @@ -1215,7 +1213,7 @@ impl LamellarTeamRT { println!("[WARNING] Potential deadlock detected when trying construct a new LamellarTeam.\n\ Creating a team is a collective operation requiring all PEs associated with the Parent Team (or LamellarWorld) to enter the call, not just the PEs that will be part of the new team.\n\ The following indicates which PEs have not entered the call: {:?}\n\ - The deadlock timeout can be set via the 
LAMELLAR_DEADLOCK_TIMEOUT environment variable, the current timeout is {} seconds\n\ + The deadlock timeout can be set via the LAMELLAR_DEADLOCK_WARNING_TIMEOUT environment variable, the current timeout is {} seconds\n\ To view backtrace set RUST_LIB_BACKTRACE=1\n\ {}",status,config().deadlock_timeout,std::backtrace::Backtrace::capture() ); @@ -1303,7 +1301,7 @@ impl LamellarTeamRT { if s.elapsed().as_secs_f64() > config().deadlock_timeout { println!("[WARNING] Potential deadlock detected when trying to drop a LamellarTeam.\n\ The following indicates the dropped status on each PE: {:?}\n\ - The deadlock timeout can be set via the LAMELLAR_DEADLOCK_TIMEOUT environment variable, the current timeout is {} seconds\n\ + The deadlock timeout can be set via the LAMELLAR_DEADLOCK_WARNING_TIMEOUT environment variable, the current timeout is {} seconds\n\ To view backtrace set RUST_LIB_BACKTRACE=1\n\ {}", self.dropped.as_slice(), diff --git a/tests/array/arithmetic_ops/add_test.rs b/tests/array/arithmetic_ops/add_test.rs index 64c016e2..f98f640b 100644 --- a/tests/array/arithmetic_ops/add_test.rs +++ b/tests/array/arithmetic_ops/add_test.rs @@ -105,7 +105,8 @@ macro_rules! add_test{ for idx in 0..array.len(){ for _i in 0..(pe_max_val as usize){ - let _ = array.add(idx,(10_usize.pow((my_pe*2)as u32)) as $t).spawn(); + #[allow(unused_unsafe)] + let _ = unsafe{ array.add(idx,(10_usize.pow((my_pe*2)as u32)) as $t).spawn()}; } } array.wait_all(); @@ -134,7 +135,8 @@ macro_rules! add_test{ indices.shuffle(&mut rng); for idx in indices.iter() {//0..num_updates{ // let idx = rand_idx.sample(&mut rng); - let _ = array.add(*idx,(10_usize.pow((my_pe*2)as u32)) as $t).spawn(); + #[allow(unused_unsafe)] + let _ = unsafe{ array.add(*idx,(10_usize.pow((my_pe*2)as u32)) as $t).spawn()}; } array.wait_all(); array.barrier(); @@ -169,7 +171,8 @@ macro_rules! add_test{ sub_array.barrier(); for idx in 0..sub_array.len(){ for _i in 0..(pe_max_val as usize){ - let _ = sub_array.add(idx,(10_usize.pow((my_pe*2)as u32)) as $t).spawn(); + #[allow(unused_unsafe)] + let _ = unsafe{ sub_array.add(idx,(10_usize.pow((my_pe*2)as u32)) as $t).spawn()}; } } sub_array.wait_all(); @@ -195,7 +198,8 @@ macro_rules! add_test{ indices.shuffle(&mut rng); for idx in indices.iter(){ // in 0..num_updates{ // let idx = rand_idx.sample(&mut rng); - let _ = sub_array.add(*idx,(10_usize.pow((my_pe*2)as u32)) as $t).spawn(); + #[allow(unused_unsafe)] + let _ = unsafe{ sub_array.add(*idx,(10_usize.pow((my_pe*2)as u32)) as $t).spawn()}; } sub_array.wait_all(); sub_array.barrier(); @@ -231,7 +235,8 @@ macro_rules! add_test{ sub_array.barrier(); for idx in 0..sub_array.len(){ for _i in 0..(pe_max_val as usize){ - let _ = sub_array.add(idx,(10_usize.pow((my_pe*2)as u32)) as $t).spawn(); + #[allow(unused_unsafe)] + let _ = unsafe{ sub_array.add(idx,(10_usize.pow((my_pe*2)as u32)) as $t).spawn()}; } } sub_array.wait_all(); @@ -257,7 +262,8 @@ macro_rules! add_test{ indices.shuffle(&mut rng); for idx in indices.iter() {//0..num_updates{ // let idx = rand_idx.sample(&mut rng); - let _ = sub_array.add(*idx,(10_usize.pow((my_pe*2)as u32)) as $t).spawn(); + #[allow(unused_unsafe)] + let _ = unsafe{ sub_array.add(*idx,(10_usize.pow((my_pe*2)as u32)) as $t).spawn()}; } sub_array.wait_all(); sub_array.barrier(); @@ -349,51 +355,59 @@ macro_rules! 
input_test{ input_array.print(); //individual T------------------------------ for i in 0..array.len(){ - let _ = array.batch_add(i,1).spawn(); + #[allow(unused_unsafe)] + let _ = unsafe{ array.batch_add(i,1).spawn()}; } check_results!($array,array,num_pes,"T"); println!("passed T"); //individual T------------------------------ for i in 0..array.len(){ - let _ = array.batch_add(&i,1).spawn(); + #[allow(unused_unsafe)] + let _ = unsafe{ array.batch_add(&i,1).spawn()}; } check_results!($array,array,num_pes,"&T"); println!("passed &T"); //&[T]------------------------------ let vec=(0..array.len()).collect::>(); let slice = &vec[..]; - let _ = array.batch_add(slice,1).spawn(); + #[allow(unused_unsafe)] + let _ = unsafe{ array.batch_add(slice,1).spawn()}; check_results!($array,array,num_pes,"&[T]"); println!("passed &[T]"); //scoped &[T]------------------------------ { let vec=(0..array.len()).collect::>(); let slice = &vec[..]; - let _ = array.batch_add(slice,1).spawn(); + #[allow(unused_unsafe)] + let _ = unsafe{ array.batch_add(slice,1).spawn()}; } check_results!($array,array,num_pes,"scoped &[T]"); println!("passed scoped &[T]"); // Vec------------------------------ let vec=(0..array.len()).collect::>(); - let _ = array.batch_add(vec,1).spawn(); + #[allow(unused_unsafe)] + let _ = unsafe{ array.batch_add(vec,1).spawn()}; check_results!($array,array,num_pes,"Vec"); println!("passed Vec"); // &Vec------------------------------ let vec=(0..array.len()).collect::>(); - let _ = array.batch_add(&vec,1).spawn(); + #[allow(unused_unsafe)] + let _ = unsafe{ array.batch_add(&vec,1).spawn()}; check_results!($array,array,num_pes,"&Vec"); println!("passed &Vec"); // Scoped Vec------------------------------ { let vec=(0..array.len()).collect::>(); - let _ = array.batch_add(vec,1).spawn(); + #[allow(unused_unsafe)] + let _ = unsafe{ array.batch_add(vec,1).spawn()}; } check_results!($array,array,num_pes,"scoped Vec"); println!("passed scoped Vec"); // Scoped &Vec------------------------------ { let vec=(0..array.len()).collect::>(); - let _ = array.batch_add(&vec,1).spawn(); + #[allow(unused_unsafe)] + let _ = unsafe{ array.batch_add(&vec,1).spawn()}; } check_results!($array,array,num_pes,"scoped &Vec"); println!("passed scoped &Vec"); @@ -430,7 +444,8 @@ macro_rules! input_test{ // array.add(input_array.clone(),1); // check_results!($array,array,num_pes,"UnsafeArray"); // UnsafeArray------------------------------ - let _ = array.batch_add(unsafe{input_array.local_data()},1).spawn(); + #[allow(unused_unsafe)] + let _ = unsafe{ array.batch_add(unsafe{input_array.local_data()},1).spawn()}; check_results!($array,array,num_pes,"&UnsafeArray"); println!("passed &UnsafeArray"); @@ -439,7 +454,8 @@ macro_rules! input_test{ // array.add(input_array.clone(),1); // check_results!($array,array,num_pes,"ReadOnlyArray"); // ReadOnlyArray------------------------------ - let _ = array.batch_add(input_array.local_data(),1).spawn(); + #[allow(unused_unsafe)] + let _ = unsafe{ array.batch_add(input_array.local_data(),1).spawn()}; check_results!($array,array,num_pes,"&ReadOnlyArray"); println!("passed &ReadOnlyArray"); @@ -448,7 +464,8 @@ macro_rules! 
input_test{ // array.add(input_array.clone(),1); // check_results!($array,array,num_pes,"AtomicArray"); // AtomicArray------------------------------ - let _ = array.batch_add(&input_array.local_data(),1).spawn(); + #[allow(unused_unsafe)] + let _ = unsafe{ array.batch_add(&input_array.local_data(),1).spawn()}; check_results!($array,array,num_pes,"&AtomicArray"); println!("passed &AtomicArray"); @@ -457,7 +474,8 @@ macro_rules! input_test{ // array.add(input_array.clone(),1); // check_results!($array,array,num_pes,"LocalLockArray"); // LocalLockArray------------------------------ - let _ = array.batch_add(&input_array.blocking_read_local_data(),1).spawn(); + #[allow(unused_unsafe)] + let _ = unsafe{ array.batch_add(&input_array.blocking_read_local_data(),1).spawn()}; check_results!($array,array,num_pes,"&LocalLockArray"); println!("passed &LocalLockArray"); @@ -466,7 +484,8 @@ macro_rules! input_test{ // array.add(input_array.clone(),1); // check_results!($array,array,num_pes,"GlobalLockArray"); // GlobalLockArray------------------------------ - let _ = array.batch_add(&input_array.blocking_read_local_data(),1).spawn(); + #[allow(unused_unsafe)] + let _ = unsafe{ array.batch_add(&input_array.blocking_read_local_data(),1).spawn()}; check_results!($array,array,num_pes,"&GlobalLockArray"); println!("passed &GlobalLockArray"); } diff --git a/tests/array/arithmetic_ops/div_test.rs b/tests/array/arithmetic_ops/div_test.rs index 2b4a3229..2f7bdae9 100644 --- a/tests/array/arithmetic_ops/div_test.rs +++ b/tests/array/arithmetic_ops/div_test.rs @@ -96,7 +96,8 @@ macro_rules! div_test{ // array.print(); for idx in 0..array.len(){ for _i in 0..(max_updates as usize){ - let _ = array.div(idx,2 as $t).spawn(); + #[allow(unused_unsafe)] + let _ = unsafe{ array.div(idx,2 as $t).spawn()}; } } array.wait_all(); @@ -123,7 +124,8 @@ macro_rules! div_test{ // // sub_array.print(); for idx in 0..sub_array.len(){ for _i in 0..(max_updates as usize){ - let _ = sub_array.div(idx,2 as $t).spawn(); + #[allow(unused_unsafe)] + let _ = unsafe{ sub_array.div(idx,2 as $t).spawn()}; } } sub_array.wait_all(); @@ -149,7 +151,8 @@ macro_rules! div_test{ sub_array.barrier(); for idx in 0..sub_array.len(){ for _i in 0..(max_updates as usize){ - let _ = sub_array.div(idx,2 as $t).spawn(); + #[allow(unused_unsafe)] + let _ = unsafe{ sub_array.div(idx,2 as $t).spawn()}; } } sub_array.wait_all(); diff --git a/tests/array/arithmetic_ops/fetch_add_test.rs b/tests/array/arithmetic_ops/fetch_add_test.rs index 8e348c40..0aeb3a16 100644 --- a/tests/array/arithmetic_ops/fetch_add_test.rs +++ b/tests/array/arithmetic_ops/fetch_add_test.rs @@ -133,7 +133,8 @@ macro_rules! fetch_add_test{ for idx in 0..array.len(){ let mut reqs = vec![]; for _i in 0..(pe_max_val as usize){ - reqs.push(array.fetch_add(idx,1 as $t)); + #[allow(unused_unsafe)] + reqs.push( unsafe{ array.fetch_add(idx,1 as $t)}); } #[allow(unused_mut)] let mut prevs: std::collections::HashSet = std::collections::HashSet::new(); @@ -165,7 +166,8 @@ macro_rules! fetch_add_test{ // println!("2------------"); for _i in 0..num_updates{ let idx = rand_idx.sample(&mut rng); - reqs.push((array.fetch_add(idx,1 as $t),idx)) + #[allow(unused_unsafe)] + reqs.push(( unsafe{ array.fetch_add(idx,1 as $t)},idx)) } for (req,_idx) in reqs{ let _val = world.block_on(req) as usize; @@ -194,7 +196,8 @@ macro_rules! 
fetch_add_test{ for idx in 0..sub_array.len(){ let mut reqs = vec![]; for _i in 0..(pe_max_val as usize){ - reqs.push(sub_array.fetch_add(idx,1 as $t)); + #[allow(unused_unsafe)] + reqs.push( unsafe{ sub_array.fetch_add(idx,1 as $t)}); } #[allow(unused_mut)] let mut prevs: std::collections::HashSet = std::collections::HashSet::new(); @@ -225,7 +228,8 @@ macro_rules! fetch_add_test{ let mut reqs = vec![]; for _i in 0..num_updates{ let idx = rand_idx.sample(&mut rng); - reqs.push((sub_array.fetch_add(idx,1 as $t),idx)) + #[allow(unused_unsafe)] + reqs.push(( unsafe{ sub_array.fetch_add(idx,1 as $t)},idx)) } for (req,_idx) in reqs{ let _val = world.block_on(req) as usize; @@ -257,7 +261,8 @@ macro_rules! fetch_add_test{ for idx in 0..sub_array.len(){ let mut reqs = vec![]; for _i in 0..(pe_max_val as usize){ - reqs.push(sub_array.fetch_add(idx,1 as $t)); + #[allow(unused_unsafe)] + reqs.push( unsafe{ sub_array.fetch_add(idx,1 as $t)}); } #[allow(unused_mut)] let mut prevs: std::collections::HashSet = std::collections::HashSet::new(); @@ -288,7 +293,8 @@ macro_rules! fetch_add_test{ let mut reqs = vec![]; for _i in 0..num_updates{ let idx = rand_idx.sample(&mut rng); - reqs.push((sub_array.fetch_add(idx,1 as $t),idx)) + #[allow(unused_unsafe)] + reqs.push(( unsafe{ sub_array.fetch_add(idx,1 as $t)},idx)) } for (req,_idx) in reqs{ let _val = world.block_on(req) as usize; @@ -444,13 +450,15 @@ macro_rules! input_test{ //individual T------------------------------ let mut reqs = vec![]; for i in 0..array.len(){ - reqs.push(array.batch_fetch_add(i,1)); + #[allow(unused_unsafe)] + reqs.push( unsafe{ array.batch_fetch_add(i,1)}); } check_results!($array,array,num_pes,reqs,"T"); //individual T------------------------------ let mut reqs = vec![]; for i in 0..array.len(){ - reqs.push(array.batch_fetch_add(&i,1)); + #[allow(unused_unsafe)] + reqs.push( unsafe{ array.batch_fetch_add(&i,1)}); } check_results!($array,array,num_pes,reqs,"&T"); //&[T]------------------------------ @@ -461,14 +469,17 @@ macro_rules! input_test{ let vals_slice = &vals[..]; let mut reqs = vec![]; - reqs.push(array.batch_fetch_add(idx_slice,1)); + #[allow(unused_unsafe)] + reqs.push( unsafe{ array.batch_fetch_add(idx_slice,1)}); check_results!($array,array,num_pes,reqs,"&[T]"); // single_idx multi_ val - reqs.push(array.batch_fetch_add(_my_pe,&vals)); + #[allow(unused_unsafe)] + reqs.push( unsafe{ array.batch_fetch_add(_my_pe,&vals)}); let real_val = array.len(); check_results!($array,array,num_pes, real_val,reqs,"&[T]"); // multi_idx multi_ val - reqs.push(array.batch_fetch_add(idx_slice,vals_slice)); + #[allow(unused_unsafe)] + reqs.push(unsafe{array.batch_fetch_add(idx_slice,vals_slice)}); check_results!($array,array,num_pes,reqs,"&[T]"); //scoped &[T]------------------------------ @@ -476,31 +487,36 @@ macro_rules! 
input_test{ { let vec=(0..array.len()).collect::>(); let slice = &vec[..]; - reqs.push(array.batch_fetch_add(slice,1)); + #[allow(unused_unsafe)] + reqs.push( unsafe{ array.batch_fetch_add(slice,1)}); } check_results!($array,array,num_pes,reqs,"scoped &[T]"); // Vec------------------------------ let vec=(0..array.len()).collect::>(); let mut reqs = vec![]; - reqs.push(array.batch_fetch_add(vec,1)); + #[allow(unused_unsafe)] + reqs.push( unsafe{ array.batch_fetch_add(vec,1)}); check_results!($array,array,num_pes,reqs,"Vec"); // &Vec------------------------------ let mut reqs = vec![]; let vec=(0..array.len()).collect::>(); - reqs.push(array.batch_fetch_add(&vec,1)); + #[allow(unused_unsafe)] + reqs.push( unsafe{ array.batch_fetch_add(&vec,1)}); check_results!($array,array,num_pes,reqs,"&Vec"); // Scoped Vec------------------------------ let mut reqs = vec![]; { let vec=(0..array.len()).collect::>(); - reqs.push(array.batch_fetch_add(vec,1)); + #[allow(unused_unsafe)] + reqs.push( unsafe{ array.batch_fetch_add(vec,1)}); } check_results!($array,array,num_pes,reqs,"scoped Vec"); // Scoped &Vec------------------------------ let mut reqs = vec![]; { let vec=(0..array.len()).collect::>(); - reqs.push(array.batch_fetch_add(&vec,1)); + #[allow(unused_unsafe)] + reqs.push( unsafe{ array.batch_fetch_add(&vec,1)}); } check_results!($array,array,num_pes,reqs,"scoped &Vec"); @@ -534,7 +550,8 @@ macro_rules! input_test{ // check_results!($array,array,num_pes,reqs,"UnsafeArray"); // UnsafeArray------------------------------ let mut reqs = vec![]; - reqs.push(array.batch_fetch_add(unsafe{input_array.local_data()},1)); + #[allow(unused_unsafe)] + reqs.push(unsafe{array.batch_fetch_add(input_array.local_data(),1)}); check_results!($array,array,num_pes,reqs,"&UnsafeArray"); // ReadOnlyArray------------------------------ @@ -545,7 +562,8 @@ macro_rules! input_test{ // check_results!($array,array,num_pes,reqs,"ReadOnlyArray"); // ReadOnlyArray------------------------------ let mut reqs = vec![]; - reqs.push(array.batch_fetch_add(input_array.local_data(),1)); + #[allow(unused_unsafe)] + reqs.push(unsafe{array.batch_fetch_add(input_array.local_data(),1)}); check_results!($array,array,num_pes,reqs,"&ReadOnlyArray"); // AtomicArray------------------------------ @@ -556,7 +574,8 @@ macro_rules! input_test{ // check_results!($array,array,num_pes,reqs,"AtomicArray"); // AtomicArray------------------------------ let mut reqs = vec![]; - reqs.push(array.batch_fetch_add(&input_array.local_data(),1)); + #[allow(unused_unsafe)] + reqs.push(unsafe{array.batch_fetch_add(&input_array.local_data(),1)}); check_results!($array,array,num_pes,reqs,"&AtomicArray"); // LocalLockArray------------------------------ @@ -569,7 +588,8 @@ macro_rules! input_test{ let mut reqs = vec![]; let local_data = input_array.blocking_read_local_data(); // println!("local lock array len: {:?}", local_data.deref()); - reqs.push(array.batch_fetch_add(&local_data,1)); + #[allow(unused_unsafe)] + reqs.push(unsafe{array.batch_fetch_add(&local_data,1)}); drop(local_data); check_results!($array,array,num_pes,reqs,"&LocalLockArray"); @@ -581,7 +601,8 @@ macro_rules! 
input_test{ // check_results!($array,array,num_pes,reqs,"GlobalLockArray"); // GlobalLockArray------------------------------ let mut reqs = vec![]; - reqs.push(array.batch_fetch_add(&input_array.blocking_read_local_data(),1)); + #[allow(unused_unsafe)] + reqs.push(unsafe{array.batch_fetch_add(&input_array.blocking_read_local_data(),1)}); check_results!($array,array,num_pes,reqs,"&GlobalLockArray"); } } diff --git a/tests/array/arithmetic_ops/fetch_div_test.rs b/tests/array/arithmetic_ops/fetch_div_test.rs index 569e67fe..d539dace 100644 --- a/tests/array/arithmetic_ops/fetch_div_test.rs +++ b/tests/array/arithmetic_ops/fetch_div_test.rs @@ -127,7 +127,8 @@ macro_rules! fetch_div_test{ for idx in 0..array.len(){ let mut reqs = vec![]; for _i in 0..(max_updates as usize){ - reqs.push(array.fetch_div(idx,2 as $t)); + #[allow(unused_unsafe)] + reqs.push(unsafe{array.fetch_div(idx,2 as $t)}); } #[allow(unused_mut)] let mut prevs: std::collections::HashSet = std::collections::HashSet::new(); @@ -166,7 +167,8 @@ macro_rules! fetch_div_test{ for idx in 0..sub_array.len(){ let mut reqs = vec![]; for _i in 0..(max_updates as usize){ - reqs.push(sub_array.fetch_div(idx,2 as $t)); + #[allow(unused_unsafe)] + reqs.push(unsafe{sub_array.fetch_div(idx,2 as $t)}); } #[allow(unused_mut)] let mut prevs: std::collections::HashSet = std::collections::HashSet::new(); @@ -204,7 +206,8 @@ macro_rules! fetch_div_test{ for idx in 0..sub_array.len(){ let mut reqs = vec![]; for _i in 0..(max_updates as usize){ - reqs.push(sub_array.fetch_div(idx,2 as $t)); + #[allow(unused_unsafe)] + reqs.push(unsafe{sub_array.fetch_div(idx,2 as $t)}); } #[allow(unused_mut)] let mut prevs: std::collections::HashSet = std::collections::HashSet::new(); diff --git a/tests/array/arithmetic_ops/fetch_mul_test.rs b/tests/array/arithmetic_ops/fetch_mul_test.rs index 8862733e..21c0b41d 100644 --- a/tests/array/arithmetic_ops/fetch_mul_test.rs +++ b/tests/array/arithmetic_ops/fetch_mul_test.rs @@ -121,7 +121,8 @@ macro_rules! fetch_mul_test{ for idx in 0..array.len(){ let mut reqs = vec![]; for _i in 0..(max_updates as usize){ - reqs.push(array.fetch_mul(idx,2 as $t)); + #[allow(unused_unsafe)] + reqs.push(unsafe{array.fetch_mul(idx,2 as $t)}); } #[allow(unused_mut)] let mut prevs: std::collections::HashSet = std::collections::HashSet::new(); @@ -159,7 +160,8 @@ macro_rules! fetch_mul_test{ for idx in 0..sub_array.len(){ let mut reqs = vec![]; for _i in 0..(max_updates as usize){ - reqs.push(sub_array.fetch_mul(idx,2 as $t)); + #[allow(unused_unsafe)] + reqs.push(unsafe{sub_array.fetch_mul(idx,2 as $t)}); } #[allow(unused_mut)] let mut prevs: std::collections::HashSet = std::collections::HashSet::new(); @@ -195,7 +197,8 @@ macro_rules! fetch_mul_test{ for idx in 0..sub_array.len(){ let mut reqs = vec![]; for _i in 0..(max_updates as usize){ - reqs.push(sub_array.fetch_mul(idx,2 as $t)); + #[allow(unused_unsafe)] + reqs.push(unsafe{sub_array.fetch_mul(idx,2 as $t)}); } #[allow(unused_mut)] let mut prevs: std::collections::HashSet = std::collections::HashSet::new(); diff --git a/tests/array/arithmetic_ops/fetch_rem_test.rs b/tests/array/arithmetic_ops/fetch_rem_test.rs index e25b819f..888572d2 100644 --- a/tests/array/arithmetic_ops/fetch_rem_test.rs +++ b/tests/array/arithmetic_ops/fetch_rem_test.rs @@ -127,7 +127,8 @@ macro_rules! 
fetch_rem_test{ for idx in 0..array.len(){ let mut reqs = vec![]; for _i in 0..(max_updates as usize){ - reqs.push(array.fetch_rem(idx,2 as $t)); + #[allow(unused_unsafe)] + reqs.push(unsafe{array.fetch_rem(idx,2 as $t)}); } #[allow(unused_mut)] let mut prevs: std::collections::HashSet = std::collections::HashSet::new(); @@ -166,7 +167,8 @@ macro_rules! fetch_rem_test{ for idx in 0..sub_array.len(){ let mut reqs = vec![]; for _i in 0..(max_updates as usize){ - reqs.push(sub_array.fetch_rem(idx,2 as $t)); + #[allow(unused_unsafe)] + reqs.push(unsafe{sub_array.fetch_rem(idx,2 as $t)}); } #[allow(unused_mut)] let mut prevs: std::collections::HashSet = std::collections::HashSet::new(); @@ -204,7 +206,8 @@ macro_rules! fetch_rem_test{ for idx in 0..sub_array.len(){ let mut reqs = vec![]; for _i in 0..(max_updates as usize){ - reqs.push(sub_array.fetch_rem(idx,2 as $t)); + #[allow(unused_unsafe)] + reqs.push(unsafe{sub_array.fetch_rem(idx,2 as $t)}); } #[allow(unused_mut)] let mut prevs: std::collections::HashSet = std::collections::HashSet::new(); diff --git a/tests/array/arithmetic_ops/fetch_sub_test.rs b/tests/array/arithmetic_ops/fetch_sub_test.rs index af985cf2..864fa5ec 100644 --- a/tests/array/arithmetic_ops/fetch_sub_test.rs +++ b/tests/array/arithmetic_ops/fetch_sub_test.rs @@ -119,7 +119,8 @@ macro_rules! fetch_sub_test{ for idx in 0..array.len(){ let mut reqs = vec![]; for _i in 0..(pe_max_val as usize){ - reqs.push(array.fetch_sub(idx,1 as $t)); + #[allow(unused_unsafe)] + reqs.push(unsafe{array.fetch_sub(idx,1 as $t)}); } #[allow(unused_mut)] let mut prevs: std::collections::HashSet = std::collections::HashSet::new(); @@ -153,7 +154,8 @@ macro_rules! fetch_sub_test{ // println!("2------------"); for _i in 0..num_updates{ let idx = rand_idx.sample(&mut rng); - reqs.push((array.fetch_sub(idx,1 as $t),idx)) + #[allow(unused_unsafe)] + reqs.push((unsafe{array.fetch_sub(idx,1 as $t)},idx)) } for (req,_idx) in reqs{ let _val = world.block_on(req); @@ -182,7 +184,8 @@ macro_rules! fetch_sub_test{ for idx in 0..sub_array.len(){ let mut reqs = vec![]; for _i in 0..(pe_max_val as usize){ - reqs.push(sub_array.fetch_sub(idx,1 as $t)); + #[allow(unused_unsafe)] + reqs.push(unsafe{sub_array.fetch_sub(idx,1 as $t)}); } #[allow(unused_mut)] let mut prevs: std::collections::HashSet = std::collections::HashSet::new(); @@ -215,7 +218,8 @@ macro_rules! fetch_sub_test{ // println!("2------------"); for _i in 0..num_updates{ let idx = rand_idx.sample(&mut rng); - reqs.push((sub_array.fetch_sub(idx,1 as $t),idx)) + #[allow(unused_unsafe)] + reqs.push((unsafe{sub_array.fetch_sub(idx,1 as $t)},idx)) } for (req,_idx) in reqs{ let _val = world.block_on(req); @@ -244,7 +248,8 @@ macro_rules! fetch_sub_test{ for idx in 0..sub_array.len(){ let mut reqs = vec![]; for _i in 0..(pe_max_val as usize){ - reqs.push(sub_array.fetch_sub(idx,1 as $t)); + #[allow(unused_unsafe)] + reqs.push(unsafe{sub_array.fetch_sub(idx,1 as $t)}); } #[allow(unused_mut)] let mut prevs: std::collections::HashSet = std::collections::HashSet::new(); @@ -276,7 +281,8 @@ macro_rules! 
fetch_sub_test{ // println!("2------------"); for _i in 0..num_updates{ let idx = rand_idx.sample(&mut rng); - reqs.push((sub_array.fetch_sub(idx,1 as $t),idx)) + #[allow(unused_unsafe)] + reqs.push((unsafe{sub_array.fetch_sub(idx,1 as $t)},idx)) } for (req,_idx) in reqs{ let _val = world.block_on(req); diff --git a/tests/array/arithmetic_ops/mul_test.rs b/tests/array/arithmetic_ops/mul_test.rs index c3a19b56..5d937e3d 100644 --- a/tests/array/arithmetic_ops/mul_test.rs +++ b/tests/array/arithmetic_ops/mul_test.rs @@ -103,7 +103,8 @@ macro_rules! mul_test{ // array.print(); for idx in 0..array.len(){ for _i in 0..(max_updates as usize){ - let _ = array.mul(idx,2 as $t).spawn(); + #[allow(unused_unsafe)] + let _ = unsafe{array.mul(idx,2 as $t).spawn()}; } } array.wait_all(); @@ -130,7 +131,8 @@ macro_rules! mul_test{ // // sub_array.print(); for idx in 0..sub_array.len(){ for _i in 0..(max_updates as usize){ - let _ = sub_array.mul(idx,2 as $t).spawn(); + #[allow(unused_unsafe)] + let _ = unsafe{sub_array.mul(idx,2 as $t).spawn()}; } } sub_array.wait_all(); @@ -156,7 +158,8 @@ macro_rules! mul_test{ sub_array.barrier(); for idx in 0..sub_array.len(){ for _i in 0..(max_updates as usize){ - let _ = sub_array.mul(idx,2 as $t).spawn(); + #[allow(unused_unsafe)] + let _ = unsafe{sub_array.mul(idx,2 as $t).spawn()}; } } sub_array.wait_all(); diff --git a/tests/array/arithmetic_ops/rem_test.rs b/tests/array/arithmetic_ops/rem_test.rs index 9d4cf3ed..0b95f5c7 100644 --- a/tests/array/arithmetic_ops/rem_test.rs +++ b/tests/array/arithmetic_ops/rem_test.rs @@ -96,7 +96,8 @@ macro_rules! rem_test{ // array.print(); for idx in 0..array.len(){ for _i in 0..(max_updates as usize){ - array.rem(idx,2 as $t); + #[allow(unused_unsafe)] + unsafe{array.rem(idx,2 as $t)}; } } array.wait_all(); @@ -123,7 +124,8 @@ macro_rules! rem_test{ // // sub_array.print(); for idx in 0..sub_array.len(){ for _i in 0..(max_updates as usize){ - sub_array.rem(idx,2 as $t); + #[allow(unused_unsafe)] + unsafe{sub_array.rem(idx,2 as $t)}; } } sub_array.wait_all(); @@ -149,7 +151,8 @@ macro_rules! rem_test{ sub_array.barrier(); for idx in 0..sub_array.len(){ for _i in 0..(max_updates as usize){ - sub_array.rem(idx,2 as $t); + #[allow(unused_unsafe)] + unsafe{sub_array.rem(idx,2 as $t)}; } } sub_array.wait_all(); diff --git a/tests/array/arithmetic_ops/sub_test.rs b/tests/array/arithmetic_ops/sub_test.rs index f3b78a7b..6d409f1a 100644 --- a/tests/array/arithmetic_ops/sub_test.rs +++ b/tests/array/arithmetic_ops/sub_test.rs @@ -104,7 +104,8 @@ macro_rules! sub_test{ for idx in 0..array.len(){ for _i in 0..(pe_max_val as usize){ - let _ = array.sub(idx,1 as $t).spawn(); + #[allow(unused_unsafe)] + let _ = unsafe{array.sub(idx,1 as $t).spawn()}; } } array.wait_all(); @@ -126,7 +127,8 @@ macro_rules! sub_test{ for _i in 0..num_updates as usize{ let idx = rand_idx.sample(&mut rng); - let _ = array.sub(idx,1 as $t).spawn(); + #[allow(unused_unsafe)] + let _ = unsafe{array.sub(idx,1 as $t).spawn()}; } array.wait_all(); array.barrier(); @@ -152,7 +154,8 @@ macro_rules! sub_test{ // sub_array.print(); for idx in 0..sub_array.len(){ for _i in 0..(pe_max_val as usize){ - let _ = sub_array.sub(idx,1 as $t).spawn(); + #[allow(unused_unsafe)] + let _ = unsafe{sub_array.sub(idx,1 as $t).spawn()}; } } sub_array.wait_all(); @@ -174,7 +177,8 @@ macro_rules! 
sub_test{ for _i in 0..num_updates as usize{ let idx = rand_idx.sample(&mut rng); - let _ = sub_array.sub(idx,1 as $t).spawn(); + #[allow(unused_unsafe)] + let _ = unsafe{sub_array.sub(idx,1 as $t).spawn()}; } sub_array.wait_all(); sub_array.barrier(); @@ -200,7 +204,8 @@ macro_rules! sub_test{ sub_array.barrier(); for idx in 0..sub_array.len(){ for _i in 0..(pe_max_val as usize){ - let _ = sub_array.sub(idx,1 as $t).spawn(); + #[allow(unused_unsafe)] + let _ = unsafe{sub_array.sub(idx,1 as $t).spawn()}; } } sub_array.wait_all(); @@ -222,7 +227,8 @@ macro_rules! sub_test{ for _i in 0..num_updates as usize{ let idx = rand_idx.sample(&mut rng); - let _ = sub_array.sub(idx,1 as $t).spawn(); + #[allow(unused_unsafe)] + let _ = unsafe{sub_array.sub(idx,1 as $t).spawn()}; } sub_array.wait_all(); sub_array.barrier(); diff --git a/tests/array/atomic_ops/compare_exchange_test.rs b/tests/array/atomic_ops/compare_exchange_test.rs index 278a304b..46d9f906 100644 --- a/tests/array/atomic_ops/compare_exchange_test.rs +++ b/tests/array/atomic_ops/compare_exchange_test.rs @@ -74,7 +74,8 @@ macro_rules! compare_exchange_test{ let mut reqs = vec![]; for idx in 0..array.len(){ if idx%num_pes == my_pe{ - reqs.push((array.compare_exchange(idx,init_val, my_pe as $t),idx)); + #[allow(unused_unsafe)] + reqs.push((unsafe{array.compare_exchange(idx,init_val, my_pe as $t)},idx)); } } for (req,idx) in reqs{ @@ -94,7 +95,8 @@ macro_rules! compare_exchange_test{ array.barrier(); let mut reqs = vec![]; for idx in 0..array.len(){ //these should all fail - reqs.push((array.compare_exchange(idx,init_val,my_pe as $t),idx)); + #[allow(unused_unsafe)] + reqs.push((unsafe{array.compare_exchange(idx,init_val,my_pe as $t)},idx)); } for (req,idx) in reqs{ match world.block_on(req){ @@ -121,7 +123,8 @@ macro_rules! compare_exchange_test{ let mut reqs = vec![]; for idx in 0..sub_array.len(){ if idx%num_pes == my_pe{ - reqs.push((sub_array.compare_exchange(idx,init_val,my_pe as $t),idx)); + #[allow(unused_unsafe)] + reqs.push((unsafe{sub_array.compare_exchange(idx,init_val,my_pe as $t)},idx)); } } for (req,idx) in reqs{ @@ -141,7 +144,8 @@ macro_rules! compare_exchange_test{ sub_array.barrier(); let mut reqs = vec![]; for idx in 0..sub_array.len(){ - reqs.push((sub_array.compare_exchange(idx,init_val,my_pe as $t),idx)); + #[allow(unused_unsafe)] + reqs.push((unsafe{sub_array.compare_exchange(idx,init_val,my_pe as $t)},idx)); } for (req,idx) in reqs{ match world.block_on(req){ @@ -170,7 +174,8 @@ macro_rules! compare_exchange_test{ let mut reqs = vec![]; for idx in 0..sub_array.len(){ if idx%num_pes == my_pe{ - reqs.push((sub_array.compare_exchange(idx,init_val,my_pe as $t),idx)); + #[allow(unused_unsafe)] + reqs.push((unsafe{sub_array.compare_exchange(idx,init_val,my_pe as $t)},idx)); } } for (req,idx) in reqs{ @@ -191,7 +196,8 @@ macro_rules! compare_exchange_test{ let mut reqs = vec![]; for idx in 0..sub_array.len(){ if idx%num_pes == my_pe{ - reqs.push((sub_array.compare_exchange(idx,init_val,my_pe as $t),idx)); + #[allow(unused_unsafe)] + reqs.push((unsafe{sub_array.compare_exchange(idx,init_val,my_pe as $t)},idx)); } } for (req,idx) in reqs{ @@ -237,7 +243,8 @@ macro_rules! 
compare_exchange_epsilon_test{ let mut reqs = vec![]; for idx in 0..array.len(){ if idx%num_pes == my_pe{ - reqs.push((array.compare_exchange_epsilon(idx,init_val, my_pe as $t,epsilon),idx)); + #[allow(unused_unsafe)] + reqs.push((unsafe{array.compare_exchange_epsilon(idx,init_val, my_pe as $t,epsilon)},idx)); } } for (req,idx) in reqs{ @@ -257,7 +264,8 @@ macro_rules! compare_exchange_epsilon_test{ array.barrier(); let mut reqs = vec![]; for idx in 0..array.len(){ //these should all fail - reqs.push((array.compare_exchange_epsilon(idx,init_val,my_pe as $t,epsilon),idx)); + #[allow(unused_unsafe)] + reqs.push((unsafe{array.compare_exchange_epsilon(idx,init_val,my_pe as $t,epsilon)},idx)); } for (req,idx) in reqs{ match world.block_on(req){ @@ -284,7 +292,8 @@ macro_rules! compare_exchange_epsilon_test{ let mut reqs = vec![]; for idx in 0..sub_array.len(){ if idx%num_pes == my_pe{ - reqs.push((sub_array.compare_exchange_epsilon(idx,init_val,my_pe as $t,epsilon),idx)); + #[allow(unused_unsafe)] + reqs.push((unsafe{sub_array.compare_exchange_epsilon(idx,init_val,my_pe as $t,epsilon)},idx)); } } for (req,idx) in reqs{ @@ -304,7 +313,8 @@ macro_rules! compare_exchange_epsilon_test{ sub_array.barrier(); let mut reqs = vec![]; for idx in 0..sub_array.len(){ - reqs.push((sub_array.compare_exchange_epsilon(idx,init_val,my_pe as $t,epsilon),idx)); + #[allow(unused_unsafe)] + reqs.push((unsafe{sub_array.compare_exchange_epsilon(idx,init_val,my_pe as $t,epsilon)},idx)); } for (req,idx) in reqs{ match world.block_on(req){ @@ -333,7 +343,8 @@ macro_rules! compare_exchange_epsilon_test{ let mut reqs = vec![]; for idx in 0..sub_array.len(){ if idx%num_pes == my_pe{ - reqs.push((sub_array.compare_exchange_epsilon(idx,init_val,my_pe as $t,epsilon),idx)); + #[allow(unused_unsafe)] + reqs.push((unsafe{sub_array.compare_exchange_epsilon(idx,init_val,my_pe as $t,epsilon)},idx)); } } for (req,idx) in reqs{ @@ -354,7 +365,8 @@ macro_rules! compare_exchange_epsilon_test{ let mut reqs = vec![]; for idx in 0..sub_array.len(){ if idx%num_pes == my_pe{ - reqs.push((sub_array.compare_exchange_epsilon(idx,init_val,my_pe as $t,epsilon),idx)); + #[allow(unused_unsafe)] + reqs.push((unsafe{sub_array.compare_exchange_epsilon(idx,init_val,my_pe as $t,epsilon)},idx)); } } for (req,idx) in reqs{ diff --git a/tests/array/atomic_ops/load_store_test.rs b/tests/array/atomic_ops/load_store_test.rs index 88ccfaaa..90ab7b51 100644 --- a/tests/array/atomic_ops/load_store_test.rs +++ b/tests/array/atomic_ops/load_store_test.rs @@ -81,14 +81,16 @@ macro_rules! load_store_test{ array.barrier(); for idx in 0..array.len(){ if idx%num_pes == my_pe{ - let _ = array.store(idx,my_pe as $t).spawn(); + #[allow(unused_unsafe)] + let _ = unsafe{array.store(idx,my_pe as $t).spawn()}; } } array.wait_all(); array.barrier(); let mut reqs = vec![]; for idx in 0..array.len(){ - reqs.push((array.load(idx),idx)); + #[allow(unused_unsafe)] + reqs.push((unsafe{array.load(idx)},idx)); } for (req,idx) in reqs{ let val = world.block_on(req); @@ -114,7 +116,8 @@ macro_rules! load_store_test{ sub_array.barrier(); for idx in 0..sub_array.len(){ if idx%num_pes == my_pe{ - let _ = sub_array.store(idx,my_pe as $t).spawn(); + #[allow(unused_unsafe)] + let _ = unsafe{sub_array.store(idx,my_pe as $t).spawn()}; } } sub_array.wait_all(); @@ -122,7 +125,8 @@ macro_rules! 
load_store_test{ let mut reqs = vec![]; for idx in 0..sub_array.len(){ - reqs.push((sub_array.load(idx),idx)); + #[allow(unused_unsafe)] + reqs.push((unsafe{sub_array.load(idx)},idx)); } for (req,idx) in reqs{ let val = world.block_on(req); @@ -150,7 +154,8 @@ macro_rules! load_store_test{ sub_array.barrier(); for idx in 0..sub_array.len(){ if idx%num_pes == my_pe{ - let _ = sub_array.store(idx,my_pe as $t).spawn(); + #[allow(unused_unsafe)] + let _ = unsafe{sub_array.store(idx,my_pe as $t).spawn()}; } } sub_array.wait_all(); @@ -158,7 +163,8 @@ macro_rules! load_store_test{ let mut reqs = vec![]; for idx in 0..sub_array.len(){ - reqs.push((sub_array.load(idx),idx)); + #[allow(unused_unsafe)] + reqs.push((unsafe{sub_array.load(idx)},idx)); } for (req,idx) in reqs{ let val = world.block_on(req); diff --git a/tests/array/atomic_ops/swap_test.rs b/tests/array/atomic_ops/swap_test.rs index 49b1afb9..2e71e926 100644 --- a/tests/array/atomic_ops/swap_test.rs +++ b/tests/array/atomic_ops/swap_test.rs @@ -74,7 +74,8 @@ macro_rules! swap{ let mut reqs = vec![]; for idx in 0..array.len(){ if idx%num_pes == my_pe{ - reqs.push((array.swap(idx,my_pe as $t),idx)); + #[allow(unused_unsafe)] + reqs.push((unsafe{array.swap(idx,my_pe as $t)},idx)); } } for (req,idx) in reqs{ @@ -90,7 +91,8 @@ macro_rules! swap{ let mut reqs = vec![]; for idx in 0..array.len(){ - reqs.push((array.load(idx),idx)); + #[allow(unused_unsafe)] + reqs.push((unsafe{array.load(idx)},idx)); } for (req,idx) in reqs{ let val = world.block_on(req); @@ -118,7 +120,8 @@ macro_rules! swap{ let mut reqs = vec![]; for idx in 0..sub_array.len(){ if idx%num_pes == my_pe{ - reqs.push((sub_array.swap(idx,my_pe as $t),idx)); + #[allow(unused_unsafe)] + reqs.push((unsafe{sub_array.swap(idx,my_pe as $t)},idx)); } } for (req,idx) in reqs{ @@ -134,7 +137,8 @@ macro_rules! swap{ let mut reqs = vec![]; for idx in 0..sub_array.len(){ - reqs.push((sub_array.load(idx),idx)); + #[allow(unused_unsafe)] + reqs.push((unsafe{sub_array.load(idx)},idx)); } for (req,idx) in reqs{ let val = world.block_on(req); @@ -164,7 +168,8 @@ macro_rules! swap{ let mut reqs = vec![]; for idx in 0..sub_array.len(){ if idx%num_pes == my_pe{ - reqs.push((sub_array.swap(idx,my_pe as $t),idx)); + #[allow(unused_unsafe)] + reqs.push((unsafe{sub_array.swap(idx,my_pe as $t)},idx)); } } for (req,idx) in reqs{ @@ -180,7 +185,9 @@ macro_rules! swap{ let mut reqs = vec![]; for idx in 0..sub_array.len(){ - reqs.push((sub_array.load(idx),idx)); + + #[allow(unused_unsafe)] + reqs.push((unsafe{sub_array.load(idx)},idx)); } for (req,idx) in reqs{ let val = world.block_on(req); diff --git a/tests/array/bitwise_ops/and_test.rs b/tests/array/bitwise_ops/and_test.rs index 94ed932d..0e920739 100644 --- a/tests/array/bitwise_ops/and_test.rs +++ b/tests/array/bitwise_ops/and_test.rs @@ -84,7 +84,8 @@ macro_rules! and_test{ array.barrier(); let my_val = !(1 as $t << my_pe); for idx in 0..array.len(){ - let _ = array.bit_and(idx,my_val).spawn(); + #[allow(unused_unsafe)] + let _ = unsafe{array.bit_and(idx,my_val).spawn()}; } array.wait_all(); @@ -112,7 +113,8 @@ macro_rules! and_test{ sub_array.barrier(); // sub_array.print(); for idx in 0..sub_array.len(){ - let _ = sub_array.bit_and(idx,my_val).spawn(); + #[allow(unused_unsafe)] + let _ = unsafe{sub_array.bit_and(idx,my_val).spawn()}; } sub_array.wait_all(); sub_array.barrier(); @@ -140,7 +142,8 @@ macro_rules! 
and_test{ let sub_array = array.sub_array(start_i..end_i); sub_array.barrier(); for idx in 0..sub_array.len(){ - let _ = sub_array.bit_and(idx,my_val).spawn(); + #[allow(unused_unsafe)] + let _ = unsafe{sub_array.bit_and(idx,my_val).spawn()}; } sub_array.wait_all(); sub_array.barrier(); diff --git a/tests/array/bitwise_ops/fetch_and_test.rs b/tests/array/bitwise_ops/fetch_and_test.rs index 3fbb760a..9a15290e 100644 --- a/tests/array/bitwise_ops/fetch_and_test.rs +++ b/tests/array/bitwise_ops/fetch_and_test.rs @@ -86,7 +86,8 @@ macro_rules! fetch_and_test{ let mut reqs = vec![]; for idx in 0..array.len(){ - reqs.push((array.fetch_bit_and(idx,my_val),idx)); + #[allow(unused_unsafe)] + reqs.push((unsafe{array.fetch_bit_and(idx,my_val)},idx)); } for (req,idx) in reqs{ let val = world.block_on(req); @@ -123,7 +124,8 @@ macro_rules! fetch_and_test{ let mut reqs = vec![]; for idx in 0..sub_array.len(){ - reqs.push((sub_array.fetch_bit_and(idx,my_val),idx)); + #[allow(unused_unsafe)] + reqs.push((unsafe{sub_array.fetch_bit_and(idx,my_val)},idx)); } for (req,idx) in reqs{ let val = world.block_on(req); @@ -161,7 +163,8 @@ macro_rules! fetch_and_test{ let mut reqs = vec![]; for idx in 0..sub_array.len(){ - reqs.push((sub_array.fetch_bit_and(idx,my_val),idx)); + #[allow(unused_unsafe)] + reqs.push((unsafe{sub_array.fetch_bit_and(idx,my_val)},idx)); } for (req,idx) in reqs{ let val = world.block_on(req); diff --git a/tests/array/bitwise_ops/fetch_or_test.rs b/tests/array/bitwise_ops/fetch_or_test.rs index 6b220433..f635dacb 100644 --- a/tests/array/bitwise_ops/fetch_or_test.rs +++ b/tests/array/bitwise_ops/fetch_or_test.rs @@ -86,7 +86,8 @@ macro_rules! fetch_or_test{ let mut reqs = vec![]; for idx in 0..array.len(){ - reqs.push((array.fetch_bit_or(idx,my_val),idx)); + #[allow(unused_unsafe)] + reqs.push((unsafe{array.fetch_bit_or(idx,my_val)},idx)); } for (req,idx) in reqs{ let val = world.block_on(req); @@ -123,7 +124,8 @@ macro_rules! fetch_or_test{ let mut reqs = vec![]; for idx in 0..sub_array.len(){ - reqs.push((sub_array.fetch_bit_or(idx,my_val),idx)); + #[allow(unused_unsafe)] + reqs.push((unsafe{sub_array.fetch_bit_or(idx,my_val)},idx)); } for (req,idx) in reqs{ let val = world.block_on(req); @@ -161,7 +163,8 @@ macro_rules! fetch_or_test{ let mut reqs = vec![]; for idx in 0..sub_array.len(){ - reqs.push((sub_array.fetch_bit_or(idx,my_val),idx)); + #[allow(unused_unsafe)] + reqs.push((unsafe{sub_array.fetch_bit_or(idx,my_val)},idx)); } for (req,idx) in reqs{ let val = world.block_on(req); diff --git a/tests/array/bitwise_ops/fetch_xor_test.rs b/tests/array/bitwise_ops/fetch_xor_test.rs index 0f203699..8302a766 100644 --- a/tests/array/bitwise_ops/fetch_xor_test.rs +++ b/tests/array/bitwise_ops/fetch_xor_test.rs @@ -86,7 +86,8 @@ macro_rules! fetch_xor_test{ let mut reqs = vec![]; for idx in 0..array.len(){ - reqs.push((array.fetch_bit_xor(idx,my_val),idx)); + #[allow(unused_unsafe)] + reqs.push((unsafe{array.fetch_bit_xor(idx,my_val)},idx)); } for (req,idx) in reqs{ let val = world.block_on(req); @@ -123,7 +124,8 @@ macro_rules! fetch_xor_test{ let mut reqs = vec![]; for idx in 0..sub_array.len(){ - reqs.push((sub_array.fetch_bit_xor(idx,my_val),idx)); + #[allow(unused_unsafe)] + reqs.push((unsafe{sub_array.fetch_bit_xor(idx,my_val)},idx)); } for (req,idx) in reqs{ let val = world.block_on(req); @@ -161,7 +163,8 @@ macro_rules! 
fetch_xor_test{ let mut reqs = vec![]; for idx in 0..sub_array.len(){ - reqs.push((sub_array.fetch_bit_xor(idx,my_val),idx)); + #[allow(unused_unsafe)] + reqs.push((unsafe{sub_array.fetch_bit_xor(idx,my_val)},idx)); } for (req,idx) in reqs{ let val = world.block_on(req); diff --git a/tests/array/bitwise_ops/or_test.rs b/tests/array/bitwise_ops/or_test.rs index fa52c7a3..5c697397 100644 --- a/tests/array/bitwise_ops/or_test.rs +++ b/tests/array/bitwise_ops/or_test.rs @@ -84,7 +84,8 @@ macro_rules! or_test{ array.barrier(); let my_val = 1 as $t << my_pe; for idx in 0..array.len(){ - let _ = array.bit_or(idx,my_val).spawn(); + #[allow(unused_unsafe)] + let _ = unsafe{array.bit_or(idx,my_val).spawn()}; } array.wait_all(); @@ -112,7 +113,8 @@ macro_rules! or_test{ sub_array.barrier(); // sub_array.print(); for idx in 0..sub_array.len(){ - let _ = sub_array.bit_or(idx,my_val).spawn(); + #[allow(unused_unsafe)] + let _ = unsafe {sub_array.bit_or(idx,my_val).spawn()}; } sub_array.wait_all(); sub_array.barrier(); @@ -140,7 +142,8 @@ macro_rules! or_test{ let sub_array = array.sub_array(start_i..end_i); sub_array.barrier(); for idx in 0..sub_array.len(){ - let _ = sub_array.bit_or(idx,my_val).spawn(); + #[allow(unused_unsafe)] + let _ = unsafe{sub_array.bit_or(idx,my_val).spawn()}; } sub_array.wait_all(); sub_array.barrier(); From e4078fb0cee08ef79328e8a7775bf0e1bc7908aa Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Thu, 3 Oct 2024 17:38:25 -0700 Subject: [PATCH 077/116] fixed active message doc tests --- impl/src/field_info.rs | 24 ++++----- src/active_messaging.rs | 101 +++++++++++++++++++------------------ src/array.rs | 24 ++++----- src/lamellar_task_group.rs | 8 +-- src/lamellar_team.rs | 6 +-- 5 files changed, 82 insertions(+), 81 deletions(-) diff --git a/impl/src/field_info.rs b/impl/src/field_info.rs index 44908a23..15b0dc78 100644 --- a/impl/src/field_info.rs +++ b/impl/src/field_info.rs @@ -70,7 +70,7 @@ impl FieldInfo { if as_vecs && darc_iter { //both quote! { - for e in (&self.#field_name).iter(){ + for e in (&(self.#field_name)).iter(){ for d in e.iter(){ d.ser(num_pes,darcs); } @@ -79,14 +79,14 @@ impl FieldInfo { } else if as_vecs ^ darc_iter { //either or quote! { - for e in (&self.#field_name).iter(){ + for e in (&(self.#field_name)).iter(){ e.ser(num_pes,darcs); } } } else { //neither quote! { - (&self.#field_name).ser(num_pes,darcs); + (&(self.#field_name)).ser(num_pes,darcs); } } } @@ -109,12 +109,12 @@ impl FieldInfo { ind += 1; if !as_vecs { ser.extend(quote_spanned! {field.span()=> - ( &self.#field_name.#temp_ind).ser(num_pes,darcs); + ( &(self.#field_name.#temp_ind)).ser(num_pes,darcs); }); } else { ser.extend(quote_spanned! {field.span()=> - for e in (&self.#field_name).iter(){ - e.#temp_ind.ser(num_pes,darcs); + for e in (&(self.#field_name)).iter(){ + (&(e.#temp_ind)).ser(num_pes,darcs); } }) } @@ -178,7 +178,7 @@ impl FieldInfo { if as_vecs && darc_iter { //both quote! { - for e in (&self.#field_name).iter(){ + for e in (&(self.#field_name)).iter(){ for d in e.iter(){ d.des(cur_pe); } @@ -187,14 +187,14 @@ impl FieldInfo { } else if as_vecs ^ darc_iter { //either or quote! { - for e in (&self.#field_name).iter(){ + for e in (&(self.#field_name)).iter(){ e.des(cur_pe); } } } else { //neither quote! { - (&self.#field_name).des(cur_pe); + (&(self.#field_name)).des(cur_pe); } } } @@ -217,12 +217,12 @@ impl FieldInfo { ind += 1; if !as_vecs { des.extend(quote_spanned! 
{field.span()=> - ( &self.#field_name.#temp_ind).des(cur_pe); + ( &(self.#field_name.#temp_ind)).des(cur_pe); }); } else { des.extend(quote_spanned! {field.span()=> - for e in (&self.#field_name).iter(){ - e.#temp_ind.des(cur_pe); + for e in (&(self.#field_name)).iter(){ + (&(e.#temp_ind)).des(cur_pe); } }); } diff --git a/src/active_messaging.rs b/src/active_messaging.rs index 3fcb12fe..f46c0396 100644 --- a/src/active_messaging.rs +++ b/src/active_messaging.rs @@ -267,7 +267,7 @@ //! The main change is that we need to explicitly tell the macro we are returning an active message and we provide the name of the active message we are returning //!``` //! # use lamellar::active_messaging::prelude::*; -//! # #[AmData(Debug,Clone)] +//! # #[lamellar::AmData(Debug,Clone)] //! # struct ReturnAm{ //! # original_pe: usize, //! # remote_pe: usize, @@ -278,13 +278,13 @@ //! # println!("initiated on PE {} visited PE {} finishing on PE {}",self.original_pe,self.remote_pe,lamellar::current_pe); //! # } //! # } -//! # #[AmData(Debug,Clone)] +//! # #[lamellar::AmData(Debug,Clone)] //! # struct HelloWorld { //! # original_pe: usize, //this will contain the ID of the PE this data originated from //! # } //! #[lamellar::am(return_am = "ReturnAm")] //we explicitly tell the macro we are returning an AM //! impl LamellarAM for HelloWorld { -//! async fn exec(self) -> usize { //specify we are returning a usize +//! async fn exec(self) -> ReturnAm { //we want to return an instance of an AM //! println!( //! "Hello World on PE {:?} of {:?}, I'm from PE {:?}", //! lamellar::current_pe, @@ -332,7 +332,7 @@ //! First we need to update `ReturnAm` to actually return some data //!``` //! # use lamellar::active_messaging::prelude::*; -//! # #[AmData(Debug,Clone)] +//! # #[lamellar::AmData(Debug,Clone)] //! # struct ReturnAm{ //! # original_pe: usize, //! # remote_pe: usize, @@ -369,7 +369,7 @@ //! //! #[lamellar::am(return_am = "ReturnAm -> (usize,usize)")] //we explicitly tell the macro we are returning an AM which itself returns data //! impl LamellarAM for HelloWorld { -//! async fn exec(self) -> usize { //specify we are returning a usize +//! async fn exec(self) -> ReturnAm { //returning an instance of an AM //! println!( //! "Hello World on PE {:?} of {:?}, I'm from PE {:?}", //! lamellar::current_pe, @@ -472,7 +472,7 @@ //! Lamellar also supports active message groups, which is a collection of active messages that can be awaited together. //! Conceptually, an active message group can be represented as a meta active message that contains a list of the actual active messages we want to execute, //! as illustrated in the pseudocode below: -//! ```no_run +//! ```ignore //! #[AmData(Debug,Clone)] //! struct MetaAm{ //! ams: Vec @@ -500,8 +500,8 @@ //! foo: usize, //! } //! #[lamellar::am] -//! impl LamellarAm for RingAm{ -//! async fn exec(self) -> Vec{ +//! impl LamellarAm for Am1{ +//! async fn exec(self) { //! println!("in am1 {:?} on PE{:?}",self.foo, lamellar::current_pe); //! } //! } @@ -511,8 +511,8 @@ //! bar: String, //! } //! #[lamellar::am] -//! impl LamellarAm for RingAm{ -//! async fn exec(self) -> Vec{ +//! impl LamellarAm for Am2{ +//! async fn exec(self) { //! println!("in am2 {:?} on PE{:?}",self.bar,lamellar::current_pe); //! } //! } @@ -525,7 +525,7 @@ //! let am1 = Am1{foo: 1}; //! let am2 = Am2{bar: "hello".to_string()}; //! //create a new AMGroup -//! let am_group = AMGroup::new(&world); +//! let mut am_group = AmGroup::new(&world); //! // add the AMs to the group //! 
// we can specify individual PEs to execute on or all PEs //! am_group.add_am_pe(0,am1.clone()); @@ -566,7 +566,7 @@ //! #[lamellar::am] //! impl LamellarAm for ExampleAm{ //! async fn exec(self) -> usize{ -//! self.cnt.fetch_add(1, std::sync::atomic::Ordering::SeqCst); +//! self.cnt.fetch_add(1, std::sync::atomic::Ordering::SeqCst) //! } //! } //! @@ -574,10 +574,11 @@ //! let world = lamellar::LamellarWorldBuilder::new().build(); //! let my_pe = world.my_pe(); //! let num_pes = world.num_pes(); +//! let darc = Darc::new(&world,AtomicUsize::new(0)).expect("PE in world team"); //! //! if my_pe == 0 { // we only want to run this on PE0 for sake of illustration -//! let am_group = typed_am_group!{ExampleAm,&world}; -//! let am = ExampleAm{cnt: 0}; +//! let mut am_group = typed_am_group!{ExampleAm,&world}; +//! let am = ExampleAm{cnt: darc.clone()}; //! // add the AMs to the group //! // we can specify individual PEs to execute on or all PEs //! am_group.add_am_pe(0,am.clone()); @@ -588,15 +589,15 @@ //! //execute and await the completion of all AMs in the group //! let results = world.block_on(am_group.exec()); // we want to process the returned data //! //we can index into the results -//! if let AmGroupResult::Pe((pe,val)) = results.at(2){ +//! if let AmGroupResult::Pe(pe,val) = results.at(2){ //! assert_eq!(pe, 1); //the third add_am_* call in the group was to execute on PE1 -//! assert_eq!(val, 1); // this was the second am to execute on PE1 so the fetched value is 1 +//! assert_eq!(val, &1); // this was the second am to execute on PE1 so the fetched value is 1 //! } //! //or we can iterate over the results -//! for res in results{ +//! for res in results.iter(){ //! match res{ -//! AmGroupResult::Pe((pe,val)) => { println!("{} from PE{}",val,pe)}, -//! AmGroupResult::All(val) => { println!("{} on all PEs",val)}, +//! AmGroupResult::Pe(pe,val) => { println!("{:?} from PE{:?}",val,pe)}, +//! AmGroupResult::All(val) => { println!("{:?} on all PEs",val)}, //! } //! } //! } @@ -626,7 +627,7 @@ //! use std::sync::atomic::AtomicUsize; //! #[AmData(Debug,Clone)] //! struct ExampleAm { -//! #[AmData(static)] +//! #[AmGroup(static)] //! cnt: Darc, //! 
} //!``` @@ -973,13 +974,13 @@ pub trait ActiveMessaging { /// use lamellar::active_messaging::prelude::*; /// /// #[lamellar::AmData(Debug,Clone)] - /// struct Am{ + /// struct MyAm{ /// // can contain anything that impls Serialize, Deserialize, Sync, Send /// val: usize /// } /// /// #[lamellar::am] - /// impl LamellarAM for Am{ + /// impl LamellarAM for MyAm{ /// async fn exec(self) -> usize { //can return nothing or any type that impls Serialize, Deserialize, Sync, Send /// //do some remote computation /// println!("hello from PE{}",self.val); @@ -989,7 +990,7 @@ pub trait ActiveMessaging { /// //---------------- /// /// let world = lamellar::LamellarWorldBuilder::new().build(); - /// let request = world.exec_am_all(Am{val: world.my_pe()}); //launch am on all pes + /// let request = world.exec_am_all(MyAm{val: world.my_pe()}); //launch am on all pes /// let results = world.block_on(request); //block until am has executed and retrieve the data /// for i in 0..world.num_pes(){ /// assert_eq!(i,results[i]); @@ -1020,13 +1021,13 @@ pub trait ActiveMessaging { /// use lamellar::active_messaging::prelude::*; /// /// #[lamellar::AmData(Debug,Clone)] - /// struct Am{ + /// struct MyAm{ /// // can contain anything that impls Serialize, Deserialize, Sync, Send /// val: usize /// } /// /// #[lamellar::am] - /// impl LamellarAM for Am{ + /// impl LamellarAM for MyAm{ /// async fn exec(self) -> usize { //can return nothing or any type that impls Serialize, Deserialize, Sync, Send /// //do some remote computation /// println!("hello from PE{}",self.val); @@ -1036,7 +1037,7 @@ pub trait ActiveMessaging { /// //---------------- /// /// let world = lamellar::LamellarWorldBuilder::new().build(); - /// let request = world.exec_am_pe(world.num_pes()-1, Am{val: world.my_pe()}); //launch am on all pes + /// let request = world.exec_am_pe(world.num_pes()-1, MyAm{val: world.my_pe()}); //launch am on all pes /// let result = world.block_on(request); //block until am has executed /// assert_eq!(world.num_pes()-1,result); ///``` @@ -1067,13 +1068,13 @@ pub trait ActiveMessaging { /// use std::sync::Arc; /// /// #[lamellar::AmLocalData(Debug,Clone)] - /// struct Am{ + /// struct MyAm{ /// // can contain anything that impls Sync, Send /// val: Arc>, /// } /// /// #[lamellar::local_am] - /// impl LamellarAM for Am{ + /// impl LamellarAM for MyAm{ /// async fn exec(self) -> usize { //can return nothing or any type that impls Serialize, Deserialize, Sync, Send /// //do some computation /// let mut val = self.val.lock(); @@ -1084,7 +1085,7 @@ pub trait ActiveMessaging { /// //---------------- /// /// let world = lamellar::LamellarWorldBuilder::new().build(); - /// let request = world.exec_am_local(Am{val: Arc::new(Mutex::new(0.0))}); //launch am locally + /// let request = world.exec_am_local(MyAm{val: Arc::new(Mutex::new(0.0))}); //launch am locally /// let result = world.block_on(request); //block until am has executed /// assert_eq!(world.my_pe(),result); ///``` @@ -1106,13 +1107,13 @@ pub trait ActiveMessaging { /// # use lamellar::active_messaging::prelude::*; /// # /// # #[lamellar::AmData(Debug,Clone)] - /// # struct Am{ + /// # struct MyAm{ /// # // can contain anything that impls Sync, Send /// # val: usize, /// # } /// /// # #[lamellar::am] - /// # impl LamellarAM for Am{ + /// # impl LamellarAM for MyAm{ /// # async fn exec(self) -> usize { //can return nothing or any type that impls Serialize, Deserialize, Sync, Send /// # //do some remote computation /// # println!("hello from PE{}",self.val); @@ -1121,7 
+1122,7 @@ pub trait ActiveMessaging { /// # } /// # /// # let world = lamellar::LamellarWorldBuilder::new().build(); - /// world.exec_am_all(Am{val: world.my_pe()}); + /// world.exec_am_all(MyAm{val: world.my_pe()}); /// world.wait_all(); //block until the previous am has finished ///``` fn wait_all(&self); @@ -1140,13 +1141,13 @@ pub trait ActiveMessaging { /// # use lamellar::active_messaging::prelude::*; /// # /// # #[lamellar::AmData(Debug,Clone)] - /// # struct Am{ + /// # struct MyAm{ /// # // can contain anything that impls Sync, Send /// # val: usize, /// # } /// /// # #[lamellar::am] - /// # impl LamellarAM for Am{ + /// # impl LamellarAM for MyAm{ /// # async fn exec(self) -> usize { //can return nothing or any type that impls Serialize, Deserialize, Sync, Send /// # //do some remote computation /// # println!("hello from PE{}",self.val); @@ -1157,7 +1158,7 @@ pub trait ActiveMessaging { /// # let world = lamellar::LamellarWorldBuilder::new().build(); /// let world_clone = world.clone(); /// world.block_on(async move { - /// world_clone.exec_am_all(Am{val: world_clone.my_pe()}); + /// world_clone.exec_am_all(MyAm{val: world_clone.my_pe()}); /// world_clone.await_all().await; //block until the previous am has finished /// }); ///``` @@ -1219,13 +1220,13 @@ pub trait ActiveMessaging { /// use async_std::fs::File; /// use async_std::prelude::*; /// # #[lamellar::AmData(Debug,Clone)] - /// # struct Am{ + /// # struct MyAm{ /// # // can contain anything that impls Sync, Send /// # val: usize, /// # } /// # /// # #[lamellar::am] - /// # impl LamellarAM for Am{ + /// # impl LamellarAM for MyAm{ /// # async fn exec(self) -> usize { //can return nothing or any type that impls Serialize, Deserialize, Sync, Send /// # //do some remote computation /// # println!("hello from PE{}",self.val); @@ -1235,7 +1236,7 @@ pub trait ActiveMessaging { /// # /// # let world = lamellar::LamellarWorldBuilder::new().build(); /// # let num_pes = world.num_pes(); - /// let request = world.exec_am_all(Am{val: world.my_pe()}); //launch am locally + /// let request = world.exec_am_all(MyAm{val: world.my_pe()}); //launch am locally /// let result = world.block_on(request); //block until am has executed /// // you can also directly pass an async block /// let world_clone = world.clone(); @@ -1244,9 +1245,9 @@ pub trait ActiveMessaging { /// let mut buf = vec![0u8;1000]; /// for pe in 0..num_pes{ /// let data = file.read(&mut buf).await.unwrap(); - /// world_clone.exec_am_pe(pe,Am{val: data}).await; + /// world_clone.exec_am_pe(pe,MyAm{val: data}).await; /// } - /// world_clone.exec_am_all(Am{val: buf[0] as usize}).await; + /// world_clone.exec_am_all(MyAm{val: buf[0] as usize}).await; /// }); ///``` fn spawn(&self, f: F) -> LamellarTask @@ -1270,13 +1271,13 @@ pub trait ActiveMessaging { /// use async_std::fs::File; /// use async_std::prelude::*; /// # #[lamellar::AmData(Debug,Clone)] - /// # struct Am{ + /// # struct MyAm{ /// # // can contain anything that impls Sync, Send /// # val: usize, /// # } /// # /// # #[lamellar::am] - /// # impl LamellarAM for Am{ + /// # impl LamellarAM for MyAm{ /// # async fn exec(self) -> usize { //can return nothing or any type that impls Serialize, Deserialize, Sync, Send /// # //do some remote computation /// # println!("hello from PE{}",self.val); @@ -1286,7 +1287,7 @@ pub trait ActiveMessaging { /// # /// # let world = lamellar::LamellarWorldBuilder::new().build(); /// # let num_pes = world.num_pes(); - /// let request = world.exec_am_all(Am{val: world.my_pe()}); //launch am 
locally + /// let request = world.exec_am_all(MyAm{val: world.my_pe()}); //launch am locally /// let result = world.block_on(request); //block until am has executed /// // you can also directly pass an async block /// let world_clone = world.clone(); @@ -1295,9 +1296,9 @@ pub trait ActiveMessaging { /// let mut buf = vec![0u8;1000]; /// for pe in 0..num_pes{ /// let data = file.read(&mut buf).await.unwrap(); - /// world_clone.exec_am_pe(pe,Am{val: data}).await; + /// world_clone.exec_am_pe(pe,MyAm{val: data}).await; /// } - /// world_clone.exec_am_all(Am{val: buf[0] as usize}).await; + /// world_clone.exec_am_all(MyAm{val: buf[0] as usize}).await; /// }); ///``` fn block_on(&self, f: F) -> F::Output; @@ -1318,13 +1319,13 @@ pub trait ActiveMessaging { /// use async_std::fs::File; /// use async_std::prelude::*; /// # #[lamellar::AmData(Debug,Clone)] - /// # struct Am{ + /// # struct MyAm{ /// # // can contain anything that impls Sync, Send /// # val: usize, /// # } /// # /// # #[lamellar::am] - /// # impl LamellarAM for Am{ + /// # impl LamellarAM for MyAm{ /// # async fn exec(self) -> usize { //can return nothing or any type that impls Serialize, Deserialize, Sync, Send /// # //do some remote computation /// # println!("hello from PE{}",self.val); @@ -1334,7 +1335,7 @@ pub trait ActiveMessaging { /// # /// # let world = lamellar::LamellarWorldBuilder::new().build(); /// # let num_pes = world.num_pes(); - /// let request = world.exec_am_all(Am{val: world.my_pe()}); //launch am locally + /// let request = world.exec_am_all(MyAm{val: world.my_pe()}); //launch am locally /// let result = world.block_on(request); //block until am has executed /// // you can also directly pass an async block /// let world_clone = world.clone(); @@ -1343,9 +1344,9 @@ pub trait ActiveMessaging { /// let mut buf = vec![0u8;1000]; /// for pe in 0..num_pes{ /// let data = file.read(&mut buf).await.unwrap(); - /// world_clone.exec_am_pe(pe,Am{val: data}).await; + /// world_clone.exec_am_pe(pe,MyAm{val: data}).await; /// } - /// world_clone.exec_am_all(Am{val: buf[0] as usize}).await; + /// world_clone.exec_am_all(MyAm{val: buf[0] as usize}).await; /// }); ///``` fn block_on_all(&self, iter: I) -> Vec<<::Item as Future>::Output> diff --git a/src/array.rs b/src/array.rs index aac1d98f..00eb7e57 100644 --- a/src/array.rs +++ b/src/array.rs @@ -191,26 +191,26 @@ pub struct ReduceKey { crate::inventory::collect!(ReduceKey); // impl Dist for bool {} -// lamellar_impl::generate_reductions_for_type_rt!(true, u8, usize); -// lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, usize); +lamellar_impl::generate_reductions_for_type_rt!(true, u8, usize); +lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, usize); // lamellar_impl::generate_reductions_for_type_rt!(false, f64); // lamellar_impl::generate_ops_for_type_rt!(false, false, false, f64); // lamellar_impl::generate_reductions_for_type_rt!(false, u128); // lamellar_impl::generate_ops_for_type_rt!(true, false, true, u128); -lamellar_impl::generate_reductions_for_type_rt!(true, u8, u16, u32, u64, usize); -lamellar_impl::generate_reductions_for_type_rt!(false, u128); -lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, u16, u32, u64, usize); -lamellar_impl::generate_ops_for_type_rt!(true, false, true, u128); +// lamellar_impl::generate_reductions_for_type_rt!(true, u8, u16, u32, u64, usize); +// lamellar_impl::generate_reductions_for_type_rt!(false, u128); +// lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, u16, u32, u64, usize); 
+// lamellar_impl::generate_ops_for_type_rt!(true, false, true, u128); -lamellar_impl::generate_reductions_for_type_rt!(true, i8, i16, i32, i64, isize); -lamellar_impl::generate_reductions_for_type_rt!(false, i128); -lamellar_impl::generate_ops_for_type_rt!(true, true, true, i8, i16, i32, i64, isize); -lamellar_impl::generate_ops_for_type_rt!(true, false, true, i128); +// lamellar_impl::generate_reductions_for_type_rt!(true, i8, i16, i32, i64, isize); +// lamellar_impl::generate_reductions_for_type_rt!(false, i128); +// lamellar_impl::generate_ops_for_type_rt!(true, true, true, i8, i16, i32, i64, isize); +// lamellar_impl::generate_ops_for_type_rt!(true, false, true, i128); -lamellar_impl::generate_reductions_for_type_rt!(false, f32, f64); -lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32, f64); +// lamellar_impl::generate_reductions_for_type_rt!(false, f32, f64); +// lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32, f64); lamellar_impl::generate_ops_for_bool_rt!(); diff --git a/src/lamellar_task_group.rs b/src/lamellar_task_group.rs index 65f9cd98..d3c9c6d4 100644 --- a/src/lamellar_task_group.rs +++ b/src/lamellar_task_group.rs @@ -526,13 +526,13 @@ impl Future for TaskGroupLocalAmHandle { /// use lamellar::active_messaging::prelude::*; /// /// #[AmData(Debug,Clone)] -/// struct Am{ +/// struct MyAm{ /// world_pe: usize, /// team_pe: Option, /// } /// /// #[lamellar::am] -/// impl LamellarAm for Am{ +/// impl LamellarAm for MyAm{ /// async fn exec(self) { /// println!("Hello from world PE{:?}, team PE{:?}",self.world_pe, self.team_pe); /// } @@ -553,9 +553,9 @@ impl Future for TaskGroupLocalAmHandle { /// }; /// let task_group_1 = LamellarTaskGroup::new(&world); //associate the task group with the world /// let task_group_2 = LamellarTaskGroup::new(&even_pes); //we can also associate the task group with a team/sub_team -/// task_group_1.exec_am_all(Am{world_pe,team_pe}); +/// task_group_1.exec_am_all(MyAm{world_pe,team_pe}); /// for pe in 0..even_pes.num_pes(){ -/// task_group_2.exec_am_pe(pe,Am{world_pe,team_pe}); +/// task_group_2.exec_am_pe(pe,MyAm{world_pe,team_pe}); /// } /// task_group_1.wait_all(); //only need to wait for active messages launched with task_group_1 to finish /// //do interesting work diff --git a/src/lamellar_team.rs b/src/lamellar_team.rs index 424701b0..e783e06f 100644 --- a/src/lamellar_team.rs +++ b/src/lamellar_team.rs @@ -50,13 +50,13 @@ use std::marker::PhantomData; /// use lamellar::array::prelude::*; /// /// #[AmData(Debug,Clone)] -/// struct Am{ +/// struct MyAm{ /// world_pe: usize, /// team_pe: Option, /// } /// /// #[lamellar::am] -/// impl LamellarAm for Am{ +/// impl LamellarAm for MyAm{ /// async fn exec(self) { /// println!("Hello from world PE{:?}, team PE{:?}",self.world_pe, self.team_pe); /// } @@ -77,7 +77,7 @@ use std::marker::PhantomData; /// Err(_) => None, /// }; /// // we can launch and await the results of active messages on a given team -/// let req = even_pes.exec_am_all(Am{world_pe,team_pe}); +/// let req = even_pes.exec_am_all(MyAm{world_pe,team_pe}); /// let result = even_pes.block_on(req); /// // we can also create a distributed array so that its data only resides on the members of the team. /// let array: AtomicArray = AtomicArray::new(&even_pes, 100,Distribution::Block); From 400e5f27ccb7f2cc2604894543f561ba411c88ea Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Thu, 3 Oct 2024 21:48:13 -0700 Subject: [PATCH 078/116] fixed darc doc tests --- src/darc.rs | 2 +- src/darc/global_rw_darc.rs | 6 +-- src/darc/local_rw_darc.rs | 16 +++---- src/lamellae/local_lamellae.rs | 77 +++++++++++++++++++++++++++++----- 4 files changed, 79 insertions(+), 22 deletions(-) diff --git a/src/darc.rs b/src/darc.rs index 2d0dcc67..6312e73c 100644 --- a/src/darc.rs +++ b/src/darc.rs @@ -272,7 +272,7 @@ impl<'de, T: 'static> Deserialize<'de> for Darc { /// let my_pe = world.my_pe(); /// let num_pes = world.num_pes(); /// let darc_counter = Darc::new(&world, AtomicUsize::new(0)).unwrap(); -/// let weak = darc_counter.downgrade(); +/// let weak = Darc::downgrade(&darc_counter); /// match weak.upgrade(){ /// Some(counter) => { /// counter.fetch_add(my_pe, Ordering::SeqCst); diff --git a/src/darc/global_rw_darc.rs b/src/darc/global_rw_darc.rs index 7225333a..15065a83 100644 --- a/src/darc/global_rw_darc.rs +++ b/src/darc/global_rw_darc.rs @@ -634,8 +634,8 @@ impl GlobalRwDarc { /// let mut guard = counter.write().await; /// *guard += my_pe; /// drop(guard); //release the - /// world.wait_all(); // wait for my active message to return - /// world.barrier(); //at this point all updates will have been performed + /// world.await_all().await; // wait for my active message to return + /// world.async_barrier().await; //at this point all updates will have been performed /// }); ///``` pub async fn write(&self) -> GlobalRwDarcWriteGuard { @@ -888,7 +888,7 @@ impl GlobalRwDarc { /// #[lamellar::am] /// impl LamellarAm for DarcAm { /// async fn exec(self) { - /// let mut counter = self.counter.async_write().await; // await until we get the write lock + /// let mut counter = self.counter.write().await; // await until we get the write lock /// *counter += 1; // although we have the global lock, we are still only modifying the data local to this PE /// } /// } diff --git a/src/darc/local_rw_darc.rs b/src/darc/local_rw_darc.rs index 3215eee0..8892987e 100644 --- a/src/darc/local_rw_darc.rs +++ b/src/darc/local_rw_darc.rs @@ -154,7 +154,7 @@ impl LocalRwDarc { /// #[lamellar::am] /// impl LamellarAm for DarcAm { /// async fn exec(self) { - /// let counter = self.counter.read(); //block until we get the write lock + /// let counter = self.counter.read().await; //block until we get the write lock /// println!("the current counter value on pe {} = {}",lamellar::current_pe,counter); /// } /// } @@ -260,8 +260,8 @@ impl LocalRwDarc { /// #[lamellar::am] /// impl LamellarAm for DarcAm { /// async fn exec(self) { - /// let mut counter = self.counter.write(); //block until we get the write lock - /// **counter += 1; + /// let mut counter = self.counter.write().await; //block until we get the write lock + /// *counter += 1; /// } /// } /// //------------- @@ -270,7 +270,7 @@ impl LocalRwDarc { /// let counter = LocalRwDarc::new(&world, 0).unwrap(); /// world.exec_am_all(DarcAm {counter: counter.clone()}); /// let mut guard = counter.blocking_write(); - /// **guard += my_pe; + /// *guard += my_pe; ///``` pub fn blocking_write(&self) -> RwLockWriteGuardArc { if std::thread::current().id() != *crate::MAIN_THREAD { @@ -320,17 +320,17 @@ impl LocalRwDarc { /// impl LamellarAm for DarcAm { /// async fn exec(self) { /// let mut counter = self.counter.write().await; //block until we get the write lock - /// **counter += 1; + /// *counter += 1; /// } /// } /// //------------- /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// world.clone()block_on(async 
move{ + /// world.clone().block_on(async move{ /// let counter = LocalRwDarc::new(&world, 0).unwrap(); /// world.exec_am_all(DarcAm {counter: counter.clone()}); - /// let mut guard = counter.write(); - /// **guard += my_pe; + /// let mut guard = counter.write().await; + /// *guard += my_pe; /// }) ///``` pub async fn write(&self) -> RwLockWriteGuardArc { diff --git a/src/lamellae/local_lamellae.rs b/src/lamellae/local_lamellae.rs index cc84b62b..fef4dccf 100644 --- a/src/lamellae/local_lamellae.rs +++ b/src/lamellae/local_lamellae.rs @@ -135,29 +135,86 @@ unsafe impl Send for MyPtr {} impl LamellaeRDMA for Local { fn flush(&self) {} fn put(&self, _pe: usize, src: &[u8], dst: usize) { - unsafe { - std::ptr::copy_nonoverlapping(src.as_ptr(), dst as *mut u8, src.len()); + let src_ptr = src.as_ptr(); + + if !((src_ptr as usize <= dst + && dst < src_ptr as usize + src.len()) //dst start overlaps src + || (src_ptr as usize <= dst + src.len() + && dst + src.len() < src_ptr as usize + src.len())) + //dst end overlaps src + { + unsafe { + std::ptr::copy_nonoverlapping(src.as_ptr(), dst as *mut u8, src.len()); + } + } else { + unsafe { + std::ptr::copy(src.as_ptr(), dst as *mut u8, src.len()); + } } } fn iput(&self, _pe: usize, src: &[u8], dst: usize) { - unsafe { - std::ptr::copy_nonoverlapping(src.as_ptr(), dst as *mut u8, src.len()); + let src_ptr = src.as_ptr(); + if !((src_ptr as usize <= dst + && dst < src_ptr as usize + src.len()) //dst start overlaps src + || (src_ptr as usize <= dst + src.len() + && dst + src.len() < src_ptr as usize + src.len())) + //dst end overlaps src + { + unsafe { + std::ptr::copy_nonoverlapping(src.as_ptr(), dst as *mut u8, src.len()); + } + } else { + unsafe { + std::ptr::copy(src.as_ptr(), dst as *mut u8, src.len()); + } } } fn put_all(&self, src: &[u8], dst: usize) { - unsafe { - std::ptr::copy_nonoverlapping(src.as_ptr(), dst as *mut u8, src.len()); + let src_ptr = src.as_ptr(); + if !((src_ptr as usize <= dst + && dst < src_ptr as usize + src.len()) //dst start overlaps src + || (src_ptr as usize <= dst + src.len() + && dst + src.len() < src_ptr as usize + src.len())) + //dst end overlaps src + { + unsafe { + std::ptr::copy_nonoverlapping(src.as_ptr(), dst as *mut u8, src.len()); + } + } else { + unsafe { + std::ptr::copy(src.as_ptr(), dst as *mut u8, src.len()); + } } } fn get(&self, _pe: usize, src: usize, dst: &mut [u8]) { - unsafe { - std::ptr::copy_nonoverlapping(src as *mut u8, dst.as_mut_ptr(), dst.len()); + let dst_ptr = dst.as_mut_ptr(); + if !((dst_ptr as usize <= src && src < dst_ptr as usize + dst.len()) + || (dst_ptr as usize <= src + dst.len() + && src + dst.len() < dst_ptr as usize + dst.len())) + { + unsafe { + std::ptr::copy_nonoverlapping(src as *mut u8, dst.as_mut_ptr(), dst.len()); + } + } else { + unsafe { + std::ptr::copy(src as *mut u8, dst.as_mut_ptr(), dst.len()); + } } } fn iget(&self, _pe: usize, src: usize, dst: &mut [u8]) { - unsafe { - std::ptr::copy_nonoverlapping(src as *mut u8, dst.as_mut_ptr(), dst.len()); + let dst_ptr = dst.as_mut_ptr(); + if !((dst_ptr as usize <= src && src < dst_ptr as usize + dst.len()) + || (dst_ptr as usize <= src + dst.len() + && src + dst.len() < dst_ptr as usize + dst.len())) + { + unsafe { + std::ptr::copy_nonoverlapping(src as *mut u8, dst.as_mut_ptr(), dst.len()); + } + } else { + unsafe { + std::ptr::copy(src as *mut u8, dst.as_mut_ptr(), dst.len()); + } } } From 74ac20cf8d7a9c20f5b47c2d3beb333ceac9ae4f Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Thu, 3 Oct 2024 22:00:48 -0700 Subject: [PATCH 079/116] fix lamellar task group doc tests --- src/lamellar_task_group.rs | 52 +++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/src/lamellar_task_group.rs b/src/lamellar_task_group.rs index d3c9c6d4..d3487f33 100644 --- a/src/lamellar_task_group.rs +++ b/src/lamellar_task_group.rs @@ -1126,8 +1126,8 @@ impl LamellarResultDarcSerde for AmGroupAmReturn {} /// foo: usize, /// } /// #[lamellar::am] -/// impl LamellarAm for RingAm{ -/// async fn exec(self) -> Vec{ +/// impl LamellarAm for Am1{ +/// async fn exec(self) { /// println!("in am1 {:?} on PE{:?}",self.foo, lamellar::current_pe); /// } /// } @@ -1137,8 +1137,8 @@ impl LamellarResultDarcSerde for AmGroupAmReturn {} /// bar: String, /// } /// #[lamellar::am] -/// impl LamellarAm for RingAm{ -/// async fn exec(self) -> Vec{ +/// impl LamellarAm for Am2{ +/// async fn exec(self) { /// println!("in am2 {:?} on PE{:?}",self.bar,lamellar::current_pe); /// } /// } @@ -1150,8 +1150,8 @@ impl LamellarResultDarcSerde for AmGroupAmReturn {} /// /// let am1 = Am1{foo: 1}; /// let am2 = Am2{bar: "hello".to_string()}; -/// //create a new AMGroup -/// let am_group = AMGroup::new(&world); +/// //create a new AmGroup +/// let mut am_group = AmGroup::new(&world); /// // add the AMs to the group /// // we can specify individual PEs to execute on or all PEs /// am_group.add_am_pe(0,am1.clone()); @@ -1183,7 +1183,7 @@ pub struct AmGroup { } impl AmGroup { - /// create a new AMGroup associated with the given team + /// create a new AmGroup associated with the given team /// # Example /// ``` /// use lamellar::active_messaging::prelude::*; @@ -1191,7 +1191,7 @@ impl AmGroup { /// let world = lamellar::LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// let num_pes = world.num_pes(); - /// let am_group = AMGroup::new(&world); + /// let mut am_group = AmGroup::new(&world); /// } /// ``` pub fn new>(team: U) -> AmGroup { @@ -1211,8 +1211,8 @@ impl AmGroup { /// foo: usize, /// } /// #[lamellar::am] - /// impl LamellarAm for RingAm{ - /// async fn exec(self) -> Vec{ + /// impl LamellarAm for Am1{ + /// async fn exec(self) { /// println!("in am1 {:?} on PE{:?}",self.foo, lamellar::current_pe); /// } /// } @@ -1222,8 +1222,8 @@ impl AmGroup { /// bar: String, /// } /// #[lamellar::am] - /// impl LamellarAm for RingAm{ - /// async fn exec(self) -> Vec{ + /// impl LamellarAm for Am2{ + /// async fn exec(self) { /// println!("in am2 {:?} on PE{:?}",self.bar,lamellar::current_pe); /// } /// } @@ -1235,8 +1235,8 @@ impl AmGroup { /// /// let am1 = Am1{foo: 1}; /// let am2 = Am2{bar: "hello".to_string()}; - /// //create a new AMGroup - /// let am_group = AMGroup::new(&world); + /// //create a new AmGroup + /// let mut am_group = AmGroup::new(&world); /// // add the AMs to the group /// // we can specify individual PEs to execute on or all PEs /// am_group.add_am_all(am1.clone()); @@ -1267,8 +1267,8 @@ impl AmGroup { /// foo: usize, /// } /// #[lamellar::am] - /// impl LamellarAm for RingAm{ - /// async fn exec(self) -> Vec{ + /// impl LamellarAm for Am1{ + /// async fn exec(self){ /// println!("in am1 {:?} on PE{:?}",self.foo, lamellar::current_pe); /// } /// } @@ -1278,8 +1278,8 @@ impl AmGroup { /// bar: String, /// } /// #[lamellar::am] - /// impl LamellarAm for RingAm{ - /// async fn exec(self) -> Vec{ + /// impl LamellarAm for Am2{ + /// async fn exec(self) { /// println!("in am2 {:?} on PE{:?}",self.bar,lamellar::current_pe); /// } 
/// } @@ -1291,8 +1291,8 @@ impl AmGroup { /// /// let am1 = Am1{foo: 1}; /// let am2 = Am2{bar: "hello".to_string()}; - /// //create a new AMGroup - /// let am_group = AMGroup::new(&world); + /// //create a new AmGroup + /// let mut am_group = AmGroup::new(&world); /// // add the AMs to the group /// // we can specify individual PEs to execute on or all PEs /// am_group.add_am_pe(0,am1.clone()); @@ -1319,8 +1319,8 @@ impl AmGroup { /// foo: usize, /// } /// #[lamellar::am] - /// impl LamellarAm for RingAm{ - /// async fn exec(self) -> Vec{ + /// impl LamellarAm for Am1{ + /// async fn exec(self) { /// println!("in am1 {:?} on PE{:?}",self.foo, lamellar::current_pe); /// } /// } @@ -1330,8 +1330,8 @@ impl AmGroup { /// bar: String, /// } /// #[lamellar::am] - /// impl LamellarAm for RingAm{ - /// async fn exec(self) -> Vec{ + /// impl LamellarAm for Am2{ + /// async fn exec(self){ /// println!("in am2 {:?} on PE{:?}",self.bar,lamellar::current_pe); /// } /// } @@ -1343,8 +1343,8 @@ impl AmGroup { /// /// let am1 = Am1{foo: 1}; /// let am2 = Am2{bar: "hello".to_string()}; - /// //create a new AMGroup - /// let am_group = AMGroup::new(&world); + /// //create a new AmGroup + /// let mut am_group = AmGroup::new(&world); /// // add the AMs to the group /// // we can specify individual PEs to execute on or all PEs /// am_group.add_am_pe(0,am1.clone()); From a329f266814847fa67f87fe504ae9a975275d827 Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Thu, 3 Oct 2024 22:11:10 -0700 Subject: [PATCH 080/116] fix lamellarworld doc tests --- src/lamellar_world.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lamellar_world.rs b/src/lamellar_world.rs index 727351c0..19793512 100644 --- a/src/lamellar_world.rs +++ b/src/lamellar_world.rs @@ -516,7 +516,7 @@ impl LamellarWorldBuilder { /// use lamellar::{LamellarWorldBuilder,ExecutorType}; /// /// let builder = LamellarWorldBuilder::new() - /// .set_num_workers(10); + /// .set_num_threads(10); ///``` //#[tracing::instrument(skip_all)] pub fn set_num_threads(mut self, num_threads: usize) -> LamellarWorldBuilder { From 62909044a6cf05850aa971612d9ff5a2f8c964d9 Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Thu, 3 Oct 2024 22:25:09 -0700 Subject: [PATCH 081/116] fix memregion doc tests --- src/memregion/one_sided.rs | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/memregion/one_sided.rs b/src/memregion/one_sided.rs index a299ed05..1c4abdee 100644 --- a/src/memregion/one_sided.rs +++ b/src/memregion/one_sided.rs @@ -549,15 +549,16 @@ impl OneSidedMemoryRegion { /// #[am] /// impl LamellarAm for MemRegionAm{ /// async fn exec(self){ - /// let temp_buffer = OneSidedMemoryRegion = lamellar::world.alloc_one_sided_mem_region(mem_region.len()); + /// let temp_buffer: OneSidedMemoryRegion = lamellar::world.alloc_one_sided_mem_region(self.mem_region.len()); /// unsafe{ for elem in temp_buffer.as_mut_slice().expect("PE just created memregion"){ *elem = lamellar::current_pe}} - /// unsafe{ self.mem_region.get_unchecked(lamellar::current_pe*temp_buffer.len(),temp_buffer)}; + /// unsafe{ self.mem_region.get_unchecked(lamellar::current_pe*temp_buffer.len(),temp_buffer.clone())}; /// unsafe { /// for elem in temp_buffer.iter(){ /// while *elem == lamellar::current_pe{ - /// async_std::task::sleep(Duration::from_secs(self.secs)).await; + /// async_std::task::sleep(Duration::from_millis(100)).await; /// } - /// assert_eq!(lamellar::num_pes,*elem); + /// let num_pes = lamellar::num_pes; + /// assert_eq!(num_pes,*elem); /// } /// } /// } @@ -604,15 +605,16 @@ impl OneSidedMemoryRegion { /// #[am] /// impl LamellarAm for MemRegionAm{ /// async fn exec(self){ - /// let temp_buffer = OneSidedMemoryRegion = lamellar::world.alloc_one_sided_mem_region(mem_region.len()); + /// let temp_buffer: OneSidedMemoryRegion = lamellar::world.alloc_one_sided_mem_region(self.mem_region.len()); /// unsafe{ for elem in temp_buffer.as_mut_slice().expect("PE just created memregion"){ *elem = lamellar::current_pe}} - /// unsafe{ self.mem_region.get_unchecked(lamellar::current_pe*temp_buffer.len(),temp_buffer)}; + /// unsafe{ self.mem_region.get_unchecked(lamellar::current_pe*temp_buffer.len(),temp_buffer.clone())}; /// unsafe { /// for elem in temp_buffer.iter(){ /// while *elem == lamellar::current_pe{ - /// async_std::task::sleep(Duration::from_secs(self.secs)).await; + /// async_std::task::sleep(Duration::from_millis(100)).await; /// } - /// assert_eq!(lamellar::num_pes,*elem); + /// let num_pes = lamellar::num_pes; + /// assert_eq!(num_pes,*elem); /// } /// } /// } From 48609032485df3b1274f583b87009f50f643d521 Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Fri, 4 Oct 2024 10:43:34 -0700 Subject: [PATCH 082/116] fixed onesided iter doc tests --- examples/array_examples/onesided_iteration.rs | 251 ++++++++++-------- src/array/iterator/one_sided_iterator.rs | 43 ++- src/env_var.rs | 2 + src/lib.rs | 2 +- 4 files changed, 159 insertions(+), 139 deletions(-) diff --git a/examples/array_examples/onesided_iteration.rs b/examples/array_examples/onesided_iteration.rs index 23b471ca..1ffb25bf 100644 --- a/examples/array_examples/onesided_iteration.rs +++ b/examples/array_examples/onesided_iteration.rs @@ -1,119 +1,142 @@ +// use lamellar::array::prelude::*; +// const ARRAY_LEN: usize = 100; + +// fn main() { +// let world = lamellar::LamellarWorldBuilder::new().build(); +// let my_pe = world.my_pe(); +// let _num_pes = world.num_pes(); +// let block_array = AtomicArray::::new(world.team(), ARRAY_LEN, Distribution::Block); +// let cyclic_array = AtomicArray::::new(world.team(), ARRAY_LEN, Distribution::Cyclic); + +// //we are going to initialize the data on each PE by directly accessing its local data + +// block_array +// .mut_local_data() +// .iter() +// .for_each(|e| e.store(my_pe)); +// cyclic_array +// .mut_local_data() +// .iter() +// .for_each(|e| e.store(my_pe)); + +// // In this example we will make use of a onesided iterator which +// // enables us to iterate over the entire array on a single PE. +// // The runtime will manage transferring data from remote PEs. +// // Note that for UnsafeArrays, AtomicArrays, and LocalLockArrays, +// // there is no guarantee that by the time the transferred data +// // as arrived to the calling PE it has remained the same on the remote PE. +// // we do not currently provide a mutable one sided iterator. + +// if my_pe == 0 { +// println!("Here"); +// for elem in block_array.onesided_iter().into_iter() { +// //we can convert from a oneside iterator into a rust iterator +// print!("{:?} ", elem); +// } +// println!(""); +// println!("Here2"); +// for elem in cyclic_array.onesided_iter().into_iter() { +// print!("{:?} ", elem); +// } +// println!(""); +// } +// println!("Here3"); +// println!("--------------------------------------------------------"); + +// // The lamellar array iterator used above is lazy, meaning that it only accesses and returns a value as its used, +// // while this is generally efficent and results in low overhead, because an elem may actually exists on a remote node +// // latencies to retrieve the next value in the iterator are dependent on the location of the data, as a result of +// // the need to get the data. Further impacting performance is that typically the transfer of a single element will +// // likely be small, thus inefficiently utilizing network resources. +// // to address these issues, we have provided a buffered iterator, which will transfer "get" and store a block of data +// // into a buffer, from with the iterated values are returned. More effectively using network resources. From the users +// // standpoint the only thing that changes is the instatiation of the iterator. + +// if my_pe == 0 { +// for elem in block_array.buffered_onesided_iter(10).into_iter() { +// print!("{:?} ", elem); +// } +// println!(""); + +// for elem in cyclic_array.buffered_onesided_iter(10).into_iter() { +// print!("{:?} ", elem); +// } +// println!(""); +// } + +// println!("--------------------------------------------------------"); + +// // in addition to the buffered iters we also provide a method to iterate over chunks of a lamellar array, via +// // the chunks() method. 
Called on a OneSidedIterator this creates a chunk sized OneSidedMemoryRegion, +// // and then puts the appropriate date based on the iteration index into that region + +// if my_pe == 0 { +// for chunk in block_array.onesided_iter().chunks(10).skip(4).into_iter() { +// println!("{:?}", unsafe { chunk.as_slice() }); +// } +// println!("-----"); +// for chunk in cyclic_array.onesided_iter().chunks(10).into_iter() { +// println!("{:?}", unsafe { chunk.as_slice() }); +// } + +// println!("-----"); +// for (i, (a, b)) in cyclic_array +// .onesided_iter() +// .zip(block_array.onesided_iter()) +// .into_iter() +// .enumerate() +// { +// println!("{:?}: {:?} {:?}", i, a, b); +// } +// println!("-----"); +// for (a, b) in cyclic_array +// .onesided_iter() +// .chunks(10) +// .zip(block_array.onesided_iter().chunks(10)) +// .into_iter() +// { +// unsafe { +// println!("{:?} {:?}", a.as_slice(), b.as_slice()); +// } +// } +// } + +// println!("--------------------------------------------------------"); + +// // let block_array = UnsafeArray::::new(world.team(), ARRAY_LEN, Distribution::Block); +// // for elem in block_onesided_iter!($array,array).into_iter().step_by(4) {...} +// // for elem in block_array.buffered_onesided_iter(10) {...} + +// // //rust step_by pseudo code +// // fn step_by(&mut self, n: usize) -> Result{ +// // let val = self.next(); //grab val based on index +// // self.index += n; +// // val +// // } + +// // //-------------- +// // for elem in block_array.onesided_iter().step_by(4).into_iter() {...} +// } + +use futures_util::stream::StreamExt; use lamellar::array::prelude::*; -const ARRAY_LEN: usize = 100; - fn main() { - let world = lamellar::LamellarWorldBuilder::new().build(); + let world = LamellarWorldBuilder::new().build(); + let array = LocalLockArray::::new(&world, 8, Distribution::Block); let my_pe = world.my_pe(); - let _num_pes = world.num_pes(); - let block_array = AtomicArray::::new(world.team(), ARRAY_LEN, Distribution::Block); - let cyclic_array = AtomicArray::::new(world.team(), ARRAY_LEN, Distribution::Cyclic); - - //we are going to initialize the data on each PE by directly accessing its local data - - block_array - .mut_local_data() - .iter() - .for_each(|e| e.store(my_pe)); - cyclic_array - .mut_local_data() - .iter() - .for_each(|e| e.store(my_pe)); - - // In this example we will make use of a onesided iterator which - // enables us to iterate over the entire array on a single PE. - // The runtime will manage transferring data from remote PEs. - // Note that for UnsafeArrays, AtomicArrays, and LocalLockArrays, - // there is no guarantee that by the time the transferred data - // as arrived to the calling PE it has remained the same on the remote PE. - // we do not currently provide a mutable one sided iterator. 
- - if my_pe == 0 { - println!("Here"); - for elem in block_array.onesided_iter().into_iter() { - //we can convert from a oneside iterator into a rust iterator - print!("{:?} ", elem); - } - println!(""); - println!("Here2"); - for elem in cyclic_array.onesided_iter().into_iter() { - print!("{:?} ", elem); - } - println!(""); - } - println!("Here3"); - println!("--------------------------------------------------------"); - - // The lamellar array iterator used above is lazy, meaning that it only accesses and returns a value as its used, - // while this is generally efficent and results in low overhead, because an elem may actually exists on a remote node - // latencies to retrieve the next value in the iterator are dependent on the location of the data, as a result of - // the need to get the data. Further impacting performance is that typically the transfer of a single element will - // likely be small, thus inefficiently utilizing network resources. - // to address these issues, we have provided a buffered iterator, which will transfer "get" and store a block of data - // into a buffer, from with the iterated values are returned. More effectively using network resources. From the users - // standpoint the only thing that changes is the instatiation of the iterator. - - if my_pe == 0 { - for elem in block_array.buffered_onesided_iter(10).into_iter() { - print!("{:?} ", elem); - } - println!(""); - - for elem in cyclic_array.buffered_onesided_iter(10).into_iter() { - print!("{:?} ", elem); - } - println!(""); - } - - println!("--------------------------------------------------------"); - - // in addition to the buffered iters we also provide a method to iterate over chunks of a lamellar array, via - // the chunks() method. Called on a OneSidedIterator this creates a chunk sized OneSidedMemoryRegion, - // and then puts the appropriate date based on the iteration index into that region - - if my_pe == 0 { - for chunk in block_array.onesided_iter().chunks(10).skip(4).into_iter() { - println!("{:?}", unsafe { chunk.as_slice() }); + let num_pes = world.num_pes(); + array.dist_iter_mut().for_each(move |e| *e = my_pe); //initialize array using a distributed iterator + array.wait_all(); + + world.block_on(async move { + if my_pe == 0 { + let result = array + .onesided_iter() + .into_stream() + .take(4) + .map(|elem| *elem as f64) + .all(|elem| async move { elem < num_pes as f64 }); + assert_eq!(result.await, true); } - println!("-----"); - for chunk in cyclic_array.onesided_iter().chunks(10).into_iter() { - println!("{:?}", unsafe { chunk.as_slice() }); - } - - println!("-----"); - for (i, (a, b)) in cyclic_array - .onesided_iter() - .zip(block_array.onesided_iter()) - .into_iter() - .enumerate() - { - println!("{:?}: {:?} {:?}", i, a, b); - } - println!("-----"); - for (a, b) in cyclic_array - .onesided_iter() - .chunks(10) - .zip(block_array.onesided_iter().chunks(10)) - .into_iter() - { - unsafe { - println!("{:?} {:?}", a.as_slice(), b.as_slice()); - } - } - } - - println!("--------------------------------------------------------"); - - // let block_array = UnsafeArray::::new(world.team(), ARRAY_LEN, Distribution::Block); - // for elem in block_onesided_iter!($array,array).into_iter().step_by(4) {...} - // for elem in block_array.buffered_onesided_iter(10) {...} - - // //rust step_by pseudo code - // fn step_by(&mut self, n: usize) -> Result{ - // let val = self.next(); //grab val based on index - // self.index += n; - // val - // } - - // //-------------- - // for elem in 
block_array.onesided_iter().step_by(4).into_iter() {...} + }); } diff --git a/src/array/iterator/one_sided_iterator.rs b/src/array/iterator/one_sided_iterator.rs index 51b04e57..5b67edd3 100644 --- a/src/array/iterator/one_sided_iterator.rs +++ b/src/array/iterator/one_sided_iterator.rs @@ -108,8 +108,7 @@ pub trait OneSidedIterator: private::OneSidedIteratorInner { /// let world = LamellarWorldBuilder::new().build(); /// let array = LocalLockArray::::new(&world,24,Distribution::Block); /// let my_pe = world.my_pe(); - /// array.dist_iter_mut().for_each(move|e| *e = my_pe); //initialize array using a distributed iterator - /// array.wait_all(); + /// array.dist_iter_mut().for_each(move|e| *e = my_pe).block(); //initialize array using a distributed iterator /// if my_pe == 0 { /// for chunk in array.onesided_iter().chunks(5).into_iter() { //convert into a standard Iterator /// // SAFETY: chunk is safe in self instance because self will be the only handle to the memory region, @@ -142,8 +141,7 @@ pub trait OneSidedIterator: private::OneSidedIteratorInner { /// let world = LamellarWorldBuilder::new().build(); /// let array = LocalLockArray::::new(&world,8,Distribution::Block); /// let my_pe = world.my_pe(); - /// array.dist_iter_mut().for_each(move|e| *e = my_pe); //initialize array using a distributed iterator - /// array.wait_all(); + /// array.dist_iter_mut().for_each(move|e| *e = my_pe).block(); //initialize array using a distributed iterator /// if my_pe == 0 { /// for elem in array.onesided_iter().skip(3).into_iter() { //convert into a standard Iterator /// println!("PE: {my_pe} elem: {elem}"); @@ -174,8 +172,7 @@ pub trait OneSidedIterator: private::OneSidedIteratorInner { /// let world = LamellarWorldBuilder::new().build(); /// let array = LocalLockArray::::new(&world,8,Distribution::Block); /// let my_pe = world.my_pe(); - /// array.dist_iter_mut().for_each(move|e| *e = my_pe); //initialize array using a distributed iterator - /// array.wait_all(); + /// array.dist_iter_mut().for_each(move|e| *e = my_pe).block(); //initialize array using a distributed iterator /// if my_pe == 0 { /// for elem in array.onesided_iter().step_by(3).into_iter() { //convert into a standard Iterator /// println!("PE: {my_pe} elem: {elem}"); @@ -207,8 +204,8 @@ pub trait OneSidedIterator: private::OneSidedIteratorInner { /// let array_B: LocalLockArray = LocalLockArray::new(&world,12,Distribution::Block); /// let my_pe = world.my_pe(); /// //initialize arrays using a distributed iterator - /// array_A.dist_iter_mut().for_each(move|e| *e = my_pe); - /// array_B.dist_iter_mut().enumerate().for_each(move|(i,elem)| *elem = i); + /// let _ = array_A.dist_iter_mut().for_each(move|e| *e = my_pe).spawn(); + /// let _ = array_B.dist_iter_mut().enumerate().for_each(move|(i,elem)| *elem = i).spawn(); /// world.wait_all(); // instead of waiting on both arrays in separate calls, just wait for all tasks at the world level /// /// if my_pe == 0 { @@ -252,10 +249,9 @@ pub trait OneSidedIterator: private::OneSidedIteratorInner { /// let world = LamellarWorldBuilder::new().build(); /// let array = LocalLockArray::::new(&world,8,Distribution::Block); /// let my_pe = world.my_pe(); - /// array.dist_iter_mut().for_each(move|e| *e = my_pe); //initialize array using a distributed iterator - /// array.wait_all(); + /// array.dist_iter_mut().for_each(move|e| *e = my_pe).block(); //initialize array using a distributed iterator /// if my_pe == 0 { - /// let sum = onesided_iter!($array,array).into_iter().take(4).map(|elem| *elem as 
f64).sum::(); + /// let sum = array.onesided_iter().into_iter().take(4).map(|elem| *elem as f64).sum::(); /// println!("Sum: {sum}") /// } /// ``` @@ -284,23 +280,22 @@ pub trait OneSidedIterator: private::OneSidedIteratorInner { /// # Examples ///``` /// use lamellar::array::prelude::*; + /// use futures_util::stream::{StreamExt}; /// /// let world = LamellarWorldBuilder::new().build(); /// let array = LocalLockArray::::new(&world,8,Distribution::Block); /// let my_pe = world.my_pe(); - /// array.dist_iter_mut().for_each(move|e| *e = my_pe); //initialize array using a distributed iterator + /// let num_pes = world.num_pes(); + /// let _ =array.dist_iter_mut().for_each(move|e| *e = my_pe).spawn(); //initialize array using a distributed iterator /// array.wait_all(); + /// /// world.block_on (async move { - /// if my_pe == 0 { - /// let sum = array.onesided_iter().into_stream().take(4).map(|elem| *elem as f64).sum::().await; - /// println!("Sum: {sum}") - /// } - /// }); + /// if my_pe == 0 { + /// let result = array.onesided_iter().into_stream().take(4).map(|elem|*elem as f64).all(|elem|async move{ elem < num_pes as f64}); + /// assert_eq!(result.await, true); + /// } + /// }); /// ``` - /// Output on a 4 PE execution - ///```text - /// Sum: 2.0 - ///``` fn into_stream(mut self) -> OneSidedStream where Self: Sized + Send, @@ -324,7 +319,7 @@ pub trait OneSidedIterator: private::OneSidedIteratorInner { /// let world = LamellarWorldBuilder::new().build(); /// let array = AtomicArray::::new(&world,100,Distribution::Block); /// -/// let std_iter = onesided_iter!($array,array).into_iter(); +/// let std_iter = array.onesided_iter().into_iter(); /// for e in std_iter { /// println!("{e}"); /// } @@ -352,12 +347,12 @@ where /// # Examples ///``` /// use lamellar::array::prelude::*; -/// use futures::stream::StreamExt; +/// use futures_util::stream::StreamExt; /// /// let world = LamellarWorldBuilder::new().build(); /// let array = AtomicArray::::new(&world,100,Distribution::Block); /// world.block_on(async move { -/// let stream = array.onesided_iter().into_stream(); +/// let mut stream = array.onesided_iter().into_stream(); /// while let Some(e) = stream.next().await { /// println!("{e}"); /// } diff --git a/src/env_var.rs b/src/env_var.rs index 3553ee77..5e785336 100644 --- a/src/env_var.rs +++ b/src/env_var.rs @@ -74,6 +74,8 @@ fn default_batcher() -> String { } fn default_threads() -> usize { + #[cfg(doctest)] + return 1; match std::thread::available_parallelism() { Ok(n) => n.into(), Err(_) => 4, diff --git a/src/lib.rs b/src/lib.rs index 3315fc8a..a5c35b71 100755 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,6 @@ #![warn(missing_docs)] #![warn(unreachable_pub)] - +#![doc(test(attr(deny(unused_must_use))))] //! Lamellar is an investigation of the applicability of the Rust systems programming language for HPC as an alternative to C and C++, with a focus on PGAS approaches. //! //! # Some Nomenclature From 46918b81a63b0e5e9b768384f2617932f4f93bfd Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Fri, 4 Oct 2024 15:04:50 -0700 Subject: [PATCH 083/116] fixed local iter doc tests --- src/array/iterator/local_iterator.rs | 59 ++++++++----------- .../iterator/local_iterator/consumer/sum.rs | 14 ++--- src/array/unsafe/iteration/local.rs | 2 +- 3 files changed, 32 insertions(+), 43 deletions(-) diff --git a/src/array/iterator/local_iterator.rs b/src/array/iterator/local_iterator.rs index 9c53df04..01d7536f 100644 --- a/src/array/iterator/local_iterator.rs +++ b/src/array/iterator/local_iterator.rs @@ -108,7 +108,7 @@ pub trait LocalIteratorLauncher: InnerArray { consumer_impl!( sum(iter: &I); [LocalIterSumHandle]; - [I: LocalIterator + 'static, I::Item: SyncSend + std::iter::Sum + for<'a> std::iter::Sum<&'a I::Item> , ]); + [I: LocalIterator + 'static, I::Item: SyncSend + for<'a> std::iter::Sum<&'a I::Item> + std::iter::Sum, ]); //#[doc(hidden)] fn local_global_index_from_local(&self, index: usize, chunk_size: usize) -> Option { @@ -182,10 +182,10 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// let array = LocalLockArray::::new(&world,8,Distribution::Block); /// let my_pe = world.my_pe(); /// - /// let init_iter = array.local_iter_mut().for_each(move|e| *e = my_pe); //initialize array + /// let init_iter = array.local_iter_mut().for_each(move|e| *e = my_pe).spawn(); //initialize array /// let filter_iter = array.local_iter() /// .enumerate() //we can call enumerate before the filter - /// .filter(|(_,e)| **e%2 == 1).for_each(move|(i,e)| println!("PE: {my_pe} i: {i} elem: {e}")); + /// .filter(|(_,e)| **e%2 == 1).for_each(move|(i,e)| println!("PE: {my_pe} i: {i} elem: {e}")).spawn(); /// world.block_on(async move { /// init_iter.await; /// filter_iter.await; @@ -215,8 +215,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// let array = LocalLockArray::::new(&world,8,Distribution::Block); /// let my_pe = world.my_pe(); /// - /// array.local_iter_mut().for_each(move|e| *e = my_pe); //initialize array - /// array.wait_all(); + /// array.local_iter_mut().for_each(move|e| *e = my_pe).block(); //initialize array /// let filter_iter = array.local_iter() /// .enumerate() //we can call enumerate before the filter /// .filter_map(|(i,e)| { @@ -250,8 +249,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,8,Distribution::Block); /// let my_pe = world.my_pe(); /// - /// array.local_iter().map(|elem| *elem as f64).enumerate().for_each(move|(i,elem)| println!("PE: {my_pe} i: {i} elem: {elem}")); - /// array.wait_all(); + /// array.local_iter().enumerate().map(|(i,elem)| (i,*elem as f64)).for_each(move|(i,elem)| println!("PE: {my_pe} i: {i} elem: {elem}")).block(); ///``` /// Possible output on a 4 PE (1 thread/PE) execution (ordering is likey to be random with respect to PEs) ///```text @@ -284,8 +282,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// let array = LocalLockArray::::new(&world,16,Distribution::Block); /// let my_pe = world.my_pe(); /// - /// array.local_iter_mut().for_each(move|e| *e = my_pe); //initialize array - /// array.wait_all(); + /// array.local_iter_mut().for_each(move|e| *e = my_pe).block(); //initialize array /// let filter_iter = array.local_iter() /// .enumerate() //we can call enumerate before the filter /// .filter_map(|(i,e)| { @@ -348,8 +345,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// let world = LamellarWorldBuilder::new().build(); /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); /// - 
/// array.local_iter().for_each_with_schedule(Schedule::WorkStealing, |elem| println!("{:?} {elem}",std::thread::current().id())); - /// array.wait_all(); + /// array.local_iter().for_each_with_schedule(Schedule::WorkStealing, |elem| println!("{:?} {elem}",std::thread::current().id())).block(); ///``` #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] fn for_each_with_schedule(&self, sched: Schedule, op: F) -> LocalIterForEachHandle @@ -419,8 +415,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// array.local_iter().for_each_async_with_schedule(Schedule::Chunk(10),|elem| async move { /// async_std::task::yield_now().await; /// println!("{:?} {elem}",std::thread::current().id()) - /// }); - /// array.wait_all(); + /// }).block(); ///``` #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] fn for_each_async_with_schedule(&self, sched: Schedule, op: F) -> LocalIterForEachHandle @@ -443,7 +438,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// let world = LamellarWorldBuilder::new().build(); /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); /// - /// let req = array.local_iter().reduce(|acc,elem| acc+elem); + /// let req = array.local_iter().map(|elem| *elem).reduce(|acc,elem| acc+elem); /// let sum = array.block_on(req); //wait on the collect request to get the new array ///``` #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] @@ -468,7 +463,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// let world = LamellarWorldBuilder::new().build(); /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); /// - /// let req = array.local_iter().reduce_with_schedule(Schedule::Chunk(10),|acc,elem| acc+elem); + /// let req = array.local_iter().map(|elem| *elem).reduce_with_schedule(Schedule::Chunk(10),|acc,elem| acc+elem); /// let sum = array.block_on(req); //wait on the collect request to get the new array ///``` #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] @@ -498,7 +493,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); /// /// let array_clone = array.clone(); - /// let req = array.local_iter().map(elem.load()).filter(|elem| elem % 2 == 0).collect::>(Distribution::Cyclic); + /// let req = array.local_iter().map(|elem|elem.load()).filter(|elem| elem % 2 == 0).collect::>(Distribution::Cyclic); /// let new_array = array.block_on(req); ///``` #[must_use = "this iteration adapter is lazy and does nothing unless awaited. 
Either await the returned future, or call 'spawn()' or 'block()' on it."] @@ -524,7 +519,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); /// /// let array_clone = array.clone(); - /// let req = array.local_iter().map(elem.load()).filter(|elem| elem % 2 == 0).collect_with_schedule::>(Scheduler::WorkStealing,Distribution::Cyclic); + /// let req = array.local_iter().map(|elem|elem.load()).filter(|elem| elem % 2 == 0).collect_with_schedule::>(Schedule::WorkStealing,Distribution::Cyclic); /// let new_array = array.block_on(req); ///``` #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] @@ -623,7 +618,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// move |elem| /// array_clone /// .fetch_add(elem.load(),1000)) - /// .collect_async_with_schedule::,_>(Scheduler::Dynamic, Distribution::Cyclic); + /// .collect_async_with_schedule::,_>(Schedule::Dynamic, Distribution::Cyclic); /// let _new_array = array.block_on(req); ///``` #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] @@ -697,13 +692,13 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// let world = LamellarWorldBuilder::new().build(); /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); /// - /// let req = array.local_iter().sum(); + /// let req = array.local_iter().map(|elem| *elem).sum().spawn(); /// let sum = array.block_on(req); //wait on the collect request to get the new array ///``` #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] fn sum(&self) -> LocalIterSumHandle where - Self::Item: SyncSend + std::iter::Sum + for<'a> std::iter::Sum<&'a Self::Item>, + Self::Item: SyncSend + for<'a> std::iter::Sum<&'a Self::Item> + std::iter::Sum, { self.array().sum(self) } @@ -724,13 +719,13 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// let world = LamellarWorldBuilder::new().build(); /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); /// - /// let req = array.local_iter().sum_with_schedule(Schedule::Guided); + /// let req = array.local_iter().map(|elem| *elem).sum_with_schedule(Schedule::Guided); /// let sum = array.block_on(req); ///``` #[must_use = "this iteration adapter is lazy and does nothing unless awaited. 
Either await the returned future, or call 'spawn()' or 'block()' on it."] fn sum_with_schedule(&self, sched: Schedule) -> LocalIterSumHandle where - Self::Item: SyncSend + std::iter::Sum + for<'a> std::iter::Sum<&'a Self::Item>, + Self::Item: SyncSend + for<'a> std::iter::Sum<&'a Self::Item> + std::iter::Sum, { self.array().sum_with_schedule(sched, self) } @@ -748,8 +743,7 @@ pub trait IndexedLocalIterator: LocalIterator + SyncSend + IterClone + 'static { /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,8,Distribution::Block); /// let my_pe = world.my_pe(); /// - /// array.local_iter().enumerate().for_each(move|(i,elem)| println!("PE: {my_pe} i: {i} elem: {elem}")); - /// array.wait_all(); + /// array.local_iter().enumerate().for_each(move|(i,elem)| println!("PE: {my_pe} i: {i} elem: {elem}")).block(); ///``` /// Possible output on a 4 PE (1 thread/PE) execution (ordering is likey to be random with respect to PEs) ///```text @@ -788,8 +782,7 @@ pub trait IndexedLocalIterator: LocalIterator + SyncSend + IterClone + 'static { /// array.local_iter().chunks(5).enumerate().for_each(move|(i,chunk)| { /// let chunk_vec: Vec = chunk.map(|elem| *elem).collect(); /// println!("PE: {my_pe} i: {i} chunk: {chunk_vec:?}"); - /// }); - /// array.wait_all(); + /// }).block(); /// ``` /// Possible output on a 4 PE (1 thread/PE) execution (ordering is likey to be random with respect to PEs) ///```text @@ -848,8 +841,7 @@ pub trait IndexedLocalIterator: LocalIterator + SyncSend + IterClone + 'static { /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,16,Distribution::Block); /// let my_pe = world.my_pe(); /// - /// array.local_iter().enumerate().skip(3).for_each(move|(i,elem)| println!("PE: {my_pe} i: {i} elem: {elem}")); - /// array.wait_all(); + /// array.local_iter().enumerate().skip(3).for_each(move|(i,elem)| println!("PE: {my_pe} i: {i} elem: {elem}")).block(); ///``` /// Possible output on a 4 PE (1 thread/PE) execution (ordering is likey to be random with respect to PEs) ///```text @@ -872,7 +864,7 @@ pub trait IndexedLocalIterator: LocalIterator + SyncSend + IterClone + 'static { /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,28,Distribution::Block); /// let my_pe = world.my_pe(); /// - /// array.local_iter().enumerate().step_by(3).for_each(move|(i,elem)| println!("PE: {my_pe} i: {i} elem: {elem}")); + /// let _ =array.local_iter().enumerate().step_by(3).for_each(move|(i,elem)| println!("PE: {my_pe} i: {i} elem: {elem}")).spawn(); /// array.wait_all(); ///``` /// Possible output on a 4 PE (1 thread/PE) execution (ordering is likey to be random with respect to PEs) @@ -904,8 +896,7 @@ pub trait IndexedLocalIterator: LocalIterator + SyncSend + IterClone + 'static { /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,16,Distribution::Block); /// let my_pe = world.my_pe(); /// - /// array.local_iter().enumerate().take(3).for_each(move|(i,elem)| println!("PE: {my_pe} i: {i} elem: {elem}")); - /// array.wait_all(); + /// array.local_iter().enumerate().take(3).for_each(move|(i,elem)| println!("PE: {my_pe} i: {i} elem: {elem}")).block(); ///``` /// Possible output on a 4 PE (1 thread/PE) execution (ordering is likey to be random with respect to PEs) ///```text @@ -939,11 +930,9 @@ pub trait IndexedLocalIterator: LocalIterator + SyncSend + IterClone + 'static { /// let my_pe = world.my_pe(); /// /// //initalize array_B - /// array_B.local_iter_mut().enumerate().for_each(move|(i,elem)| *elem = i); - /// array_B.wait_all(); + /// 
array_B.local_iter_mut().enumerate().for_each(move|(i,elem)| *elem = i).block(); /// - /// array_A.local_iter().zip(array_B.local_iter()).for_each(move|(elem_A,elem_B)| println!("PE: {my_pe} A: {elem_A} B: {elem_B}")); - /// array_A.wait_all(); + /// array_A.local_iter().zip(array_B.local_iter()).for_each(move|(elem_A,elem_B)| println!("PE: {my_pe} A: {elem_A} B: {elem_B}")).block(); ///``` /// Possible output on a 4 PE (1 thread/PE) execution (ordering is likey to be random with respect to PEs) ///```text diff --git a/src/array/iterator/local_iterator/consumer/sum.rs b/src/array/iterator/local_iterator/consumer/sum.rs index e13747b1..366d76f9 100644 --- a/src/array/iterator/local_iterator/consumer/sum.rs +++ b/src/array/iterator/local_iterator/consumer/sum.rs @@ -31,7 +31,7 @@ impl IterClone for Sum { impl IterConsumer for Sum where I: LocalIterator + 'static, - I::Item: SyncSend + std::iter::Sum, + I::Item: SyncSend + for<'a> std::iter::Sum<&'a I::Item> + std::iter::Sum, { type AmOutput = I::Item; type Output = I::Item; @@ -79,7 +79,7 @@ enum InnerState { impl Future for InnerLocalIterSumHandle where - T: SyncSend + std::iter::Sum + for<'a> std::iter::Sum<&'a T> + 'static, + T: SyncSend + for<'a> std::iter::Sum<&'a T> + std::iter::Sum + 'static, { type Output = T; fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { @@ -110,7 +110,7 @@ where //#[doc(hidden)] impl LamellarRequest for InnerLocalIterSumHandle where - T: SyncSend + std::iter::Sum + for<'a> std::iter::Sum<&'a T> + 'static, + T: SyncSend + for<'a> std::iter::Sum<&'a T> + std::iter::Sum + 'static, { fn blocking_wait(mut self) -> Self::Output { self.reqs @@ -143,7 +143,7 @@ pub struct LocalIterSumHandle { impl LocalIterSumHandle where - T: SyncSend + std::iter::Sum + for<'a> std::iter::Sum<&'a T> + 'static, + T: SyncSend + for<'a> std::iter::Sum<&'a T> + std::iter::Sum + 'static, { pub(crate) fn new( inner: Pin> + Send>>, @@ -176,7 +176,7 @@ enum State { } impl Future for LocalIterSumHandle where - T: SyncSend + std::iter::Sum + for<'a> std::iter::Sum<&'a T> + 'static, + T: SyncSend + for<'a> std::iter::Sum<&'a T> + std::iter::Sum + 'static, { type Output = T; fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { @@ -203,7 +203,7 @@ where //#[doc(hidden)] impl LamellarRequest for LocalIterSumHandle where - T: SyncSend + std::iter::Sum + for<'a> std::iter::Sum<&'a T> + 'static, + T: SyncSend + for<'a> std::iter::Sum<&'a T> + std::iter::Sum + 'static, { fn blocking_wait(self) -> Self::Output { match self.state { @@ -249,7 +249,7 @@ impl IterClone for SumAm { impl LamellarAm for SumAm where I: LocalIterator + 'static, - I::Item: SyncSend + std::iter::Sum, + I::Item: SyncSend + for<'a> std::iter::Sum<&'a I::Item> + std::iter::Sum, { async fn exec(&self) -> I::Item { let iter = self.schedule.init_iter(self.iter.iter_clone(Sealed)); diff --git a/src/array/unsafe/iteration/local.rs b/src/array/unsafe/iteration/local.rs index ff52ac26..f691fab7 100644 --- a/src/array/unsafe/iteration/local.rs +++ b/src/array/unsafe/iteration/local.rs @@ -145,7 +145,7 @@ impl LocalIteratorLauncher for UnsafeArrayInner { consumer_impl!( sum(iter: &I); [LocalIterSumHandle]; - [I: LocalIterator + 'static, I::Item: SyncSend + std::iter::Sum + for<'a> std::iter::Sum<&'a I::Item> , ]; + [I: LocalIterator + 'static, I::Item: SyncSend + for<'a> std::iter::Sum<&'a I::Item> + std::iter::Sum , ]; [ Sum { iter: iter.iter_clone(Sealed), From ec73fb9e0dc6f8704e64592fb9d325acfc06fd95 Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Mon, 7 Oct 2024 10:26:59 -0700 Subject: [PATCH 084/116] fix distributed iteration doc tests --- src/array/iterator/distributed_iterator.rs | 41 +++++++++------------- 1 file changed, 17 insertions(+), 24 deletions(-) diff --git a/src/array/iterator/distributed_iterator.rs b/src/array/iterator/distributed_iterator.rs index 91b4a6fe..fe7ee6d4 100644 --- a/src/array/iterator/distributed_iterator.rs +++ b/src/array/iterator/distributed_iterator.rs @@ -188,7 +188,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// let array = LocalLockArray::::new(&world,8,Distribution::Block); /// let my_pe = world.my_pe(); /// - /// let init_iter = array.dist_iter_mut().for_each(move|e| *e = my_pe); //initialize array + /// let init_iter = array.dist_iter_mut().for_each(move|e| *e = my_pe).spawn(); //initialize array /// let filter_iter = array.dist_iter() /// .enumerate() //we can call enumerate before the filter /// .filter(|(_,e)| *e%2 == 1).for_each(move|(i,e)| println!("PE: {my_pe} i: {i} elem: {e}")); @@ -221,8 +221,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// let array = LocalLockArray::::new(&world,8,Distribution::Block); /// let my_pe = world.my_pe(); /// - /// array.dist_iter_mut().for_each(move|e| *e = my_pe); //initialize array - /// array.wait_all(); + /// array.dist_iter_mut().for_each(move|e| *e = my_pe).block(); /// let filter_iter = array.dist_iter() /// .enumerate() //we can call enumerate before the filter /// .filter_map(|(i,e)| { @@ -256,8 +255,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,8,Distribution::Block); /// let my_pe = world.my_pe(); /// - /// array.dist_iter().map(|elem| *elem as f64).enumerate().for_each(move|(i,elem)| println!("PE: {my_pe} i: {i} elem: {elem}")); - /// array.wait_all(); + /// array.dist_iter().map(|elem| *elem as f64).monotonic().for_each(move|(i,elem)| println!("PE: {my_pe} i: {i} elem: {elem}")).block(); ///``` /// Possible output on a 4 PE (1 thread/PE) execution (ordering is likey to be random with respect to PEs) ///```text @@ -290,8 +288,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// let array = LocalLockArray::::new(&world,16,Distribution::Block); /// let my_pe = world.my_pe(); /// - /// array.local_iter_mut().for_each(move|e| *e = my_pe); //initialize array - /// array.wait_all(); + /// array.local_iter_mut().for_each(move|e| *e = my_pe).block(); /// let filter_iter = array.local_iter() /// .enumerate() //we can call enumerate before the filter /// .filter_map(|(i,e)| { @@ -425,7 +422,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// let world = LamellarWorldBuilder::new().build(); /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); /// - /// let iter = array.dist_iter().spawn_for_each_async_with_schedule(Schedule::Chunk(10),|elem| async move { + /// let iter = array.dist_iter().for_each_async_with_schedule(Schedule::Chunk(10),|elem| async move { /// async_std::task::yield_now().await; /// println!("{:?} {elem}",std::thread::current().id()) /// }); @@ -454,7 +451,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// let world = LamellarWorldBuilder::new().build(); /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); /// - /// let req = array.dist_iter().reduce(|acc,elem| acc+elem); + /// let req = array.dist_iter().map(|elem| *elem).reduce(|acc,elem| acc+elem); /// let sum = 
array.block_on(req); //wait on the collect request to get the new array ///``` #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] @@ -480,7 +477,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// let world = LamellarWorldBuilder::new().build(); /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); /// - /// let req = array.dist_iter().reduce_with_schedule(Schedule::Static,|acc,elem| acc+elem); + /// let req = array.dist_iter().map(|elem| *elem).reduce_with_schedule(Schedule::Static,|acc,elem| acc+elem); /// let sum = array.block_on(req); //wait on the collect request to get the new array ///``` #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] @@ -551,7 +548,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// /// let req = array.dist_iter() /// .map(|elem| *elem) //because of constraints of collect we need to convert from &usize to usize - /// .filter(|elem| *elem < 10) // (if we didnt do the previous map we would have needed to do **elem) + /// .filter(|elem| * elem < 10) // (if we didnt do the previous map we would have needed to do **elem) /// .collect::>(Distribution::Block); /// let new_array = array.block_on(req); //wait on the collect request to get the new array ///``` @@ -651,7 +648,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// move |elem| /// array_clone /// .fetch_add(elem.load(),1000)) - /// .collect_async_with_schedule::,_>(Scheduler::Dynamic, Distribution::Cyclic); + /// .collect_async_with_schedule::,_>(Schedule::Dynamic, Distribution::Cyclic); /// let _new_array = array.block_on(req); ///``` #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] @@ -683,7 +680,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// let world = LamellarWorldBuilder::new().build(); /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); /// - /// let req = array.dist_iter().filter(|elem| elem < 10).count(); + /// let req = array.dist_iter().filter(|elem| **elem < 10).count(); /// let cnt = array.block_on(req); //wait on the collect request to get the new array ///``` #[must_use = "this iteration adapter is lazy and does nothing unless awaited. 
Either await the returned future, or call 'spawn()' or 'block()' on it "] @@ -705,7 +702,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// let world = LamellarWorldBuilder::new().build(); /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); /// - /// let req = array.dist_iter().filter(|elem| elem < 10).count_with_schedule(Schedule::Dynamic); + /// let req = array.dist_iter().filter(|elem| **elem < 10).count_with_schedule(Schedule::Dynamic); /// let cnt = array.block_on(req); //wait on the collect request to get the new array ///``` fn count_with_schedule(&self, sched: Schedule) -> DistIterCountHandle { @@ -730,7 +727,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// let world = LamellarWorldBuilder::new().build(); /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); /// - /// let req = array.dist_iter().sum(); + /// let req = array.dist_iter().map(|elem| *elem).sum(); /// let sum = array.block_on(req); //wait on the collect request to get the new array ///``` #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] @@ -759,7 +756,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { /// let world = LamellarWorldBuilder::new().build(); /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); /// - /// let req = array.dist_iter().sum_with_schedule(Schedule::Guided); + /// let req = array.dist_iter().map(|elem| *elem).sum_with_schedule(Schedule::Guided); /// let sum = array.block_on(req); //wait on the collect request to get the new array ///``` #[must_use = "this iteration adapter is lazy and does nothing unless awaited. 
Either await the returned future, or call 'spawn()' or 'block()' on it "] @@ -783,8 +780,7 @@ pub trait IndexedDistributedIterator: DistributedIterator + SyncSend + IterClone /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,8,Distribution::Block); /// let my_pe = world.my_pe(); /// - /// array.dist_iter().enumerate().for_each(move|(i,elem)| println!("PE: {my_pe} i: {i} elem: {elem}")); - /// array.wait_all(); + /// array.dist_iter().enumerate().for_each(move|(i,elem)| println!("PE: {my_pe} i: {i} elem: {elem}")).block(); ///``` /// Possible output on a 4 PE (1 thread/PE) execution (ordering is likey to be random with respect to PEs) ///```text @@ -811,8 +807,7 @@ pub trait IndexedDistributedIterator: DistributedIterator + SyncSend + IterClone /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,8,Distribution::Block); /// let my_pe = world.my_pe(); /// - /// array.dist_iter().enumerate().skip(3).for_each(move|(i,elem)| println!("PE: {my_pe} i: {i} elem: {elem}")); - /// array.wait_all(); + /// array.dist_iter().enumerate().skip(3).for_each(move|(i,elem)| println!("PE: {my_pe} i: {i} elem: {elem}")).block(); ///``` /// Possible output on a 4 PE (1 thread/PE) execution (ordering is likey to be random with respect to PEs) ///```text @@ -836,8 +831,7 @@ pub trait IndexedDistributedIterator: DistributedIterator + SyncSend + IterClone /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,8,Distribution::Block); /// let my_pe = world.my_pe(); /// - /// array.dist_iter().enumerate().step_by(3).for_each(move|(i,elem)| println!("PE: {my_pe} i: {i} elem: {elem}")); - /// array.wait_all(); + /// array.dist_iter().enumerate().step_by(3).for_each(move|(i,elem)| println!("PE: {my_pe} i: {i} elem: {elem}")).block(); ///``` /// Possible output on a 4 PE (1 thread/PE) execution (ordering is likey to be random with respect to PEs) ///```text @@ -859,8 +853,7 @@ pub trait IndexedDistributedIterator: DistributedIterator + SyncSend + IterClone /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,8,Distribution::Block); /// let my_pe = world.my_pe(); /// - /// array.dist_iter().enumerate().take(3).for_each(move|(i,elem)| println!("PE: {my_pe} i: {i} elem: {elem}")); - /// array.wait_all(); + /// array.dist_iter().enumerate().take(3).for_each(move|(i,elem)| println!("PE: {my_pe} i: {i} elem: {elem}")).block(); ///``` /// Possible output on a 4 PE (1 thread/PE) execution (ordering is likey to be random with respect to PEs) ///```text From c436f88f7aa5f4b4df2758b4b600ef31885f5945 Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Mon, 7 Oct 2024 10:31:36 -0700 Subject: [PATCH 085/116] fix iteration doc tests --- src/array/iterator/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/array/iterator/mod.rs b/src/array/iterator/mod.rs index 389abc9d..57d59234 100644 --- a/src/array/iterator/mod.rs +++ b/src/array/iterator/mod.rs @@ -115,7 +115,7 @@ pub trait LamellarArrayIterators { /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic); /// /// if my_pe == 0 { - /// for elem in onesided_iter!($array,array).into_iter() { //"into_iter()" converts into a standard Rust Iterator + /// for elem in array.onesided_iter().into_iter() { //"into_iter()" converts into a standard Rust Iterator /// println!("PE{my_pe} elem {elem}"); /// } /// } From e578726e6cfc0464ef4290be2fd3188dccd3f759 Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Mon, 7 Oct 2024 11:03:14 -0700 Subject: [PATCH 086/116] fix atomic array doc tests --- src/array.rs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/array.rs b/src/array.rs index 00eb7e57..751ef69e 100644 --- a/src/array.rs +++ b/src/array.rs @@ -1078,7 +1078,7 @@ pub trait LamellarArray: private::LamellarArrayPrivate + ActiveMessa ///```no_run //assert is for 4 PEs /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic); /// /// assert_eq!(25,array.num_elems_local()); ///``` @@ -1221,13 +1221,13 @@ pub trait LamellarArray: private::LamellarArrayPrivate + ActiveMessa /// let cyclic_array: UnsafeArray = UnsafeArray::new(world,16,Distribution::Cyclic); /// // cyclic array index location = PE0 [0,4,8,12], PE1 [1,5,9,13], PE2 [2,6,10,14], PE3 [3,7,11,15] /// let Some((pe,offset)) = cyclic_array.pe_and_offset_for_global_index(6) else { panic!("out of bounds");}; - /// let index = block_array.first_global_index_for_pe(0).unwrap(); + /// let index = cyclic_array.first_global_index_for_pe(0).unwrap(); /// assert_eq!(index , 0); - /// let index = block_array.first_global_index_for_pe(1).unwrap(); + /// let index = cyclic_array.first_global_index_for_pe(1).unwrap(); /// assert_eq!(index , 1); - /// let index = block_array.first_global_index_for_pe(2).unwrap(); + /// let index = cyclic_array.first_global_index_for_pe(2).unwrap(); /// assert_eq!(index , 2); - /// let index = block_array.first_global_index_for_pe(3).unwrap(); + /// let index = cyclic_array.first_global_index_for_pe(3).unwrap(); /// assert_eq!(index , 3); ///``` fn first_global_index_for_pe(&self, pe: usize) -> Option; @@ -1265,13 +1265,13 @@ pub trait LamellarArray: private::LamellarArrayPrivate + ActiveMessa /// let cyclic_array: UnsafeArray = UnsafeArray::new(world,16,Distribution::Cyclic); /// // cyclic array index location = PE0 [0,4,8,12], PE1 [1,5,9,13], PE2 [2,6,10,14], PE3 [3,7,11,15] /// let Some((pe,offset)) = cyclic_array.pe_and_offset_for_global_index(6) else { panic!("out of bounds");}; - /// let index = block_array.last_global_index_for_pe(0).unwrap(); + /// let index = cyclic_array.last_global_index_for_pe(0).unwrap(); /// assert_eq!(index , 12); - /// let index = block_array.last_global_index_for_pe(1).unwrap(); + /// let index = cyclic_array.last_global_index_for_pe(1).unwrap(); /// assert_eq!(index , 13); - /// let index = block_array.last_global_index_for_pe(2).unwrap(); + /// let index = cyclic_array.last_global_index_for_pe(2).unwrap(); /// assert_eq!(index , 14); - /// let index = block_array.last_global_index_for_pe(3).unwrap(); + /// let index = cyclic_array.last_global_index_for_pe(3).unwrap(); /// assert_eq!(index , 15); ///``` fn last_global_index_for_pe(&self, pe: usize) -> Option; From 3818e0ab39c117e3f74b872cea4ec29d5ee3f73c Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Mon, 7 Oct 2024 11:51:06 -0700 Subject: [PATCH 087/116] fix atomic array doc tests --- src/array.rs | 4 ++-- src/array/atomic.rs | 30 +++++++++++++++++------------- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/src/array.rs b/src/array.rs index 751ef69e..a7c25a46 100644 --- a/src/array.rs +++ b/src/array.rs @@ -194,8 +194,8 @@ crate::inventory::collect!(ReduceKey); lamellar_impl::generate_reductions_for_type_rt!(true, u8, usize); lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, usize); -// lamellar_impl::generate_reductions_for_type_rt!(false, f64); -// lamellar_impl::generate_ops_for_type_rt!(false, false, false, f64); +lamellar_impl::generate_reductions_for_type_rt!(false, f32); +lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32); // lamellar_impl::generate_reductions_for_type_rt!(false, u128); // lamellar_impl::generate_ops_for_type_rt!(true, false, true, u128); diff --git a/src/array/atomic.rs b/src/array/atomic.rs index f2a8fa4b..f404d116 100644 --- a/src/array/atomic.rs +++ b/src/array/atomic.rs @@ -1301,11 +1301,13 @@ impl AtomicArray { /// let num_pes = world.num_pes(); /// let array = AtomicArray::::new(&world,1000000,Distribution::Block); /// let array_clone = array.clone(); - /// let req = array.local_iter().for_each(move |_| { + /// let _ = array.local_iter().for_each(move |_| { /// let index = rand::thread_rng().gen_range(0..array_clone.len()); /// array_clone.add(index,1); //randomly at one to an element in the array. - /// }); - /// let sum = array.block_on(array.reduce("sum")); // equivalent to calling array.sum() + /// }).block(); + /// world.wait_all(); + /// world.barrier(); + /// let sum = array.block_on(array.reduce("sum")).expect("array has length > 0"); // equivalent to calling array.sum() /// assert_eq!(array.len()*num_pes,sum); ///``` #[must_use = "this function is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] @@ -1348,11 +1350,13 @@ impl AtomicArray { /// let num_pes = world.num_pes(); /// let array = AtomicArray::::new(&world,1000000,Distribution::Block); /// let array_clone = array.clone(); - /// let req = array.local_iter().for_each(move |_| { + /// let _ = array.local_iter().for_each(move |_| { /// let index = rand::thread_rng().gen_range(0..array_clone.len()); - /// array_clone.add(index,1); //randomly at one to an element in the array. - /// }); - /// let sum = array.block_on(array.sum()); + /// array_clone.add(index,1); //randomly add one to an element in the array. + /// }).block(); + /// world.wait_all(); + /// world.barrier(); + /// let sum = array.block_on(array.sum()).expect("array has length > 0"); /// assert_eq!(array.len()*num_pes,sum); /// ``` #[must_use = "this function is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] @@ -1393,10 +1397,10 @@ impl AtomicArray { /// let array = AtomicArray::::new(&world,10,Distribution::Block); /// let req = array.dist_iter().enumerate().for_each(move |(i,elem)| { /// elem.store(i+1); - /// }); + /// }).spawn(); /// array.wait_all(); /// array.barrier(); - /// let prod = array.block_on(array.prod()); + /// let prod = array.block_on(array.prod()).expect("array has length > 0"); /// assert_eq!((1..=array.len()).product::(),prod); ///``` #[must_use = "this function is lazy and does nothing unless awaited. 
Either await the returned future, or call 'spawn()' or 'block()' on it "] @@ -1436,8 +1440,8 @@ impl AtomicArray { /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); /// let array = AtomicArray::::new(&world,10,Distribution::Block); - /// let req = array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2)); - /// let max = array.block_on(array.max()); + /// let req = array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2)).block(); + /// let max = array.block_on(array.max()).expect("array has length > 0"); /// assert_eq!((array.len()-1)*2,max); ///``` #[must_use = "this function is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] @@ -1478,8 +1482,8 @@ impl AtomicArray { /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); /// let array = AtomicArray::::new(&world,10,Distribution::Block); - /// let req = array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2)); - /// let min = array.block_on(array.min()); + /// let _ = array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2)).block();; + /// let min = array.block_on(array.min()).expect("array has length > 0"); /// assert_eq!(0,min); ///``` #[must_use = "this function is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] From 0025278c8747a660f0234afce4078a617d37c9e5 Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Mon, 7 Oct 2024 12:09:53 -0700 Subject: [PATCH 088/116] fix global lock array doc tests --- src/array/global_lock_atomic.rs | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/array/global_lock_atomic.rs b/src/array/global_lock_atomic.rs index 35fbdb44..8a3459fc 100644 --- a/src/array/global_lock_atomic.rs +++ b/src/array/global_lock_atomic.rs @@ -200,7 +200,7 @@ impl GlobalLockLocalData { /// let my_pe = world.my_pe(); /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); /// - /// let local_data = array.read_local_data(); + /// let local_data = array.blocking_read_lock().local_data(); /// let sub_data = local_data.clone().into_sub_data(10,20); // clone() essentially increases the references to the read lock by 1. /// assert_eq!(local_data[10],sub_data[0]); ///``` @@ -766,7 +766,7 @@ impl GlobalLockArray { /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); /// /// let array1 = array.clone(); - /// let slice = array1.local_data(); + /// let slice = array1.blocking_read_lock().local_data(); /// /// // no borrows to this specific instance (array) so it can enter the "into_unsafe" call /// // but array1 will not be dropped until after 'slice' is dropped. @@ -814,7 +814,7 @@ impl GlobalLockArray { /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); /// /// let array1 = array.clone(); - /// let slice = unsafe {array1.local_data()}; + /// let slice = array1.blocking_read_lock().local_data(); /// /// // no borrows to this specific instance (array) so it can enter the "into_read_only" call /// // but array1 will not be dropped until after mut_slice is dropped. 
@@ -858,7 +858,7 @@ impl GlobalLockArray { /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); /// /// let array1 = array.clone(); - /// let slice = unsafe {array1.local_data()}; + /// let slice = array1.blocking_read_lock().local_data(); /// /// // no borrows to this specific instance (array) so it can enter the "into_read_only" call /// // but array1 will not be dropped until after mut_slice is dropped. @@ -904,7 +904,7 @@ impl GlobalLockArray { /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); /// /// let array1 = array.clone(); - /// let slice = unsafe {array1.local_data()}; + /// let slice = array1.blocking_read_lock().local_data(); /// /// // no borrows to this specific instance (array) so it can enter the "into_atomic" call /// // but array1 will not be dropped until after mut_slice is dropped. @@ -1290,9 +1290,9 @@ impl GlobalLockReadGuard { /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); /// let array = GlobalLockArray::::new(&world,10,Distribution::Block); - /// array.block_on(array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2))); + /// array.block_on(array.dist_iter_mut().enumerate().for_each(move |(i,elem)| *elem = i*2)); /// let read_guard = array.blocking_read_lock(); - /// let prod = array.block_on(read_guard.reduce("prod")); + /// let prod = array.block_on(read_guard.reduce("prod")).expect("array has > 0 elements"); ///``` #[must_use = "this function is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] pub fn reduce(self, op: &str) -> GlobalLockArrayReduceHandle { @@ -1323,9 +1323,9 @@ impl GlobalLockReadGuard { /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); /// let array = GlobalLockArray::::new(&world,10,Distribution::Block); - /// array.block_on(array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2))); + /// array.block_on(array.dist_iter_mut().enumerate().for_each(move |(i,elem)| *elem = i*2)); /// let read_guard = array.blocking_read_lock(); - /// let sum = array.block_on(read_guard.sum()); + /// let sum = array.block_on(read_guard.sum()).expect("array has > 0 elements"); /// ``` #[must_use = "this function is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] pub fn sum(self) -> GlobalLockArrayReduceHandle { @@ -1351,9 +1351,9 @@ impl GlobalLockReadGuard { /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); /// let array = GlobalLockArray::::new(&world,10,Distribution::Block); - /// array.block_on(array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2))); + /// array.block_on(array.dist_iter_mut().enumerate().for_each(move |(i,elem)| *elem = i+1)); /// let read_guard = array.blocking_read_lock(); - /// let prod = array.block_on(read_guard.prod()); + /// let prod = array.block_on(read_guard.prod()).expect("array has > 0 elements"); /// assert_eq!((1..=array.len()).product::(),prod); ///``` #[must_use = "this function is lazy and does nothing unless awaited. 
Either await the returned future, or call 'spawn()' or 'block()' on it "] @@ -1381,9 +1381,9 @@ impl GlobalLockReadGuar /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); /// let array = GlobalLockArray::::new(&world,10,Distribution::Block); - /// array.block_on(array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2))); + /// array.block_on(array.dist_iter_mut().enumerate().for_each(move |(i,elem)| *elem = i*2)); /// let read_guard = array.blocking_read_lock(); - /// let max = array.block_on(read_guard.max()); + /// let max = array.block_on(read_guard.max()).expect("array has > 0 elements"); /// assert_eq!((array.len()-1)*2,max); ///``` #[must_use = "this function is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] @@ -1410,9 +1410,9 @@ impl GlobalLockReadGuar /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); /// let array = GlobalLockArray::::new(&world,10,Distribution::Block); - /// array.block_on(array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2))); + /// array.block_on(array.dist_iter_mut().enumerate().for_each(move |(i,elem)| *elem = i*2)); /// let read_guard = array.blocking_read_lock(); - /// let min = array.block_on(read_guard.min()); + /// let min = array.block_on(read_guard.min()).expect("array has > 0 elements"); /// assert_eq!(0,min); ///``` #[must_use = "this function is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] From e64358beec4c48b1a9b1fd0cf4d523342280e405 Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Mon, 7 Oct 2024 12:28:51 -0700 Subject: [PATCH 089/116] fix local lock array doc tests --- src/array/global_lock_atomic.rs | 10 ++++---- src/array/local_lock_atomic.rs | 26 ++++++++++----------- src/array/local_lock_atomic/local_chunks.rs | 17 +++++++------- 3 files changed, 26 insertions(+), 27 deletions(-) diff --git a/src/array/global_lock_atomic.rs b/src/array/global_lock_atomic.rs index 8a3459fc..51800bb3 100644 --- a/src/array/global_lock_atomic.rs +++ b/src/array/global_lock_atomic.rs @@ -200,7 +200,7 @@ impl GlobalLockLocalData { /// let my_pe = world.my_pe(); /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); /// - /// let local_data = array.blocking_read_lock().local_data(); + /// let local_data = array.blocking_read_local_data(); /// let sub_data = local_data.clone().into_sub_data(10,20); // clone() essentially increases the references to the read lock by 1. /// assert_eq!(local_data[10],sub_data[0]); ///``` @@ -766,7 +766,7 @@ impl GlobalLockArray { /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); /// /// let array1 = array.clone(); - /// let slice = array1.blocking_read_lock().local_data(); + /// let slice = array1.blocking_read_local_data(); /// /// // no borrows to this specific instance (array) so it can enter the "into_unsafe" call /// // but array1 will not be dropped until after 'slice' is dropped. @@ -814,7 +814,7 @@ impl GlobalLockArray { /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); /// /// let array1 = array.clone(); - /// let slice = array1.blocking_read_lock().local_data(); + /// let slice = array1.blocking_read_local_data(); /// /// // no borrows to this specific instance (array) so it can enter the "into_read_only" call /// // but array1 will not be dropped until after mut_slice is dropped. 
@@ -858,7 +858,7 @@ impl GlobalLockArray { /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); /// /// let array1 = array.clone(); - /// let slice = array1.blocking_read_lock().local_data(); + /// let slice = array1.blocking_read_local_data(); /// /// // no borrows to this specific instance (array) so it can enter the "into_read_only" call /// // but array1 will not be dropped until after mut_slice is dropped. @@ -904,7 +904,7 @@ impl GlobalLockArray { /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); /// /// let array1 = array.clone(); - /// let slice = array1.blocking_read_lock().local_data(); + /// let slice = array1.blocking_read_local_data(); /// /// // no borrows to this specific instance (array) so it can enter the "into_atomic" call /// // but array1 will not be dropped until after mut_slice is dropped. diff --git a/src/array/local_lock_atomic.rs b/src/array/local_lock_atomic.rs index 1c3389f2..8dbce07c 100644 --- a/src/array/local_lock_atomic.rs +++ b/src/array/local_lock_atomic.rs @@ -166,7 +166,7 @@ impl<'a, T: Dist> LocalLockLocalData { /// let my_pe = world.my_pe(); /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); /// - /// let local_data = array.read_local_data(); + /// let local_data = array.blocking_read_local_data(); /// let sub_data = local_data.clone().into_sub_data(10,20); // clone() essentially increases the references to the read lock by 1. /// assert_eq!(local_data[10],sub_data[0]); ///``` @@ -653,7 +653,7 @@ impl LocalLockArray { /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); /// /// let array1 = array.clone(); - /// let slice = array1.local_data(); + /// let slice = array1.blocking_read_local_data(); /// /// // no borrows to this specific instance (array) so it can enter the "into_unsafe" call /// // but array1 will not be dropped until after 'slice' is dropped. @@ -701,7 +701,7 @@ impl LocalLockArray { /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); /// /// let array1 = array.clone(); - /// let slice = unsafe {array1.local_data()}; + /// let slice = array1.blocking_read_local_data(); /// /// // no borrows to this specific instance (array) so it can enter the "into_read_only" call /// // but array1 will not be dropped until after mut_slice is dropped. @@ -745,7 +745,7 @@ impl LocalLockArray { /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); /// /// let array1 = array.clone(); - /// let slice = unsafe {array1.local_data()}; + /// let slice = array1.blocking_read_local_data(); /// /// // no borrows to this specific instance (array) so it can enter the "into_global_lock" call /// // but array1 will not be dropped until after mut_slice is dropped. @@ -791,7 +791,7 @@ impl LocalLockArray { /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); /// /// let array1 = array.clone(); - /// let slice = unsafe {array1.local_data()}; + /// let slice = array1.blocking_read_local_data(); /// /// // no borrows to this specific instance (array) so it can enter the "into_atomic" call /// // but array1 will not be dropped until after mut_slice is dropped. 
@@ -1178,7 +1178,7 @@ impl LocalLockReadGuard { /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); /// let array = LocalLockArray::::new(&world,10,Distribution::Block); - /// array.block_on(array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2))); + /// array.block_on(array.dist_iter_mut().enumerate().for_each(move |(i,elem)| *elem = i*2)); /// let read_guard = array.blocking_read_lock(); /// let prod = array.block_on(read_guard.reduce("prod")); ///``` @@ -1213,7 +1213,7 @@ impl LocalLockReadGuard { /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); /// let array = LocalLockArray::::new(&world,10,Distribution::Block); - /// array.block_on(array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2))); + /// array.block_on(array.dist_iter_mut().enumerate().for_each(move |(i,elem)| *elem = i*2)); /// let read_guard = array.blocking_read_lock(); /// let sum = array.block_on(read_guard.sum()); /// ``` @@ -1243,9 +1243,9 @@ impl LocalLockReadGuard { /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); /// let array = LocalLockArray::::new(&world,10,Distribution::Block); - /// array.block_on(array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2))); + /// array.block_on(array.dist_iter_mut().enumerate().for_each(move |(i,elem)| *elem = i+1)); /// let read_guard = array.blocking_read_lock(); - /// let prod = array.block_on(read_guard.prod()); + /// let prod = array.block_on(read_guard.prod()).expect("array len > 0"); /// assert_eq!((1..=array.len()).product::(),prod); ///``` #[must_use = "this function is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] @@ -1275,9 +1275,9 @@ impl LocalLockReadGuard /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); /// let array = LocalLockArray::::new(&world,10,Distribution::Block); - /// array.block_on(array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2))); + /// array.block_on(array.dist_iter_mut().enumerate().for_each(move |(i,elem)| *elem = i*2)); /// let read_guard = array.blocking_read_lock(); - /// let max = array.block_on(read_guard.max()); + /// let max = array.block_on(read_guard.max()).expect("array len > 0"); /// assert_eq!((array.len()-1)*2,max); ///``` #[must_use = "this function is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] @@ -1306,9 +1306,9 @@ impl LocalLockReadGuard /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); /// let array = LocalLockArray::::new(&world,10,Distribution::Block); - /// array.block_on(array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2))); + /// array.block_on(array.dist_iter_mut().enumerate().for_each(move |(i,elem)| *elem = i*2)); /// let read_guard = array.blocking_read_lock(); - /// let min = array.block_on(read_guard.min()); + /// let min = array.block_on(read_guard.min()).expect("array len > 0"); /// assert_eq!(0,min); ///``` #[must_use = "this function is lazy and does nothing unless awaited. 
Either await the returned future, or call 'spawn()' or 'block()' on it "] diff --git a/src/array/local_lock_atomic/local_chunks.rs b/src/array/local_lock_atomic/local_chunks.rs index 47b6ee9c..a8b1739a 100644 --- a/src/array/local_lock_atomic/local_chunks.rs +++ b/src/array/local_lock_atomic/local_chunks.rs @@ -228,9 +228,9 @@ impl LocalLockArray { /// let array: LocalLockArray = LocalLockArray::new(&world,40,Distribution::Block); /// let my_pe = world.my_pe(); /// world.block_on(async move { - /// array.read_local_chunks(5).await.enumerate().for_each(move|(i,chunk)| { + /// let _ = array.read_local_chunks(5).await.enumerate().for_each(move|(i,chunk)| { /// println!("PE: {my_pe} i: {i} chunk: {chunk:?}"); - /// }); + /// }).spawn(); /// array.await_all().await; /// }); /// ``` @@ -259,10 +259,9 @@ impl LocalLockArray { /// let array: LocalLockArray = LocalLockArray::new(&world,40,Distribution::Block); /// let my_pe = world.my_pe(); /// - /// array.blocking_read_local_chunks(5).enumerate().for_each(move|(i,chunk)| { + /// let _ = array.blocking_read_local_chunks(5).enumerate().for_each(move|(i,chunk)| { /// println!("PE: {my_pe} i: {i} chunk: {chunk:?}"); - /// }); - /// array.wait_all(); + /// }).block(); /// /// ``` pub fn blocking_read_local_chunks(&self, chunk_size: usize) -> LocalLockLocalChunks { @@ -299,9 +298,9 @@ impl LocalLockArray { /// let array: LocalLockArray = LocalLockArray::new(&world,40,Distribution::Block); /// let my_pe = world.my_pe(); /// world.block_on(async move { - /// array.write_local_chunks(5).await.enumerate().for_each(move|(i,chunk)| { + /// let _ = array.write_local_chunks(5).await.enumerate().for_each(move|(i,chunk)| { /// println!("PE: {my_pe} i: {i} chunk: {chunk:?}"); - /// }); + /// }).spawn(); /// array.await_all().await; /// }); /// ``` @@ -330,9 +329,9 @@ impl LocalLockArray { /// let array: LocalLockArray = LocalLockArray::new(&world,40,Distribution::Block); /// let my_pe = world.my_pe(); /// - /// array.blocking_write_local_chunks(5).enumerate().for_each(move|(i,chunk)| { + /// let _ = array.blocking_write_local_chunks(5).enumerate().for_each(move|(i,chunk)| { /// println!("PE: {my_pe} i: {i} chunk: {chunk:?}"); - /// }); + /// }).spawn(); /// array.wait_all(); /// /// ``` From 56d2c6830fb6ba76bb105827c6129b6eae65a340 Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Mon, 7 Oct 2024 13:59:50 -0700 Subject: [PATCH 090/116] fix read only array doc tests --- src/array/read_only.rs | 32 ++++++++++++++++------------- src/array/read_only/local_chunks.rs | 4 ++-- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/src/array/read_only.rs b/src/array/read_only.rs index be11bd94..61af4425 100644 --- a/src/array/read_only.rs +++ b/src/array/read_only.rs @@ -456,12 +456,13 @@ impl ReadOnlyArray { /// let num_pes = world.num_pes(); /// let array = AtomicArray::::new(&world,1000000,Distribution::Block); /// let array_clone = array.clone(); - /// let req = array.local_iter().for_each(move |_| { + /// let _ = array.local_iter().for_each(move |_| { /// let index = rand::thread_rng().gen_range(0..array_clone.len()); /// array_clone.add(index,1); //randomly at one to an element in the array. 
- /// }); + /// }).block(); + /// array.wait_all(); /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE - /// let sum = array.block_on(array.reduce("sum")); // equivalent to calling array.sum() + /// let sum = array.block_on(array.reduce("sum")).expect("array len > 0"); // equivalent to calling array.sum() /// assert_eq!(array.len()*num_pes,sum); ///``` #[must_use = "this function is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] @@ -488,12 +489,13 @@ impl ReadOnlyArray { /// let num_pes = world.num_pes(); /// let array = AtomicArray::::new(&world,1000000,Distribution::Block); /// let array_clone = array.clone(); - /// let req = array.local_iter().for_each(move |_| { + /// let _ = array.local_iter().for_each(move |_| { /// let index = rand::thread_rng().gen_range(0..array_clone.len()); /// array_clone.add(index,1); //randomly at one to an element in the array. - /// }); + /// }).block(); + /// array.wait_all(); /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE - /// let sum = array.block_on(array.sum()); + /// let sum = array.block_on(array.sum()).expect("array len > 0"); /// assert_eq!(array.len()*num_pes,sum); /// ``` #[must_use = "this function is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] @@ -517,12 +519,12 @@ impl ReadOnlyArray { /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); /// let array = AtomicArray::::new(&world,10,Distribution::Block); - /// let req = array.dist_iter().enumerate().for_each(move |(i,elem)| { + /// let _ = array.dist_iter().enumerate().for_each(move |(i,elem)| { /// elem.store(i+1); - /// }); + /// }).block(); /// array.wait_all(); - /// array.barrier(); - /// let prod = array.block_on(array.prod()); + /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE + /// let prod = array.block_on(array.prod()).expect("array len > 0"); /// assert_eq!((1..=array.len()).product::(),prod); ///``` #[must_use = "this function is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] @@ -547,9 +549,10 @@ impl ReadOnlyArray { /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); /// let array = AtomicArray::::new(&world,10,Distribution::Block); - /// let req = array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2)); + /// let _ = array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2)).block(); + /// array.wait_all(); /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE - /// let max = array.block_on(array.max()); + /// let max = array.block_on(array.max()).expect("array len > 0"); /// assert_eq!((array.len()-1)*2,max); ///``` #[must_use = "this function is lazy and does nothing unless awaited. 
Either await the returned future, or call 'spawn()' or 'block()' on it "] @@ -573,9 +576,10 @@ impl ReadOnlyArray { /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); /// let array = AtomicArray::::new(&world,10,Distribution::Block); - /// let req = array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2)); + /// let _ = array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2)).block(); + /// array.wait_all(); /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE - /// let min = array.block_on(array.min()); + /// let min = array.block_on(array.min()).expect("array len > 0"); /// assert_eq!(0,min); ///``` #[must_use = "this function is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] diff --git a/src/array/read_only/local_chunks.rs b/src/array/read_only/local_chunks.rs index 3f3b098a..e8f34bf3 100644 --- a/src/array/read_only/local_chunks.rs +++ b/src/array/read_only/local_chunks.rs @@ -100,9 +100,9 @@ impl ReadOnlyArray { /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,40,Distribution::Block); /// let my_pe = world.my_pe(); /// - /// array.local_chunks(5).enumerate().for_each(move|(i,chunk)| { + /// let _ = array.local_chunks(5).enumerate().for_each(move|(i,chunk)| { /// println!("PE: {my_pe} i: {i} chunk: {chunk:?}"); - /// }); + /// }).spawn(); /// array.wait_all(); /// /// ``` From 1907d4a0f444342ae9a120a75083027e7e71747c Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Mon, 7 Oct 2024 14:23:02 -0700 Subject: [PATCH 091/116] fix arithmetic ops doc tests --- src/array.rs | 8 ++++---- src/array/operations/arithmetic.rs | 10 +++++----- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/array.rs b/src/array.rs index a7c25a46..6de961b2 100644 --- a/src/array.rs +++ b/src/array.rs @@ -191,11 +191,11 @@ pub struct ReduceKey { crate::inventory::collect!(ReduceKey); // impl Dist for bool {} -lamellar_impl::generate_reductions_for_type_rt!(true, u8, usize); -lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, usize); +lamellar_impl::generate_reductions_for_type_rt!(true, u8, usize, isize); +lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, usize, isize); -lamellar_impl::generate_reductions_for_type_rt!(false, f32); -lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32); +// lamellar_impl::generate_reductions_for_type_rt!(false, f32); +// lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32); // lamellar_impl::generate_reductions_for_type_rt!(false, u128); // lamellar_impl::generate_ops_for_type_rt!(true, false, true, u128); diff --git a/src/array/operations/arithmetic.rs b/src/array/operations/arithmetic.rs index 1db86f5a..635e36ab 100644 --- a/src/array/operations/arithmetic.rs +++ b/src/array/operations/arithmetic.rs @@ -887,7 +887,7 @@ pub trait ArithmeticOps: private::LamellarArrayP /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); -/// let array = UnsafeArray::::new(&world,100,Distribution::Block); +/// let array = UnsafeArray::::new(&world,100,Distribution::Block); /// /// let vals = vec![3,54,12,88,29,68]; /// let index = 10; @@ -1075,7 +1075,7 @@ pub trait UnsafeArithmeticOps: /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = 
UnsafeArray::::new(&world,100,Distribution::Block); /// /// let idx = 53; /// let val = 10; @@ -1111,7 +1111,7 @@ pub trait UnsafeArithmeticOps: /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = unsafe{ array.batch_sub(indices,10) }; @@ -1149,7 +1149,7 @@ pub trait UnsafeArithmeticOps: /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); /// /// let idx = 53; /// let val = 10; @@ -1188,7 +1188,7 @@ pub trait UnsafeArithmeticOps: /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = unsafe{ array.batch_fetch_sub(indices,10) }; From f6693c8c997a9c5fce164efeffe217ddf8f0000c Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Mon, 7 Oct 2024 21:19:00 -0700 Subject: [PATCH 092/116] fix shift ops doc tests --- src/array.rs | 4 ++-- src/array/operations/shift.rs | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/array.rs b/src/array.rs index 6de961b2..2c3cfe34 100644 --- a/src/array.rs +++ b/src/array.rs @@ -194,8 +194,8 @@ crate::inventory::collect!(ReduceKey); lamellar_impl::generate_reductions_for_type_rt!(true, u8, usize, isize); lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, usize, isize); -// lamellar_impl::generate_reductions_for_type_rt!(false, f32); -// lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32); +lamellar_impl::generate_reductions_for_type_rt!(false, f32); +lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32); // lamellar_impl::generate_reductions_for_type_rt!(false, u128); // lamellar_impl::generate_ops_for_type_rt!(true, false, true, u128); diff --git a/src/array/operations/shift.rs b/src/array/operations/shift.rs index a477cb6c..81a083e1 100644 --- a/src/array/operations/shift.rs +++ b/src/array/operations/shift.rs @@ -67,7 +67,7 @@ pub trait ShiftOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,4,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block); /// /// let idx = 53; /// let val = 2; @@ -216,7 +216,7 @@ pub trait ShiftOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,4,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block); /// /// let idx = 53; /// let val = 2; @@ -367,7 +367,7 @@ pub trait UnsafeShiftOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,4,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); /// /// let idx = 53; /// let val = 2; @@ -516,7 +516,7 @@ pub trait UnsafeShiftOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// 
let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,4,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block); /// /// let idx = 53; /// let val = 2; From 369ed034105566f5cb7768ffde63599a713867cf Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Tue, 8 Oct 2024 09:58:31 -0700 Subject: [PATCH 093/116] fix array doc tests --- src/array.rs | 54 +++++++++++++++++--------------- src/array/unsafe.rs | 25 +++++++++------ src/array/unsafe/iteration.rs | 2 +- src/array/unsafe/local_chunks.rs | 12 ++++--- src/array/unsafe/rdma.rs | 10 +++--- 5 files changed, 56 insertions(+), 47 deletions(-) diff --git a/src/array.rs b/src/array.rs index 2c3cfe34..e5bf53c3 100644 --- a/src/array.rs +++ b/src/array.rs @@ -1392,7 +1392,7 @@ pub trait LamellarArrayGet: LamellarArrayInternalGet { /// let my_pe = world.my_pe(); /// let array = LocalLockArray::::new(&world,12,Distribution::Block); /// let buf = world.alloc_one_sided_mem_region::(12); - /// array.dist_iter_mut().enumerate().for_each(|(i,elem)| *elem = i); //we will used this val as completion detection + /// let _ = array.dist_iter_mut().enumerate().for_each(|(i,elem)| *elem = i).spawn(); //we will used this val as completion detection /// unsafe { // we just created buf and have not shared it so free to mutate safely /// for elem in buf.as_mut_slice() /// .expect("we just created it so we know its local") { //initialize mem_region @@ -1454,10 +1454,9 @@ pub trait LamellarArrayGet: LamellarArrayInternalGet { /// let my_pe = world.my_pe(); /// let num_pes = world.num_pes(); /// let array = LocalLockArray::::new(&world,12,Distribution::Block); - /// array.dist_iter_mut().enumerate().for_each(move |(i,elem)| *elem = my_pe); //we will used this val as completion detection - /// array.wait_all(); + /// let _ = array.dist_iter_mut().enumerate().for_each(move |(i,elem)| *elem = my_pe).block(); //we will used this val as completion detection /// array.barrier(); - /// println!("PE{my_pe} array data: {:?}",array.read_local_data()); + /// println!("PE{my_pe} array data: {:?}",array.blocking_read_local_data()); /// let index = ((my_pe+1)%num_pes) * array.num_elems_local(); // get first index on PE to the right (with wrap arround) /// let at_req = array.at(index); /// let val = array.block_on(at_req); @@ -1533,7 +1532,7 @@ pub trait LamellarArrayPut: LamellarArrayInternalPut { /// let array = LocalLockArray::::new(&world,12,Distribution::Block); /// let buf = world.alloc_one_sided_mem_region::(12); /// let len = buf.len(); - /// array.dist_iter_mut().for_each(move |elem| *elem = len); //we will used this val as completion detection + /// let _ = array.dist_iter_mut().for_each(move |elem| *elem = len).spawn(); //we will used this val as completion detection /// /// //Safe as we are this is the only reference to buf /// unsafe { @@ -1546,14 +1545,14 @@ pub trait LamellarArrayPut: LamellarArrayInternalPut { /// } /// array.wait_all(); /// array.barrier(); - /// println!("PE{my_pe} array data: {:?}",array.local_data()); + /// println!("PE{my_pe} array data: {:?}",array.blocking_read_local_data()); /// if my_pe == 0 { //only perfrom the transfer from one PE /// array.block_on( unsafe { array.put(0,&buf) } ); /// println!(); /// } /// array.barrier(); //block other PEs until PE0 has finised "putting" the data /// - /// println!("PE{my_pe} array data: {:?}",array.local_data()); + /// println!("PE{my_pe} array data: {:?}",array.blocking_read_local_data()); /// /// ///``` @@ -1603,10 +1602,13 @@ 
pub trait ArrayPrint: LamellarArray { /// let block_array = AtomicArray::::new(&world,100,Distribution::Block); /// let cyclic_array = AtomicArray::::new(&world,100,Distribution::Block); /// - /// block_array.dist_iter().zip(cyclic_array.dist_iter()).enumerate().for_each(move |i,(a,b)| { - /// a.store(i); - /// b.store(i); - /// }); + /// let _ = block_array.dist_iter_mut().enumerate().for_each(move |(i,elem)| { + /// elem.store(i); + /// }).spawn(); + /// let _ =cyclic_array.dist_iter_mut().enumerate().for_each(move |(i,elem)| { + /// elem.store(i); + /// }).spawn(); + /// world.wait_all(); /// block_array.print(); /// println!(); /// cyclic_array.print(); @@ -1659,11 +1661,11 @@ pub trait ArrayPrint: LamellarArray { /// use rand::Rng; /// /// let array_clone = array.clone(); -/// array.local_iter().for_each(move |_| { +/// let _ = array.local_iter().for_each(move |_| { /// let index = rand::thread_rng().gen_range(0..array_clone.len()); /// array_clone.add(index,1); //randomly at one to an element in the array. -/// }); -/// let sum = array.block_on(array.sum()); // atomic updates still possibly happening, output non deterministic +/// }).block(); +/// let sum = array.block_on(array.sum()).expect("array len > 0"); // atomic updates still possibly happening, output non deterministic /// println!("sum {sum}"); ///``` /// Waiting for local operations to finish not enough by itself @@ -1679,7 +1681,7 @@ pub trait ArrayPrint: LamellarArray { /// }); /// array.block_on(req);// this is not sufficient, we also need to "wait_all" as each "add" call is another request /// array.wait_all(); -/// let sum = array.block_on(array.sum()); // atomic updates still possibly happening (on remote nodes), output non deterministic +/// let sum = array.block_on(array.sum()).expect("array len > 0"); // atomic updates still possibly happening (on remote nodes), output non deterministic /// println!("sum {sum}"); ///``` /// Need to add a barrier after local operations on all PEs have finished @@ -1697,7 +1699,7 @@ pub trait ArrayPrint: LamellarArray { /// array.block_on(req);// this is not sufficient, we also need to "wait_all" as each "add" call is another request /// array.wait_all(); /// array.barrier(); -/// let sum = array.block_on(array.sum()); // No updates occuring anywhere anymore so we have a deterministic result +/// let sum = array.block_on(array.sum()).expect("array len > 0"); // No updates occuring anywhere anymore so we have a deterministic result /// assert_eq!(array.len()*num_pes,sum); ///``` /// Alternatively we can convert our AtomicArray into a ReadOnlyArray before the reduction @@ -1708,12 +1710,12 @@ pub trait ArrayPrint: LamellarArray { /// let num_pes = world.num_pes(); /// let array = AtomicArray::::new(&world,1000000,Distribution::Block); /// let array_clone = array.clone(); -/// let req = array.local_iter().for_each(move |_| { +/// let _ = array.local_iter().for_each(move |_| { /// let index = rand::thread_rng().gen_range(0..array_clone.len()); /// array_clone.add(index,1); //randomly at one to an element in the array. 
-/// }); +/// }).block(); /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE -/// let sum = array.block_on(array.sum()); // No updates occuring anywhere anymore so we have a deterministic result +/// let sum = array.block_on(array.sum()).expect("array len > 0"); // No updates occuring anywhere anymore so we have a deterministic result /// assert_eq!(array.len()*num_pes,sum); ///``` /// Finally we are inlcuding a `Arc>` highlightin the same issue @@ -1772,12 +1774,12 @@ where /// let num_pes = world.num_pes(); /// let array = AtomicArray::::new(&world,1000000,Distribution::Block); /// let array_clone = array.clone(); - /// let req = array.local_iter().for_each(move |_| { + /// let _ = array.local_iter().for_each(move |_| { /// let index = rand::thread_rng().gen_range(0..array_clone.len()); /// array_clone.add(index,1); //randomly at one to an element in the array. - /// }); + /// }).block(); /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE - /// let sum = array.block_on(array.reduce("sum")); // equivalent to calling array.sum() + /// let sum = array.block_on(array.reduce("sum")).expect("array len > 0"); // equivalent to calling array.sum() /// assert_eq!(array.len()*num_pes,sum); ///``` fn reduce(&self, reduction: &str) -> Self::Handle; @@ -1915,18 +1917,18 @@ where /// register_reduction!( /// my_sum, // the name of our new reduction /// |acc,elem| acc+elem , //the reduction closure -/// usize, // will be implementd for usize,f32, and i32 +/// usize, // will be implementd for usize,f32, and u8 /// f32, -/// i32, +/// u8, /// ); /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); /// let array = AtomicArray::::new(&world,1000000,Distribution::Block); /// let array_clone = array.clone(); -/// let req = array.local_iter().for_each(move |_| { +/// let _ = array.local_iter().for_each(move |_| { /// let index = rand::thread_rng().gen_range(0..array_clone.len()); /// array_clone.add(index,1); //randomly at one to an element in the array. 
-/// }); +/// }).block(); /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE /// let sum = array.block_on(array.sum()); /// let my_sum = array.block_on(array.reduce("my_sum")); //pass a &str containing the reduction to use diff --git a/src/array/unsafe.rs b/src/array/unsafe.rs index e9129c70..26e2217c 100644 --- a/src/array/unsafe.rs +++ b/src/array/unsafe.rs @@ -1280,10 +1280,15 @@ impl UnsafeArray { /// let block_array = UnsafeArray::::new(&world,100,Distribution::Block); /// let cyclic_array = UnsafeArray::::new(&world,100,Distribution::Block); /// - /// block_array.dist_iter().zip(cyclic_array.dist_iter()).enumerate().for_each(move |i,(a,b)| { - /// a.store(i); - /// b.store(i); - /// }); + /// unsafe{ + /// let _ =block_array.dist_iter_mut().enumerate().for_each(move |(i,elem)| { + /// *elem = i; + /// }).spawn(); + /// let _ = cyclic_array.dist_iter_mut().enumerate().for_each(move |(i,elem)| { + /// *elem = i; + /// }).spawn(); + /// } + /// world.wait_all(); /// block_array.print(); /// println!(); /// cyclic_array.print(); @@ -1441,12 +1446,12 @@ impl UnsafeArray { /// unsafe { /// let req = array.dist_iter_mut().enumerate().for_each(move |(i,elem)| { /// *elem = i+1; - /// }); + /// }).spawn(); /// } /// array.print(); /// array.wait_all(); /// array.print(); - /// let prod = unsafe{ array.block_on(array.prod())}; + /// let prod = unsafe{ array.block_on(array.prod()).expect("array len > 0")}; /// assert_eq!((1..=array.len()).product::(),prod); ///``` #[must_use = "this function is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] @@ -1475,11 +1480,11 @@ impl UnsafeArray { /// let num_pes = world.num_pes(); /// let array = UnsafeArray::::new(&world,10,Distribution::Block); /// let array_clone = array.clone(); - /// unsafe{array.dist_iter_mut().enumerate().for_each(|(i,elem)| *elem = i*2)}; //safe as we are accessing in a data parallel fashion + /// let _ = unsafe{array.dist_iter_mut().enumerate().for_each(|(i,elem)| *elem = i*2).spawn()}; //safe as we are accessing in a data parallel fashion /// array.wait_all(); /// array.barrier(); /// let max_req = unsafe{array.max()}; //Safe in this instance as we have ensured no updates are currently happening - /// let max = array.block_on(max_req); + /// let max = array.block_on(max_req).expect("array len > 0"); /// assert_eq!((array.len()-1)*2,max); ///``` #[must_use = "this function is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] @@ -1508,11 +1513,11 @@ impl UnsafeArray { /// let num_pes = world.num_pes(); /// let array = UnsafeArray::::new(&world,10,Distribution::Block); /// let array_clone = array.clone(); - /// unsafe{array.dist_iter_mut().enumerate().for_each(|(i,elem)| *elem = i*2)}; //safe as we are accessing in a data parallel fashion + /// let _ = unsafe{array.dist_iter_mut().enumerate().for_each(|(i,elem)| *elem = i*2).spawn()}; //safe as we are accessing in a data parallel fashion /// array.wait_all(); /// array.barrier(); /// let min_req = unsafe{array.min()}; //Safe in this instance as we have ensured no updates are currently happening - /// let min = array.block_on(min_req); + /// let min = array.block_on(min_req).expect("array len > 0"); /// assert_eq!(0,min); ///``` #[must_use = "this function is lazy and does nothing unless awaited. 
Either await the returned future, or call 'spawn()' or 'block()' on it "] diff --git a/src/array/unsafe/iteration.rs b/src/array/unsafe/iteration.rs index 42dffa05..05011453 100644 --- a/src/array/unsafe/iteration.rs +++ b/src/array/unsafe/iteration.rs @@ -140,7 +140,7 @@ impl UnsafeArray { /// /// unsafe { /// if my_pe == 0 { - /// for elem in onesided_iter!($array,array).into_iter() { //"into_iter()" converts into a standard Rust Iterator + /// for elem in array.onesided_iter().into_iter() { //"into_iter()" converts into a standard Rust Iterator /// println!("PE{my_pe} elem {elem}"); /// } /// } diff --git a/src/array/unsafe/local_chunks.rs b/src/array/unsafe/local_chunks.rs index 8bbbf6fe..bcb4b8d4 100644 --- a/src/array/unsafe/local_chunks.rs +++ b/src/array/unsafe/local_chunks.rs @@ -184,9 +184,9 @@ impl UnsafeArray { /// let array: UnsafeArray = UnsafeArray::new(&world,40,Distribution::Block); /// let my_pe = world.my_pe(); /// - /// array.local_chunks(5).enumerate().for_each(move|(i,chunk)| { + /// let _ = unsafe{array.local_chunks(5).enumerate().for_each(move|(i,chunk)| { /// println!("PE: {my_pe} i: {i} chunk: {chunk:?}"); - /// }); + /// })}.spawn(); /// array.wait_all(); /// /// ``` @@ -210,9 +210,11 @@ impl UnsafeArray { /// let array: UnsafeArray = UnsafeArray::new(&world,40,Distribution::Block); /// let my_pe = world.my_pe(); /// - /// array.local_chunks_mut(5).await.enumerate().for_each(move|(i,chunk)| { - /// println!("PE: {my_pe} i: {i} chunk: {chunk:?}"); - /// }); + /// unsafe{ + /// let _ = array.local_chunks_mut(5).enumerate().for_each(move|(i,chunk)| { + /// println!("PE: {my_pe} i: {i} chunk: {chunk:?}"); + /// }).spawn(); + /// } /// array.wait_all(); /// /// ``` diff --git a/src/array/unsafe/rdma.rs b/src/array/unsafe/rdma.rs index 8279d46c..0baf9825 100644 --- a/src/array/unsafe/rdma.rs +++ b/src/array/unsafe/rdma.rs @@ -375,7 +375,7 @@ impl UnsafeArray { /// let buf = world.alloc_one_sided_mem_region::(12); /// let buf_len = buf.len(); /// unsafe { - /// array.dist_iter_mut().for_each(move |elem| *elem = buf_len); //we will used this val as completion detection + /// let _ = array.dist_iter_mut().for_each(move |elem| *elem = buf_len).spawn(); //we will used this val as completion detection /// for (i,elem) in buf.as_mut_slice() /// .expect("we just created it so we know its local") /// .iter_mut() @@ -454,7 +454,7 @@ impl UnsafeArray { /// let array = UnsafeArray::::new(&world,12,Distribution::Block); /// let buf = world.alloc_one_sided_mem_region::(12); /// unsafe { - /// array.dist_iter_mut().enumerate().for_each(|(i,elem)| *elem = i); + /// let _ = array.dist_iter_mut().enumerate().for_each(|(i,elem)| *elem = i).spawn(); /// for elem in buf.as_mut_slice() /// .expect("we just created it so we know its local") { //initialize mem_region /// *elem = buf.len(); //we will used this val as completion detection @@ -530,7 +530,7 @@ impl UnsafeArray { /// let array = UnsafeArray::::new(&world,12,Distribution::Block); /// let buf = world.alloc_one_sided_mem_region::(12); /// unsafe { - /// array.dist_iter_mut().enumerate().for_each(|(i,elem)| *elem = i); //we will used this val as completion detection + /// let _ =array.dist_iter_mut().enumerate().for_each(|(i,elem)| *elem = i).spawn(); //we will used this val as completion detection /// for elem in buf.as_mut_slice() /// .expect("we just created it so we know its local") { //initialize mem_region /// *elem = buf.len(); @@ -596,7 +596,7 @@ impl UnsafeArray { /// let array = 
UnsafeArray::::new(&world,12,Distribution::Block); /// let buf = world.alloc_one_sided_mem_region::(12); /// unsafe { - /// array.dist_iter_mut().enumerate().for_each(|(i,elem)| *elem = i); //we will used this val as completion detection + /// let _ = array.dist_iter_mut().enumerate().for_each(|(i,elem)| *elem = i).spawn(); //we will used this val as completion detection /// for elem in buf.as_mut_slice() /// .expect("we just created it so we know its local") { //initialize mem_region /// *elem = buf.len(); @@ -669,7 +669,7 @@ impl UnsafeArray { /// let num_pes = world.num_pes(); /// let array = UnsafeArray::::new(&world,12,Distribution::Block); /// unsafe { - /// array.dist_iter_mut().enumerate().for_each(move|(i,elem)| *elem = my_pe); //we will used this val as completion detection + /// let _ = array.dist_iter_mut().enumerate().for_each(move|(i,elem)| *elem = my_pe).spawn(); //we will used this val as completion detection /// array.wait_all(); /// array.barrier(); /// println!("PE{my_pe} array data: {:?}",unsafe{array.local_data()}); From aeccaa4284f14967679aa5f6d4ad329f08f63581 Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Tue, 8 Oct 2024 15:22:50 -0700 Subject: [PATCH 094/116] fix lamellar doc tests --- src/active_messaging.rs | 4 ++-- src/darc.rs | 4 ++-- src/darc/global_rw_darc.rs | 8 ++++---- src/darc/local_rw_darc.rs | 8 ++++---- src/lamellar_task_group.rs | 4 ++-- src/lib.rs | 11 +++++------ src/memregion/one_sided.rs | 10 +++++----- 7 files changed, 24 insertions(+), 25 deletions(-) diff --git a/src/active_messaging.rs b/src/active_messaging.rs index f46c0396..81a751d1 100644 --- a/src/active_messaging.rs +++ b/src/active_messaging.rs @@ -1122,7 +1122,7 @@ pub trait ActiveMessaging { /// # } /// # /// # let world = lamellar::LamellarWorldBuilder::new().build(); - /// world.exec_am_all(MyAm{val: world.my_pe()}); + /// let _ = world.exec_am_all(MyAm{val: world.my_pe()}).spawn(); /// world.wait_all(); //block until the previous am has finished ///``` fn wait_all(&self); @@ -1158,7 +1158,7 @@ pub trait ActiveMessaging { /// # let world = lamellar::LamellarWorldBuilder::new().build(); /// let world_clone = world.clone(); /// world.block_on(async move { - /// world_clone.exec_am_all(MyAm{val: world_clone.my_pe()}); + /// let _ = world_clone.exec_am_all(MyAm{val: world_clone.my_pe()}).spawn(); /// world_clone.await_all().await; //block until the previous am has finished /// }); ///``` diff --git a/src/darc.rs b/src/darc.rs index 6312e73c..740049f9 100644 --- a/src/darc.rs +++ b/src/darc.rs @@ -37,7 +37,7 @@ //! let my_pe = world.my_pe(); //! let num_pes = world.num_pes(); //! let darc_counter = Darc::new(&world, AtomicUsize::new(0)).unwrap(); -//! world.exec_am_all(DarcAm {counter: darc_counter.clone()}); +//! let _ = world.exec_am_all(DarcAm {counter: darc_counter.clone()}).spawn(); //! darc_counter.fetch_add(my_pe, Ordering::SeqCst); //! world.wait_all(); // wait for my active message to return //! 
world.barrier(); //at this point all updates will have been performed @@ -184,7 +184,7 @@ unsafe impl Sync for DarcInner {} //we cant create DarcInners without goin /// let my_pe = world.my_pe(); /// let num_pes = world.num_pes(); /// let darc_counter = Darc::new(&world, AtomicUsize::new(0)).unwrap(); -/// world.exec_am_all(DarcAm {counter: darc_counter.clone()}); +/// let _ = world.exec_am_all(DarcAm {counter: darc_counter.clone()}).spawn(); /// darc_counter.fetch_add(my_pe, Ordering::SeqCst); /// world.wait_all(); // wait for my active message to return /// world.barrier(); //at this point all updates will have been performed diff --git a/src/darc/global_rw_darc.rs b/src/darc/global_rw_darc.rs index 15065a83..3f64dacb 100644 --- a/src/darc/global_rw_darc.rs +++ b/src/darc/global_rw_darc.rs @@ -557,7 +557,7 @@ impl GlobalRwDarc { /// /// world.clone().block_on(async move { /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); - /// world.exec_am_all(DarcAm {counter: counter.clone()}); + /// let _ = world.exec_am_all(DarcAm {counter: counter.clone()}).spawn(); /// let guard = counter.read().await; /// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); /// drop(guard); //release the @@ -630,7 +630,7 @@ impl GlobalRwDarc { /// /// world.clone().block_on(async move { /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); - /// world.exec_am_all(DarcAm {counter: counter.clone()}); + /// let _ = world.exec_am_all(DarcAm {counter: counter.clone()}).spawn(); /// let mut guard = counter.write().await; /// *guard += my_pe; /// drop(guard); //release the @@ -700,7 +700,7 @@ impl GlobalRwDarc { /// /// world.clone().block_on(async move { /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); - /// world.exec_am_all(DarcAm {counter: counter.clone()}); + /// let _ = world.exec_am_all(DarcAm {counter: counter.clone()}).spawn(); /// let mut guard = counter.collective_write().await; /// *guard += my_pe; /// drop(guard); //release the lock @@ -898,7 +898,7 @@ impl GlobalRwDarc { /// let my_pe = world.my_pe(); /// /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); - /// world.exec_am_all(DarcAm {counter: counter.clone()}); + /// let _ = world.exec_am_all(DarcAm {counter: counter.clone()}).spawn(); /// let mut guard = counter.blocking_collective_write(); /// *guard += my_pe; /// drop(guard); //release the lock diff --git a/src/darc/local_rw_darc.rs b/src/darc/local_rw_darc.rs index 8892987e..78ca4da1 100644 --- a/src/darc/local_rw_darc.rs +++ b/src/darc/local_rw_darc.rs @@ -162,7 +162,7 @@ impl LocalRwDarc { /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// let counter = LocalRwDarc::new(&world, 0).unwrap(); - /// world.exec_am_all(DarcAm {counter: counter.clone()}); + /// let _ = world.exec_am_all(DarcAm {counter: counter.clone()}).spawn(); /// let guard = counter.blocking_read(); /// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); ///``` @@ -220,7 +220,7 @@ impl LocalRwDarc { /// let my_pe = world.my_pe(); /// world.clone().block_on(async move { /// let counter = LocalRwDarc::new(&world, 0).unwrap(); - /// world.exec_am_all(DarcAm {counter: counter.clone()}); + /// let _ = world.exec_am_all(DarcAm {counter: counter.clone()}).spawn(); /// let guard = counter.read().await; /// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); /// }); @@ -268,7 +268,7 @@ impl LocalRwDarc { /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// let counter = 
LocalRwDarc::new(&world, 0).unwrap(); - /// world.exec_am_all(DarcAm {counter: counter.clone()}); + /// let _ = world.exec_am_all(DarcAm {counter: counter.clone()}).spawn(); /// let mut guard = counter.blocking_write(); /// *guard += my_pe; ///``` @@ -328,7 +328,7 @@ impl LocalRwDarc { /// let my_pe = world.my_pe(); /// world.clone().block_on(async move{ /// let counter = LocalRwDarc::new(&world, 0).unwrap(); - /// world.exec_am_all(DarcAm {counter: counter.clone()}); + /// let _ = world.exec_am_all(DarcAm {counter: counter.clone()}).spawn(); /// let mut guard = counter.write().await; /// *guard += my_pe; /// }) diff --git a/src/lamellar_task_group.rs b/src/lamellar_task_group.rs index d3487f33..1d6a8e1e 100644 --- a/src/lamellar_task_group.rs +++ b/src/lamellar_task_group.rs @@ -553,9 +553,9 @@ impl Future for TaskGroupLocalAmHandle { /// }; /// let task_group_1 = LamellarTaskGroup::new(&world); //associate the task group with the world /// let task_group_2 = LamellarTaskGroup::new(&even_pes); //we can also associate the task group with a team/sub_team -/// task_group_1.exec_am_all(MyAm{world_pe,team_pe}); +/// let _ = task_group_1.exec_am_all(MyAm{world_pe,team_pe}).spawn(); /// for pe in 0..even_pes.num_pes(){ -/// task_group_2.exec_am_pe(pe,MyAm{world_pe,team_pe}); +/// let _ = task_group_2.exec_am_pe(pe,MyAm{world_pe,team_pe}).spawn(); /// } /// task_group_1.wait_all(); //only need to wait for active messages launched with task_group_1 to finish /// //do interesting work diff --git a/src/lib.rs b/src/lib.rs index a5c35b71..c50eb5a3 100755 --- a/src/lib.rs +++ b/src/lib.rs @@ -117,7 +117,7 @@ //! let num_pes = world.num_pes(); //! let am = HelloWorld { my_pe: my_pe }; //! for pe in 0..num_pes{ -//! world.exec_am_pe(pe,am.clone()); // explicitly launch on each PE +//! let _ = world.exec_am_pe(pe,am.clone()).spawn(); // explicitly launch on each PE //! } //! world.wait_all(); // wait for all active messages to finish //! world.barrier(); // synchronize with other PEs @@ -135,11 +135,10 @@ //! let world = lamellar::LamellarWorldBuilder::new().build(); //! let my_pe = world.my_pe(); //! let block_array = AtomicArray::::new(&world, 1000, Distribution::Block); //we also support Cyclic distribution. -//! block_array.dist_iter_mut().enumerate().for_each(move |(i,elem)| elem.store(i)); //simultaneosuly initialize array accross all PEs, each pe only updates its local data -//! block_array.wait_all(); +//! let _ =block_array.dist_iter_mut().enumerate().for_each(move |(i,elem)| elem.store(i)).block(); //simultaneosuly initialize array accross all PEs, each pe only updates its local data //! block_array.barrier(); //! if my_pe == 0{ -//! for (i,elem) in block_onesided_iter!($array,array).into_iter().enumerate(){ //iterate through entire array on pe 0 (automatically transfering remote data) +//! for (i,elem) in block_array.onesided_iter().into_iter().enumerate(){ //iterate through entire array on pe 0 (automatically transfering remote data) //! println!("i: {} = {})",i,elem); //! } //! } @@ -171,9 +170,9 @@ //! let num_pes = world.num_pes(); //! let cnt = Darc::new(&world, AtomicUsize::new(0)).expect("Current PE is in world team"); //! for pe in 0..num_pes{ -//! world.exec_am_pe(pe,DarcAm{cnt: cnt.clone()}); // explicitly launch on each PE +//! let _ = world.exec_am_pe(pe,DarcAm{cnt: cnt.clone()}).spawn(); // explicitly launch on each PE //! } -//! world.exec_am_all(DarcAm{cnt: cnt.clone()}); //also possible to execute on every PE with a single call +//! 
let _ = world.exec_am_all(DarcAm{cnt: cnt.clone()}).spawn(); //also possible to execute on every PE with a single call //! cnt.fetch_add(1,Ordering::SeqCst); //this is valid as well! //! world.wait_all(); // wait for all active messages to finish //! world.barrier(); // synchronize with other PEs diff --git a/src/memregion/one_sided.rs b/src/memregion/one_sided.rs index 1c4abdee..a669be87 100644 --- a/src/memregion/one_sided.rs +++ b/src/memregion/one_sided.rs @@ -443,7 +443,7 @@ impl OneSidedMemoryRegion { /// let mem_region: OneSidedMemoryRegion = world.alloc_one_sided_mem_region(num_pes*10); /// unsafe{ for elem in mem_region.as_mut_slice().expect("PE just created the memregion"){*elem = num_pes};} /// - /// world.exec_am_all(MemRegionAm{mem_region: mem_region.clone()}); + /// let _ = world.exec_am_all(MemRegionAm{mem_region: mem_region.clone()}).spawn(); /// /// unsafe { /// for (i,elem) in mem_region.iter().enumerate(){ @@ -503,7 +503,7 @@ impl OneSidedMemoryRegion { /// let mem_region: OneSidedMemoryRegion = world.alloc_one_sided_mem_region(num_pes*10); /// unsafe{ for elem in mem_region.as_mut_slice().expect("PE just created the memregion "){*elem = num_pes};} /// - /// world.exec_am_all(MemRegionAm{mem_region: mem_region.clone()}); + /// let _ = world.exec_am_all(MemRegionAm{mem_region: mem_region.clone()}).spawn(); /// /// unsafe { /// for (i,elem) in mem_region.iter().enumerate(){ @@ -571,7 +571,7 @@ impl OneSidedMemoryRegion { /// let mem_region: OneSidedMemoryRegion = world.alloc_one_sided_mem_region(num_pes*10); /// unsafe{ for elem in mem_region.as_mut_slice().expect("PE just created the memregion"){*elem = num_pes};} /// - /// world.exec_am_all(MemRegionAm{mem_region: mem_region.clone()}); + /// let _ = world.exec_am_all(MemRegionAm{mem_region: mem_region.clone()}).block(); ///``` pub unsafe fn get_unchecked>>(&self, index: usize, data: U) { MemoryRegionRDMA::::get_unchecked(self, self.pe, index, data); @@ -627,7 +627,7 @@ impl OneSidedMemoryRegion { /// let mem_region: OneSidedMemoryRegion = world.alloc_one_sided_mem_region(num_pes*10); /// unsafe{ for elem in mem_region.as_mut_slice().expect("PE just created the memregion"){*elem = num_pes};} /// - /// world.exec_am_all(MemRegionAm{mem_region: mem_region.clone()}); + /// let _ = world.exec_am_all(MemRegionAm{mem_region: mem_region.clone()}).block(); ///``` pub unsafe fn blocking_get>>(&self, index: usize, data: U) { MemoryRegionRDMA::::blocking_get(self, self.pe, index, data); @@ -689,7 +689,7 @@ impl OneSidedMemoryRegion { /// let mem_region: OneSidedMemoryRegion = world.alloc_one_sided_mem_region(num_pes*10); /// unsafe{ for elem in mem_region.as_mut_slice().expect("PE just created the memregion"){*elem = num_pes};} /// - /// world.exec_am_all(MemRegionAm{mem_region: mem_region.clone()}); + /// let _ = world.exec_am_all(MemRegionAm{mem_region: mem_region.clone()}).block(); ///``` pub fn data_local(&self) -> bool { if self.pe == self.mr.inner.my_id.1 { From fd2e6c3e2cdd9a400826e58bd22f2446f027c21b Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Fri, 11 Oct 2024 14:10:17 -0700 Subject: [PATCH 095/116] remove explict "blocking" apis for acquring Darc Locks, in favor of Handles --- src/array/global_lock_atomic.rs | 497 ++++++++++---------- src/array/local_lock_atomic.rs | 447 +++++++++--------- src/array/local_lock_atomic/iteration.rs | 9 +- src/array/local_lock_atomic/local_chunks.rs | 28 +- src/darc.rs | 2 + src/darc/global_rw_darc.rs | 379 +++------------ src/darc/local_rw_darc.rs | 248 ++++------ src/lamellae/rofi/rofi_comm.rs | 6 +- src/lamellae/shmem/shmem_comm.rs | 8 +- 9 files changed, 667 insertions(+), 957 deletions(-) diff --git a/src/array/global_lock_atomic.rs b/src/array/global_lock_atomic.rs index 51800bb3..705941ed 100644 --- a/src/array/global_lock_atomic.rs +++ b/src/array/global_lock_atomic.rs @@ -1,3 +1,8 @@ +mod handle; +use handle::{ + GlobalLockCollectiveMutLocalDataHandle, GlobalLockLocalDataHandle, + GlobalLockMutLocalDataHandle, GlobalLockReadHandle, GlobalLockWriteHandle, +}; mod iteration; pub(crate) mod operations; mod rdma; @@ -200,7 +205,7 @@ impl GlobalLockLocalData { /// let my_pe = world.my_pe(); /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); /// - /// let local_data = array.blocking_read_local_data(); + /// let local_data = array.read_local_data.block(); /// let sub_data = local_data.clone().into_sub_data(10,20); // clone() essentially increases the references to the read lock by 1. /// assert_eq!(local_data[10],sub_data[0]); ///``` @@ -353,47 +358,47 @@ impl GlobalLockArray { } } - #[doc(alias("One-sided", "onesided"))] - /// Return a global read lock guard on the calling PE - /// - /// this function will block the thread until the lock is acquired - /// Calling within an asynchronous block may lead to deadlock, use [read_lock](self::GlobalLockArray::read_lock) instead. - /// - /// # One-sided Operation - /// Only explictly requires the calling PE, although the global lock may be managed by other PEs - /// - /// - /// # Examples - ///``` - /// use lamellar::array::prelude::*; - /// let world = LamellarWorldBuilder::new().build(); - /// let my_pe = world.my_pe(); - /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); - /// - /// let read_lock = array.blocking_read_lock(); - /// //do interesting work - /// - ///``` - pub fn blocking_read_lock(&self) -> GlobalLockReadGuard { - if std::thread::current().id() != *crate::MAIN_THREAD { - if let Some(val) = config().blocking_call_warning { - if val { - println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_read_lock` from within an async context which may lead to deadlock, it is recommended that you use `read_lock().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); - } - } else { - println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_read_lock` from within an async context which may lead to deadlock, it is recommended that you use `read_lock().await;` instead! 
- Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); - } - } - let self_clone: GlobalLockArray = self.clone(); - self.block_on(async move { - GlobalLockReadGuard { - array: self_clone.clone(), - lock_guard: self_clone.lock.read().await, - } - }) - } + // #[doc(alias("One-sided", "onesided"))] + // /// Return a global read lock guard on the calling PE + // /// + // /// this function will block the thread until the lock is acquired + // /// Calling within an asynchronous block may lead to deadlock, use [read_lock](self::GlobalLockArray::read_lock) instead. + // /// + // /// # One-sided Operation + // /// Only explictly requires the calling PE, although the global lock may be managed by other PEs + // /// + // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// let world = LamellarWorldBuilder::new().build(); + // /// let my_pe = world.my_pe(); + // /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + // /// + // /// let read_lock = array.blocking_read_lock(); + // /// //do interesting work + // /// + // ///``` + // pub fn blocking_read_lock(&self) -> GlobalLockReadGuard { + // if std::thread::current().id() != *crate::MAIN_THREAD { + // if let Some(val) = config().blocking_call_warning { + // if val { + // println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_read_lock` from within an async context which may lead to deadlock, it is recommended that you use `read_lock().await;` instead! + // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); + // } + // } else { + // println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_read_lock` from within an async context which may lead to deadlock, it is recommended that you use `read_lock().await;` instead! + // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); + // } + // } + // let self_clone: GlobalLockArray = self.clone(); + // self.block_on(async move { + // GlobalLockReadGuard { + // array: self_clone.clone(), + // lock_guard: self_clone.lock.read().await, + // } + // }) + // } #[doc(alias("One-sided", "onesided"))] /// Return a global read lock guard on the calling PE @@ -415,54 +420,51 @@ impl GlobalLockArray { /// //do interesting work /// }); ///``` - pub async fn read_lock(&self) -> GlobalLockReadGuard { - GlobalLockReadGuard { - array: self.clone(), - lock_guard: self.lock.read().await, - } - } - - #[doc(alias("One-sided", "onesided"))] - /// Return a global write lock guard on the calling PE - /// - /// this function will block the thread until the lock is acquired - /// Calling within an asynchronous block may lead to deadlock, use [write_lock](self::GlobalLockArray::write_lock) instead. 
- /// - /// # One-sided Operation - /// Only explictly requires the calling PE, although the global lock may be managed by other PEs - /// - /// - /// # Examples - ///``` - /// use lamellar::array::prelude::*; - /// let world = LamellarWorldBuilder::new().build(); - /// let my_pe = world.my_pe(); - /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); - /// - /// let write_lock = array.blocking_write_lock(); - /// //do interesting work - /// - ///``` - pub fn blocking_write_lock(&self) -> GlobalLockWriteGuard { - if std::thread::current().id() != *crate::MAIN_THREAD { - if let Some(val) = config().blocking_call_warning { - if val { - println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_write_lock` from within an async context which may lead to deadlock, it is recommended that you use `write_lock().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); - } - } else { - println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_write_lock` from within an async context which may lead to deadlock, it is recommended that you use `write_lock().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); - } - } - let self_clone: GlobalLockArray = self.clone(); - self.block_on(async move { - GlobalLockWriteGuard { - array: self_clone.clone(), - lock_guard: self_clone.lock.write().await, - } - }) - } + pub fn read_lock(&self) -> GlobalLockReadHandle { + GlobalLockReadHandle::new(self.clone()) + } + + // #[doc(alias("One-sided", "onesided"))] + // /// Return a global write lock guard on the calling PE + // /// + // /// this function will block the thread until the lock is acquired + // /// Calling within an asynchronous block may lead to deadlock, use [write_lock](self::GlobalLockArray::write_lock) instead. + // /// + // /// # One-sided Operation + // /// Only explictly requires the calling PE, although the global lock may be managed by other PEs + // /// + // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// let world = LamellarWorldBuilder::new().build(); + // /// let my_pe = world.my_pe(); + // /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + // /// + // /// let write_lock = array.blocking_write_lock(); + // /// //do interesting work + // /// + // ///``` + // pub fn blocking_write_lock(&self) -> GlobalLockWriteGuard { + // if std::thread::current().id() != *crate::MAIN_THREAD { + // if let Some(val) = config().blocking_call_warning { + // if val { + // println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_write_lock` from within an async context which may lead to deadlock, it is recommended that you use `write_lock().await;` instead! + // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); + // } + // } else { + // println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_write_lock` from within an async context which may lead to deadlock, it is recommended that you use `write_lock().await;` instead! 
+ // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); + // } + // } + // let self_clone: GlobalLockArray = self.clone(); + // self.block_on(async move { + // GlobalLockWriteGuard { + // array: self_clone.clone(), + // lock_guard: self_clone.lock.write().await, + // } + // }) + // } #[doc(alias("One-sided", "onesided"))] /// Return a global write lock guard on the calling PE @@ -484,57 +486,54 @@ impl GlobalLockArray { /// //do interesting work /// }); ///``` - pub async fn write_lock(&self) -> GlobalLockWriteGuard { - GlobalLockWriteGuard { - array: self.clone(), - lock_guard: self.lock.write().await, - } - } - - #[doc(alias("One-sided", "onesided"))] - /// Return the calling PE's local data as a [GlobalLockLocalData], which allows safe immutable access to local elements. - /// - /// Calling this function will result in a local read lock being captured on the array - /// - /// This function is blocking and intended to be called from non asynchronous contexts. - /// Calling within an asynchronous block may lead to deadlock. - /// - /// # One-sided Operation - /// Only returns local data on the calling PE - /// - /// # Examples - ///``` - /// use lamellar::array::prelude::*; - /// let world = LamellarWorldBuilder::new().build(); - /// let my_pe = world.my_pe(); - /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); - /// - /// let local_data = array.blocking_read_local_data(); - /// println!("PE{my_pe} data: {local_data:?}"); - ///``` - pub fn blocking_read_local_data(&self) -> GlobalLockLocalData { - if std::thread::current().id() != *crate::MAIN_THREAD { - if let Some(val) = config().blocking_call_warning { - if val { - println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_read_local_data` from within an async context which may lead to deadlock, it is recommended that you use `read_local_data().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); - } - } else { - println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_read_local_data` from within an async context which may lead to deadlock, it is recommended that you use `read_local_data().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); - } - } - let self_clone: GlobalLockArray = self.clone(); - self.block_on(async move { - GlobalLockLocalData { - array: self_clone.clone(), - start_index: 0, - end_index: self_clone.array.num_elems_local(), - // lock: self_clone.lock.clone(), - lock_guard: self_clone.lock.read().await, - } - }) - } + pub fn write_lock(&self) -> GlobalLockWriteHandle { + GlobalLockWriteHandle::new(self.clone()) + } + + // #[doc(alias("One-sided", "onesided"))] + // /// Return the calling PE's local data as a [GlobalLockLocalData], which allows safe immutable access to local elements. + // /// + // /// Calling this function will result in a local read lock being captured on the array + // /// + // /// This function is blocking and intended to be called from non asynchronous contexts. + // /// Calling within an asynchronous block may lead to deadlock. 
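// NOTE (illustrative sketch, not part of this patch): `read_lock()` and `write_lock()` above now
// return `GlobalLockReadHandle`/`GlobalLockWriteHandle` instead of guards, replacing the
// commented-out `blocking_read_lock`/`blocking_write_lock`. The usage below assumes the handles
// can be `.block()`ed from a non-async context or `.await`ed inside one, as the updated doc
// examples in this commit suggest.
//
// use lamellar::array::prelude::*;
//
// let world = LamellarWorldBuilder::new().build();
// let array: GlobalLockArray<usize> = GlobalLockArray::new(&world, 100, Distribution::Cyclic);
//
// // main thread: block until the global read lock is acquired
// let read_guard = array.read_lock().block();
// drop(read_guard); // release the read lock before requesting write access
//
// // async context: await the handle instead of blocking a worker thread
// let array_clone = array.clone();
// world.block_on(async move {
//     let _write_guard = array_clone.write_lock().await;
//     // do interesting work while holding the global write lock
// });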
+ // /// + // /// # One-sided Operation + // /// Only returns local data on the calling PE + // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// let world = LamellarWorldBuilder::new().build(); + // /// let my_pe = world.my_pe(); + // /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + // /// + // /// let local_data = array.read_local_data.block(); + // /// println!("PE{my_pe} data: {local_data:?}"); + // ///``` + // pub fn blocking_read_local_data(&self) -> GlobalLockLocalData { + // if std::thread::current().id() != *crate::MAIN_THREAD { + // if let Some(val) = config().blocking_call_warning { + // if val { + // println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_read_local_data` from within an async context which may lead to deadlock, it is recommended that you use `read_local_data().await;` instead! + // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); + // } + // } else { + // println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_read_local_data` from within an async context which may lead to deadlock, it is recommended that you use `read_local_data().await;` instead! + // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); + // } + // } + // let self_clone: GlobalLockArray = self.clone(); + // self.block_on(async move { + // GlobalLockLocalData { + // array: self_clone.clone(), + // start_index: 0, + // end_index: self_clone.array.num_elems_local(), + // // lock: self_clone.lock.clone(), + // lock_guard: self_clone.lock.read().await, + // } + // }) + // } #[doc(alias("One-sided", "onesided"))] /// Return the calling PE's local data as a [GlobalLockLocalData], which allows safe immutable access to local elements. @@ -558,62 +557,62 @@ impl GlobalLockArray { /// println!("PE{my_pe} data: {local_data:?}"); /// }); ///``` - pub async fn read_local_data(&self) -> GlobalLockLocalData { - GlobalLockLocalData { + pub fn read_local_data(&self) -> GlobalLockLocalDataHandle { + GlobalLockLocalDataHandle { array: self.clone(), start_index: 0, end_index: self.array.num_elems_local(), // lock: self.lock.clone(), - lock_guard: self.lock.read().await, + lock_handle: self.lock.read(), } } - #[doc(alias("One-sided", "onesided"))] - /// Return the calling PE's local data as a [GlobalLockMutLocalData], which allows safe mutable access to local elements. - /// - /// Calling this function will result in the global write lock being captured on the array. - ///. - /// This function is blocking and intended to be called from non asynchronous contexts. - /// Calling within an asynchronous block may lead to deadlock. 
- /// - /// # One-sided Operation - /// Only returns (mutable) local data on the calling PE - /// - /// # Examples - ///``` - /// use lamellar::array::prelude::*; - /// let world = LamellarWorldBuilder::new().build(); - /// let my_pe = world.my_pe(); - /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); - /// - /// let local_data = array.blocking_write_local_data(); - /// println!("PE{my_pe} data: {local_data:?}"); - ///``` - pub fn blocking_write_local_data(&self) -> GlobalLockMutLocalData { - if std::thread::current().id() != *crate::MAIN_THREAD { - if let Some(val) = config().blocking_call_warning { - if val { - println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_write_local_data` from within an async context which may lead to deadlock, it is recommended that you use `write_local_data().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); - } - } else { - println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_write_local_data` from within an async context which may lead to deadlock, it is recommended that you use `write_local_data().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); - } - } - let self_clone: GlobalLockArray = self.clone(); - self.block_on(async move { - let lock = self_clone.lock.write().await; - let data = GlobalLockMutLocalData { - array: self_clone.clone(), - start_index: 0, - end_index: self_clone.array.num_elems_local(), - lock_guard: lock, - }; - // println!("got lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); - data - }) - } + // #[doc(alias("One-sided", "onesided"))] + // /// Return the calling PE's local data as a [GlobalLockMutLocalData], which allows safe mutable access to local elements. + // /// + // /// Calling this function will result in the global write lock being captured on the array. + // ///. + // /// This function is blocking and intended to be called from non asynchronous contexts. + // /// Calling within an asynchronous block may lead to deadlock. + // /// + // /// # One-sided Operation + // /// Only returns (mutable) local data on the calling PE + // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// let world = LamellarWorldBuilder::new().build(); + // /// let my_pe = world.my_pe(); + // /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + // /// + // /// let local_data = array.blocking_write_local_data(); + // /// println!("PE{my_pe} data: {local_data:?}"); + // ///``` + // pub fn blocking_write_local_data(&self) -> GlobalLockMutLocalData { + // if std::thread::current().id() != *crate::MAIN_THREAD { + // if let Some(val) = config().blocking_call_warning { + // if val { + // println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_write_local_data` from within an async context which may lead to deadlock, it is recommended that you use `write_local_data().await;` instead! 
+ // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); + // } + // } else { + // println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_write_local_data` from within an async context which may lead to deadlock, it is recommended that you use `write_local_data().await;` instead! + // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); + // } + // } + // let self_clone: GlobalLockArray = self.clone(); + // self.block_on(async move { + // let lock = self_clone.lock.write().await; + // let data = GlobalLockMutLocalData { + // array: self_clone.clone(), + // start_index: 0, + // end_index: self_clone.array.num_elems_local(), + // lock_guard: lock, + // }; + // // println!("got lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); + // data + // }) + // } #[doc(alias("One-sided", "onesided"))] /// Return the calling PE's local data as a [GlobalLockMutLocalData], which allows safe mutable access to local elements. @@ -637,63 +636,60 @@ impl GlobalLockArray { /// println!("PE{my_pe} data: {local_data:?}"); /// }); ///``` - pub async fn write_local_data(&self) -> GlobalLockMutLocalData { - let lock = self.lock.write().await; - let data = GlobalLockMutLocalData { + pub fn write_local_data(&self) -> GlobalLockMutLocalDataHandle { + GlobalLockMutLocalDataHandle { array: self.clone(), start_index: 0, end_index: self.array.num_elems_local(), - lock_guard: lock, - }; - // println!("got lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); - data - } - - /// Return the calling PE's local data as a [GlobalLockMutLocalData], which allows safe mutable access to local elements. - /// - /// Calling this function will result in the collective write lock being captured on the array - /// - /// # Collective Operation - /// All PEs associated with this array must enter the call, otherwise deadlock will occur. - /// Upon return every PE will hold a special collective write lock so that they can all access their local data simultaneous - /// This lock prevents any other access from occuring on the array until it is dropped on all the PEs. - /// - /// # Examples - ///``` - /// use lamellar::array::prelude::*; - /// let world = LamellarWorldBuilder::new().build(); - /// let my_pe = world.my_pe(); - /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); - /// - /// let local_data = array.blocking_collective_write_local_data(); - /// println!("PE{my_pe} data: {local_data:?}"); - ///``` - pub fn blocking_collective_write_local_data(&self) -> GlobalLockCollectiveMutLocalData { - if std::thread::current().id() != *crate::MAIN_THREAD { - if let Some(val) = config().blocking_call_warning { - if val { - println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_collective_write_local_data` from within an async context which may lead to deadlock, it is recommended that you use `collective_write_local_data().await;` instead! 
- Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); - } - } else { - println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_collective_write_local_data` from within an async context which may lead to deadlock, it is recommended that you use `collective_write_local_data().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); - } + lock_handle: self.lock.write(), } - let self_clone: GlobalLockArray = self.clone(); - self.block_on(async move { - let lock = self_clone.lock.collective_write().await; - let data = GlobalLockCollectiveMutLocalData { - array: self_clone.clone(), - start_index: 0, - end_index: self_clone.array.num_elems_local(), - _lock_guard: lock, - }; - // println!("got lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); - data - }) } + // /// Return the calling PE's local data as a [GlobalLockMutLocalData], which allows safe mutable access to local elements. + // /// + // /// Calling this function will result in the collective write lock being captured on the array + // /// + // /// # Collective Operation + // /// All PEs associated with this array must enter the call, otherwise deadlock will occur. + // /// Upon return every PE will hold a special collective write lock so that they can all access their local data simultaneous + // /// This lock prevents any other access from occuring on the array until it is dropped on all the PEs. + // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// let world = LamellarWorldBuilder::new().build(); + // /// let my_pe = world.my_pe(); + // /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + // /// + // /// let local_data = array.blocking_collective_write_local_data(); + // /// println!("PE{my_pe} data: {local_data:?}"); + // ///``` + // pub fn blocking_collective_write_local_data(&self) -> GlobalLockCollectiveMutLocalData { + // if std::thread::current().id() != *crate::MAIN_THREAD { + // if let Some(val) = config().blocking_call_warning { + // if val { + // println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_collective_write_local_data` from within an async context which may lead to deadlock, it is recommended that you use `collective_write_local_data().await;` instead! + // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); + // } + // } else { + // println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_collective_write_local_data` from within an async context which may lead to deadlock, it is recommended that you use `collective_write_local_data().await;` instead! 
+ // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); + // } + // } + // let self_clone: GlobalLockArray = self.clone(); + // self.block_on(async move { + // let lock = self_clone.lock.collective_write().await; + // let data = GlobalLockCollectiveMutLocalData { + // array: self_clone.clone(), + // start_index: 0, + // end_index: self_clone.array.num_elems_local(), + // _lock_guard: lock, + // }; + // // println!("got lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); + // data + // }) + // } + #[doc(alias("Collective"))] /// Return the calling PE's local data as a [GlobalLockMutLocalData], which allows safe mutable access to local elements. /// @@ -716,16 +712,13 @@ impl GlobalLockArray { /// println!("PE{my_pe} data: {local_data:?}"); /// }); ///``` - pub async fn collective_write_local_data(&self) -> GlobalLockCollectiveMutLocalData { - let lock = self.lock.collective_write().await; - let data = GlobalLockCollectiveMutLocalData { + pub fn collective_write_local_data(&self) -> GlobalLockCollectiveMutLocalDataHandle { + GlobalLockCollectiveMutLocalDataHandle { array: self.clone(), start_index: 0, end_index: self.array.num_elems_local(), - _lock_guard: lock, - }; - // println!("got lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); - data + lock_handle: self.lock.collective_write(), + } } #[doc(hidden)] @@ -766,7 +759,7 @@ impl GlobalLockArray { /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); /// /// let array1 = array.clone(); - /// let slice = array1.blocking_read_local_data(); + /// let slice = array1.read_local_data.block(); /// /// // no borrows to this specific instance (array) so it can enter the "into_unsafe" call /// // but array1 will not be dropped until after 'slice' is dropped. @@ -814,7 +807,7 @@ impl GlobalLockArray { /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); /// /// let array1 = array.clone(); - /// let slice = array1.blocking_read_local_data(); + /// let slice = array1.read_local_data.block(); /// /// // no borrows to this specific instance (array) so it can enter the "into_read_only" call /// // but array1 will not be dropped until after mut_slice is dropped. @@ -858,7 +851,7 @@ impl GlobalLockArray { /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); /// /// let array1 = array.clone(); - /// let slice = array1.blocking_read_local_data(); + /// let slice = array1.read_local_data.block(); /// /// // no borrows to this specific instance (array) so it can enter the "into_read_only" call /// // but array1 will not be dropped until after mut_slice is dropped. @@ -904,7 +897,7 @@ impl GlobalLockArray { /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); /// /// let array1 = array.clone(); - /// let slice = array1.blocking_read_local_data(); + /// let slice = array1.read_local_data.block(); /// /// // no borrows to this specific instance (array) so it can enter the "into_atomic" call /// // but array1 will not be dropped until after mut_slice is dropped. 
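// NOTE (illustrative sketch, not part of this patch): end of the GlobalLockArray changes. The
// local-data accessors above (`read_local_data()`, `write_local_data()`,
// `collective_write_local_data()`) now return handles instead of guards. The usage below assumes
// each handle exposes `.block()` (and `.await` in async code), mirroring the updated doc examples
// in this commit.
//
// use lamellar::array::prelude::*;
//
// let world = LamellarWorldBuilder::new().build();
// let my_pe = world.my_pe();
// let array: GlobalLockArray<usize> = GlobalLockArray::new(&world, 100, Distribution::Block);
//
// // immutable access to this PE's local slice (captures the global read lock)
// let local_data = array.read_local_data().block();
// println!("PE{my_pe} data: {local_data:?}");
// drop(local_data); // release the read lock
//
// // mutable access to this PE's local slice (captures the global write lock)
// let mut local_data = array.write_local_data().block();
// local_data[0] = my_pe;
// drop(local_data);
//
// // collective write access: every PE must enter the call or it will deadlock
// let local_data = array.collective_write_local_data().block();
// drop(local_data);
// array.barrier();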
diff --git a/src/array/local_lock_atomic.rs b/src/array/local_lock_atomic.rs index 8dbce07c..ddeb6bbe 100644 --- a/src/array/local_lock_atomic.rs +++ b/src/array/local_lock_atomic.rs @@ -1,6 +1,11 @@ mod iteration; pub(crate) mod local_chunks; pub use local_chunks::{LocalLockLocalChunks, LocalLockLocalChunksMut}; +mod handle; +use handle::{ + LocalLockLocalDataHandle, LocalLockMutLocalDataHandle, LocalLockReadHandle, + LocalLockWriteHandle, +}; pub(crate) mod operations; mod rdma; use crate::array::private::ArrayExecAm; @@ -8,8 +13,8 @@ use crate::array::private::LamellarArrayPrivate; use crate::array::r#unsafe::{UnsafeByteArray, UnsafeByteArrayWeak}; use crate::array::*; use crate::barrier::BarrierHandle; -use crate::config; -use crate::darc::local_rw_darc::LocalRwDarc; +use crate::darc::local_rw_darc::LocalRwDarcWriteGuard; +use crate::darc::local_rw_darc::{LocalRwDarc, LocalRwDarcReadGuard}; use crate::darc::DarcMode; use crate::lamellar_request::LamellarRequest; use crate::lamellar_team::{IntoLamellarTeam, LamellarTeamRT}; @@ -20,7 +25,6 @@ use crate::scheduler::LamellarTask; // lock_api::{ArcRwLockReadGuard, ArcRwLockWriteGuard}, // RawRwLock, // }; -use async_lock::{RwLockReadGuardArc, RwLockWriteGuardArc}; use pin_project::pin_project; use std::ops::{Deref, DerefMut}; @@ -88,7 +92,7 @@ pub struct LocalLockMutLocalData { array: LocalLockArray, start_index: usize, end_index: usize, - lock_guard: RwLockWriteGuardArc<()>, + lock_guard: LocalRwDarcWriteGuard<()>, } // impl Drop for LocalLockMutLocalData { @@ -120,13 +124,12 @@ impl DerefMut for LocalLockMutLocalData { #[derive(Debug)] pub struct LocalLockLocalData { pub(crate) array: LocalLockArray, - // lock: LocalRwDarc<()>, start_index: usize, end_index: usize, - lock_guard: Arc>, + lock_guard: Arc>, } -impl<'a, T: Dist> Clone for LocalLockLocalData { +impl Clone for LocalLockLocalData { fn clone(&self) -> Self { // println!("getting read lock in LocalLockLocalData clone"); LocalLockLocalData { @@ -154,7 +157,7 @@ impl<'a, T: Dist> Clone for LocalLockLocalData { // } // } -impl<'a, T: Dist> LocalLockLocalData { +impl LocalLockLocalData { /// Convert into a smaller sub range of the local data, the original read lock is transfered to the new sub data to mainitain safety guarantees /// /// # Examples @@ -166,7 +169,7 @@ impl<'a, T: Dist> LocalLockLocalData { /// let my_pe = world.my_pe(); /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); /// - /// let local_data = array.blocking_read_local_data(); + /// let local_data = array.read_local_data().block(); /// let sub_data = local_data.clone().into_sub_data(10,20); // clone() essentially increases the references to the read lock by 1. 
/// assert_eq!(local_data[10],sub_data[0]); ///``` @@ -182,7 +185,21 @@ impl<'a, T: Dist> LocalLockLocalData { } } -impl<'a, T: Dist + serde::Serialize> serde::Serialize for LocalLockLocalData { +// impl LocalLockGuard for LocalLockLocalData { +// type Guard = LocalRwDarcReadGuard<()>; +// fn new(array: LocalLockArray, lock_guard: Self::Guard) -> Self { +// let end_index = array.num_elems_local(); +// LocalLockLocalData { +// array, +// start_index: 0, +// end_index, +// // lock: self.lock.clone(), +// lock_guard: Arc::new(lock_guard), +// } +// } +// } + +impl serde::Serialize for LocalLockLocalData { fn serialize(&self, serializer: S) -> Result where S: serde::Serializer, @@ -234,7 +251,7 @@ impl Deref for LocalLockLocalData { #[derive(Clone)] pub struct LocalLockReadGuard { pub(crate) array: LocalLockArray, - lock_guard: Arc>, + lock_guard: Arc>, } impl LocalLockReadGuard { @@ -250,10 +267,21 @@ impl LocalLockReadGuard { } } +// impl LocalLockGuard for LocalLockReadGuard { +// type Guard = LocalRwDarcReadGuard<()>; +// fn new(array: LocalLockArray, lock_guard: Self::Guard) -> Self { +// LocalLockReadGuard { +// array, +// lock_guard: Arc::new(lock_guard), +// } +// } +// } + /// Captures a write lock on the array, allowing mutable access to the underlying data +// #[derive(Clone)] pub struct LocalLockWriteGuard { pub(crate) array: LocalLockArray, - lock_guard: RwLockWriteGuardArc<()>, + lock_guard: LocalRwDarcWriteGuard<()>, } impl From> for LocalLockWriteGuard { @@ -329,46 +357,46 @@ impl LocalLockArray { } } - #[doc(alias("One-sided", "onesided"))] - /// Return the calling PE's local read lock guard - /// - /// this function will block the thread until the lock is acquired - /// Calling within an asynchronous block may lead to deadlock, use [read_lock](self::LocalLockArray::read_lock) instead. - /// - /// # One-sided Operation - /// Only explictly requires the calling PE - /// - /// - /// # Examples - ///``` - /// use lamellar::array::prelude::*; - /// let world = LamellarWorldBuilder::new().build(); - /// let my_pe = world.my_pe(); - /// let array:LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); - /// - /// let read_lock = array.blocking_read_lock(); - /// //do interesting work - /// - pub fn blocking_read_lock(&self) -> LocalLockReadGuard { - if std::thread::current().id() != *crate::MAIN_THREAD { - if let Some(val) = config().blocking_call_warning { - if val { - println!("[LAMELLAR WARNING] You are calling `LocalLockArray::blocking_read_lock` from within an async context which may lead to deadlock, it is recommended that you use `read_lock().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); - } - } else { - println!("[LAMELLAR WARNING] You are calling `LocalLockArray::blocking_read_lock` from within an async context which may lead to deadlock, it is recommended that you use `read_lock().await;` instead! 
- Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); - } - } - let self_clone: LocalLockArray = self.clone(); - self.block_on(async move { - LocalLockReadGuard { - array: self_clone.clone(), - lock_guard: Arc::new(self_clone.lock.read().await), - } - }) - } + // #[doc(alias("One-sided", "onesided"))] + // /// Return the calling PE's local read lock guard + // /// + // /// this function will block the thread until the lock is acquired + // /// Calling within an asynchronous block may lead to deadlock, use [read_lock](self::LocalLockArray::read_lock) instead. + // /// + // /// # One-sided Operation + // /// Only explictly requires the calling PE + // /// + // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// let world = LamellarWorldBuilder::new().build(); + // /// let my_pe = world.my_pe(); + // /// let array:LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + // /// + // /// let read_lock = array.read_lock().block(); + // /// //do interesting work + // /// + // pub fn blocking_read_lock(&self) -> LocalLockReadGuard { + // if std::thread::current().id() != *crate::MAIN_THREAD { + // if let Some(val) = config().blocking_call_warning { + // if val { + // println!("[LAMELLAR WARNING] You are calling `LocalLockArray::blocking_read_lock` from within an async context which may lead to deadlock, it is recommended that you use `read_lock().await;` instead! + // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); + // } + // } else { + // println!("[LAMELLAR WARNING] You are calling `LocalLockArray::blocking_read_lock` from within an async context which may lead to deadlock, it is recommended that you use `read_lock().await;` instead! + // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); + // } + // } + // let self_clone: LocalLockArray = self.clone(); + // self.block_on(async move { + // LocalLockReadGuard { + // array: self_clone.clone(), + // lock_guard: Arc::new(self_clone.lock.read().await), + // } + // }) + // } #[doc(alias("One-sided", "onesided"))] /// Return the calling PE's local read lock @@ -390,53 +418,50 @@ impl LocalLockArray { /// //do interesting work /// }); ///``` - pub async fn read_lock(&self) -> LocalLockReadGuard { - LocalLockReadGuard { - array: self.clone(), - lock_guard: Arc::new(self.lock.read().await), - } - } - - #[doc(alias("One-sided", "onesided"))] - /// Return the calling PE's local write lock guard - /// - /// this function will block the thread until the lock is acquired - /// Calling within an asynchronous block may lead to deadlock, use [write_lock](self::LocalLockArray::write_lock) instead. 
- /// - /// # One-sided Operation - /// Only explictly requires the calling PE - /// - /// - /// # Examples - ///``` - /// use lamellar::array::prelude::*; - /// let world = LamellarWorldBuilder::new().build(); - /// let my_pe = world.my_pe(); - /// let array:LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); - /// - /// let write_lock = array.blocking_write_lock(); - /// //do interesting work - /// - pub fn blocking_write_lock(&self) -> LocalLockWriteGuard { - if std::thread::current().id() != *crate::MAIN_THREAD { - if let Some(val) = config().blocking_call_warning { - if val { - println!("[LAMELLAR WARNING] You are calling `LocalLockArray::blocking_write_lock` from within an async context which may lead to deadlock, it is recommended that you use `write_lock().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); - } - } else { - println!("[LAMELLAR WARNING] You are calling `LocalLockArray::blocking_write_lock` from within an async context which may lead to deadlock, it is recommended that you use `write_lock().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); - } - } - let self_clone: LocalLockArray = self.clone(); - self.block_on(async move { - LocalLockWriteGuard { - array: self_clone.clone(), - lock_guard: self_clone.lock.write().await, - } - }) - } + pub fn read_lock(&self) -> LocalLockReadHandle { + LocalLockReadHandle::new(self.clone()) + } + + // #[doc(alias("One-sided", "onesided"))] + // /// Return the calling PE's local write lock guard + // /// + // /// this function will block the thread until the lock is acquired + // /// Calling within an asynchronous block may lead to deadlock, use [write_lock](self::LocalLockArray::write_lock) instead. + // /// + // /// # One-sided Operation + // /// Only explictly requires the calling PE + // /// + // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// let world = LamellarWorldBuilder::new().build(); + // /// let my_pe = world.my_pe(); + // /// let array:LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + // /// + // /// let write_lock = array.blocking_write_lock(); + // /// //do interesting work + // /// + // pub fn blocking_write_lock(&self) -> LocalLockWriteGuard { + // if std::thread::current().id() != *crate::MAIN_THREAD { + // if let Some(val) = config().blocking_call_warning { + // if val { + // println!("[LAMELLAR WARNING] You are calling `LocalLockArray::blocking_write_lock` from within an async context which may lead to deadlock, it is recommended that you use `write_lock().await;` instead! + // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); + // } + // } else { + // println!("[LAMELLAR WARNING] You are calling `LocalLockArray::blocking_write_lock` from within an async context which may lead to deadlock, it is recommended that you use `write_lock().await;` instead! 
+ // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); + // } + // } + // let self_clone: LocalLockArray = self.clone(); + // self.block_on(async move { + // LocalLockWriteGuard { + // array: self_clone.clone(), + // lock_guard: self_clone.lock.write().await, + // } + // }) + // } #[doc(alias("One-sided", "onesided"))] /// Return the calling PE's local write lock @@ -452,61 +477,61 @@ impl LocalLockArray { /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); - /// - /// world.block_on(async move { - /// let write_lock = array.write_lock().await; + /// let array_clone = array.clone(); + /// world.spawn(async move { + /// let write_lock = array_clone.write_lock().await; /// //do interesting work /// }); + /// array.write_lock().block(); + /// //do interesting work ///``` - pub async fn write_lock(&self) -> LocalLockWriteGuard { - LocalLockWriteGuard { - array: self.clone(), - lock_guard: self.lock.write().await, - } - } + pub fn write_lock(&self) -> LocalLockWriteHandle { + LocalLockWriteHandle::new(self.clone()) + } + + // #[doc(alias("One-sided", "onesided"))] + // /// Return the calling PE's local data as a [LocalLockLocalData], which allows safe immutable access to local elements. + // /// + // /// Calling this function will result in a local read lock being captured on the array + // /// + // /// # One-sided Operation + // /// Only returns local data on the calling PE + // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// let world = LamellarWorldBuilder::new().build(); + // /// let my_pe = world.my_pe(); + // /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + // /// + // /// let local_data = array.read_local_data().block(); + // /// println!("PE{my_pe} data: {local_data:?}"); + // ///``` + // pub fn blocking_read_local_data(&self) -> LocalLockLocalData { + // if std::thread::current().id() != *crate::MAIN_THREAD { + // if let Some(val) = config().blocking_call_warning { + // if val { + // println!("[LAMELLAR WARNING] You are calling `LocalLockArray::blocking_read_local_data` from within an async context which may lead to deadlock, it is recommended that you use `read_local_data().await;` instead! + // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); + // } + // } else { + // println!("[LAMELLAR WARNING] You are calling `LocalLockArray::blocking_read_local_data` from within an async context which may lead to deadlock, it is recommended that you use `read_local_data().await;` instead! + // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); + // } + // } + // let self_clone: LocalLockArray = self.clone(); + // self.block_on(async move { + // LocalLockLocalData { + // array: self_clone.clone(), + // // lock: self_clone.lock.clone(), + // start_index: 0, + // end_index: self_clone.num_elems_local(), + // lock_guard: Arc::new(self_clone.lock.read().await), + // } + // }) + // } #[doc(alias("One-sided", "onesided"))] - /// Return the calling PE's local data as a [LocalLockLocalData], which allows safe immutable access to local elements. 
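// NOTE (illustrative sketch, not part of this patch): LocalLockArray follows the same pattern as
// GlobalLockArray, with `read_lock()`/`write_lock()` now returning
// `LocalLockReadHandle`/`LocalLockWriteHandle`. The usage below assumes the handles can be
// `.await`ed in a spawned task or `.block()`ed on the main thread, as the updated doc examples in
// this commit suggest.
//
// use lamellar::array::prelude::*;
//
// let world = LamellarWorldBuilder::new().build();
// let array: LocalLockArray<usize> = LocalLockArray::new(&world, 100, Distribution::Cyclic);
//
// let array_clone = array.clone();
// world.spawn(async move {
//     // acquire this PE's local write lock from a worker task
//     let _write_guard = array_clone.write_lock().await;
// });
// // acquire this PE's local read lock from the main thread (waits if the write lock is held)
// let _read_guard = array.read_lock().block();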
- /// - /// Calling this function will result in a local read lock being captured on the array - /// - /// # One-sided Operation - /// Only returns local data on the calling PE - /// - /// # Examples - ///``` - /// use lamellar::array::prelude::*; - /// let world = LamellarWorldBuilder::new().build(); - /// let my_pe = world.my_pe(); - /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); - /// - /// let local_data = array.blocking_read_local_data(); - /// println!("PE{my_pe} data: {local_data:?}"); - ///``` - pub fn blocking_read_local_data(&self) -> LocalLockLocalData { - if std::thread::current().id() != *crate::MAIN_THREAD { - if let Some(val) = config().blocking_call_warning { - if val { - println!("[LAMELLAR WARNING] You are calling `LocalLockArray::blocking_read_local_data` from within an async context which may lead to deadlock, it is recommended that you use `read_local_data().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); - } - } else { - println!("[LAMELLAR WARNING] You are calling `LocalLockArray::blocking_read_local_data` from within an async context which may lead to deadlock, it is recommended that you use `read_local_data().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); - } - } - let self_clone: LocalLockArray = self.clone(); - self.block_on(async move { - LocalLockLocalData { - array: self_clone.clone(), - // lock: self_clone.lock.clone(), - start_index: 0, - end_index: self_clone.num_elems_local(), - lock_guard: Arc::new(self_clone.lock.read().await), - } - }) - } - /// Return the calling PE's local data as a [LocalLockLocalData], which allows safe immutable access to local elements. /// /// Calling this function will result in a local read lock being captured on the array @@ -526,60 +551,58 @@ impl LocalLockArray { /// println!("PE{my_pe} data: {local_data:?}"); /// }); ///``` - pub async fn read_local_data(&self) -> LocalLockLocalData { - // println!("getting read lock in read_local_local"); - LocalLockLocalData { + pub fn read_local_data(&self) -> LocalLockLocalDataHandle { + LocalLockLocalDataHandle { array: self.clone(), - // lock: self.lock.clone(), start_index: 0, end_index: self.num_elems_local(), - lock_guard: Arc::new(self.lock.read().await), + lock_handle: self.lock.read(), } } - #[doc(alias("One-sided", "onesided"))] - /// Return the calling PE's local data as a [LocalLockMutLocalData], which allows safe mutable access to local elements. 
- /// - /// Calling this function will result in the local write lock being captured on the array - /// - /// # One-sided Operation - /// Only returns (mutable) local data on the calling PE - /// - /// # Examples - ///``` - /// use lamellar::array::prelude::*; - /// let world = LamellarWorldBuilder::new().build(); - /// let my_pe = world.my_pe(); - /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); - /// - /// let local_data = array.blocking_write_local_data(); - /// println!("PE{my_pe} data: {local_data:?}"); - ///``` - pub fn blocking_write_local_data(&self) -> LocalLockMutLocalData { - if std::thread::current().id() != *crate::MAIN_THREAD { - if let Some(val) = config().blocking_call_warning { - if val { - println!("[LAMELLAR WARNING] You are calling `LocalLockArray::blocking_write_local_data` from within an async context which may lead to deadlock, it is recommended that you use `write_local_data().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); - } - } else { - println!("[LAMELLAR WARNING] You are calling `LocalLockArray::blocking_write_local_data` from within an async context which may lead to deadlock, it is recommended that you use `write_local_data().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); - } - } - let self_clone: LocalLockArray = self.clone(); - self.block_on(async move { - let lock = self_clone.lock.write().await; - let data = LocalLockMutLocalData { - array: self_clone.clone(), - start_index: 0, - end_index: self_clone.num_elems_local(), - lock_guard: lock, - }; - // println!("got lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); - data - }) - } + // #[doc(alias("One-sided", "onesided"))] + // /// Return the calling PE's local data as a [LocalLockMutLocalData], which allows safe mutable access to local elements. + // /// + // /// Calling this function will result in the local write lock being captured on the array + // /// + // /// # One-sided Operation + // /// Only returns (mutable) local data on the calling PE + // /// + // /// # Examples + // ///``` + // /// use lamellar::array::prelude::*; + // /// let world = LamellarWorldBuilder::new().build(); + // /// let my_pe = world.my_pe(); + // /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + // /// + // /// let local_data = array.blocking_write_local_data(); + // /// println!("PE{my_pe} data: {local_data:?}"); + // ///``` + // pub fn blocking_write_local_data(&self) -> LocalLockMutLocalData { + // if std::thread::current().id() != *crate::MAIN_THREAD { + // if let Some(val) = config().blocking_call_warning { + // if val { + // println!("[LAMELLAR WARNING] You are calling `LocalLockArray::blocking_write_local_data` from within an async context which may lead to deadlock, it is recommended that you use `write_local_data().await;` instead! 
+ // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); + // } + // } else { + // println!("[LAMELLAR WARNING] You are calling `LocalLockArray::blocking_write_local_data` from within an async context which may lead to deadlock, it is recommended that you use `write_local_data().await;` instead! + // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); + // } + // } + // let self_clone: LocalLockArray = self.clone(); + // self.block_on(async move { + // let lock = self_clone.lock.write().await; + // let data = LocalLockMutLocalData { + // array: self_clone.clone(), + // start_index: 0, + // end_index: self_clone.num_elems_local(), + // lock_guard: lock, + // }; + // // println!("got lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); + // data + // }) + // } #[doc(alias("One-sided", "onesided"))] /// TODO: UPDATE @@ -602,17 +625,13 @@ impl LocalLockArray { /// println!("PE{my_pe} data: {local_data:?}"); /// }); ///``` - pub async fn write_local_data(&self) -> LocalLockMutLocalData { - // println!("getting write lock in write_local_data"); - let lock = self.lock.write().await; - let data = LocalLockMutLocalData { + pub fn write_local_data(&self) -> LocalLockMutLocalDataHandle { + LocalLockMutLocalDataHandle { array: self.clone(), start_index: 0, end_index: self.num_elems_local(), - lock_guard: lock, - }; - // println!("got lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); - data + lock_handle: self.lock.write(), + } } #[doc(hidden)] @@ -653,7 +672,7 @@ impl LocalLockArray { /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); /// /// let array1 = array.clone(); - /// let slice = array1.blocking_read_local_data(); + /// let slice = array1.read_local_data().block(); /// /// // no borrows to this specific instance (array) so it can enter the "into_unsafe" call /// // but array1 will not be dropped until after 'slice' is dropped. @@ -701,7 +720,7 @@ impl LocalLockArray { /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); /// /// let array1 = array.clone(); - /// let slice = array1.blocking_read_local_data(); + /// let slice = array1.read_local_data().block(); /// /// // no borrows to this specific instance (array) so it can enter the "into_read_only" call /// // but array1 will not be dropped until after mut_slice is dropped. @@ -745,7 +764,7 @@ impl LocalLockArray { /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); /// /// let array1 = array.clone(); - /// let slice = array1.blocking_read_local_data(); + /// let slice = array1.read_local_data().block(); /// /// // no borrows to this specific instance (array) so it can enter the "into_global_lock" call /// // but array1 will not be dropped until after mut_slice is dropped. 
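// NOTE (illustrative sketch, not part of this patch): the LocalLockArray local-data accessors
// above (`read_local_data()`, `write_local_data()`) likewise return handles. The usage below
// assumes `.block()` outside of async code, per the updated doc examples in this commit.
//
// use lamellar::array::prelude::*;
//
// let world = LamellarWorldBuilder::new().build();
// let my_pe = world.my_pe();
// let array: LocalLockArray<usize> = LocalLockArray::new(&world, 100, Distribution::Block);
//
// let mut local_data = array.write_local_data().block(); // captures this PE's local write lock
// for elem in local_data.iter_mut() {
//     *elem = my_pe;
// }
// drop(local_data); // release the write lock
//
// let local_data = array.read_local_data().block(); // captures this PE's local read lock
// println!("PE{my_pe} data: {local_data:?}");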
@@ -791,7 +810,7 @@ impl LocalLockArray { /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); /// /// let array1 = array.clone(); - /// let slice = array1.blocking_read_local_data(); + /// let slice = array1.read_local_data().block(); /// /// // no borrows to this specific instance (array) so it can enter the "into_atomic" call /// // but array1 will not be dropped until after mut_slice is dropped. @@ -1179,7 +1198,7 @@ impl LocalLockReadGuard { /// let num_pes = world.num_pes(); /// let array = LocalLockArray::::new(&world,10,Distribution::Block); /// array.block_on(array.dist_iter_mut().enumerate().for_each(move |(i,elem)| *elem = i*2)); - /// let read_guard = array.blocking_read_lock(); + /// let read_guard = array.read_lock().block(); /// let prod = array.block_on(read_guard.reduce("prod")); ///``` #[must_use = "this function is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] @@ -1214,7 +1233,7 @@ impl LocalLockReadGuard { /// let num_pes = world.num_pes(); /// let array = LocalLockArray::::new(&world,10,Distribution::Block); /// array.block_on(array.dist_iter_mut().enumerate().for_each(move |(i,elem)| *elem = i*2)); - /// let read_guard = array.blocking_read_lock(); + /// let read_guard = array.read_lock().block(); /// let sum = array.block_on(read_guard.sum()); /// ``` #[must_use = "this function is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] @@ -1244,7 +1263,7 @@ impl LocalLockReadGuard { /// let num_pes = world.num_pes(); /// let array = LocalLockArray::::new(&world,10,Distribution::Block); /// array.block_on(array.dist_iter_mut().enumerate().for_each(move |(i,elem)| *elem = i+1)); - /// let read_guard = array.blocking_read_lock(); + /// let read_guard = array.read_lock().block(); /// let prod = array.block_on(read_guard.prod()).expect("array len > 0"); /// assert_eq!((1..=array.len()).product::(),prod); ///``` @@ -1276,7 +1295,7 @@ impl LocalLockReadGuard /// let num_pes = world.num_pes(); /// let array = LocalLockArray::::new(&world,10,Distribution::Block); /// array.block_on(array.dist_iter_mut().enumerate().for_each(move |(i,elem)| *elem = i*2)); - /// let read_guard = array.blocking_read_lock(); + /// let read_guard = array.read_lock().block(); /// let max = array.block_on(read_guard.max()).expect("array len > 0"); /// assert_eq!((array.len()-1)*2,max); ///``` @@ -1307,7 +1326,7 @@ impl LocalLockReadGuard /// let num_pes = world.num_pes(); /// let array = LocalLockArray::::new(&world,10,Distribution::Block); /// array.block_on(array.dist_iter_mut().enumerate().for_each(move |(i,elem)| *elem = i*2)); - /// let read_guard = array.blocking_read_lock(); + /// let read_guard = array.read_lock().block(); /// let min = array.block_on(read_guard.min()).expect("array len > 0"); /// assert_eq!(0,min); ///``` diff --git a/src/array/local_lock_atomic/iteration.rs b/src/array/local_lock_atomic/iteration.rs index 6e0c6bbf..918e2323 100644 --- a/src/array/local_lock_atomic/iteration.rs +++ b/src/array/local_lock_atomic/iteration.rs @@ -6,6 +6,7 @@ use crate::array::local_lock_atomic::*; use crate::array::private::LamellarArrayPrivate; use crate::array::r#unsafe::private::UnsafeArrayInner; use crate::array::*; +use crate::darc::local_rw_darc::LocalRwDarcWriteGuard; use crate::memregion::Dist; // use parking_lot::{ // lock_api::{ArcRwLockReadGuard, ArcRwLockWriteGuard}, @@ -23,7 +24,7 @@ impl InnerArray for LocalLockArray { 
#[derive(Clone)] pub struct LocalLockDistIter<'a, T: Dist> { data: LocalLockArray, - lock: Arc>, + lock: Arc>, cur_i: usize, end_i: usize, _marker: PhantomData<&'a T>, @@ -57,7 +58,7 @@ impl<'a, T: Dist> std::fmt::Debug for LocalLockDistIter<'a, T> { #[derive(Clone)] pub struct LocalLockLocalIter<'a, T: Dist> { data: LocalLockArray, - lock: Arc>, + lock: Arc>, cur_i: usize, end_i: usize, _marker: PhantomData<&'a T>, @@ -184,7 +185,7 @@ impl IndexedLocalIterator for LocalLockLocalIter<'static, T> pub struct LocalLockDistIterMut<'a, T: Dist> { data: LocalLockArray, - lock: Arc>, + lock: Arc>, cur_i: usize, end_i: usize, _marker: PhantomData<&'a T>, @@ -216,7 +217,7 @@ impl<'a, T: Dist> std::fmt::Debug for LocalLockDistIterMut<'a, T> { pub struct LocalLockLocalIterMut<'a, T: Dist> { data: LocalLockArray, - lock: Arc>, + lock: Arc>, cur_i: usize, end_i: usize, _marker: PhantomData<&'a T>, diff --git a/src/array/local_lock_atomic/local_chunks.rs b/src/array/local_lock_atomic/local_chunks.rs index a8b1739a..dfdeb759 100644 --- a/src/array/local_lock_atomic/local_chunks.rs +++ b/src/array/local_lock_atomic/local_chunks.rs @@ -3,6 +3,7 @@ use crate::array::iterator::private::*; use crate::array::local_lock_atomic::*; use crate::array::LamellarArray; use crate::config; +use crate::darc::local_rw_darc::{LocalRwDarcReadGuard, LocalRwDarcWriteGuard}; use crate::memregion::Dist; use std::sync::Arc; @@ -15,8 +16,9 @@ pub struct LocalLockLocalChunks { index: usize, //global index within the array local data end_index: usize, //global index within the array local data array: LocalLockArray, - lock: LocalRwDarc<()>, - lock_guard: Arc>, + // lock: LocalRwDarc<()>, + // lock_guard: Arc>, + lock_guard: Arc>, } impl IterClone for LocalLockLocalChunks { @@ -26,7 +28,7 @@ impl IterClone for LocalLockLocalChunks { index: self.index, end_index: self.end_index, array: self.array.clone(), - lock: self.lock.clone(), + // lock: self.lock.clone(), lock_guard: self.lock_guard.clone(), } } @@ -40,8 +42,8 @@ pub struct LocalLockLocalChunksMut { index: usize, //global index within the array local data end_index: usize, //global index within the array local data array: LocalLockArray, - lock: LocalRwDarc<()>, - lock_guard: Arc>, + // lock: LocalRwDarc<()>, + lock_guard: Arc>, } impl IterClone for LocalLockLocalChunksMut { @@ -51,7 +53,7 @@ impl IterClone for LocalLockLocalChunksMut { index: self.index, end_index: self.end_index, array: self.array.clone(), - lock: self.lock.clone(), + // lock: self.lock.clone(), lock_guard: self.lock_guard.clone(), } } @@ -61,7 +63,7 @@ impl IterClone for LocalLockLocalChunksMut { pub struct LocalLockMutChunkLocalData<'a, T: Dist> { data: &'a mut [T], _index: usize, - _lock_guard: Arc>, + _lock_guard: Arc>, } impl Deref for LocalLockMutChunkLocalData<'_, T> { @@ -95,7 +97,7 @@ impl LocalIterator for LocalLockLocalChunks { index: new_start_i, end_index: end_i, array: self.array.clone(), - lock: self.lock.clone(), + // lock: self.lock.clone(), lock_guard: self.lock_guard.clone(), } } @@ -160,7 +162,7 @@ impl LocalIterator for LocalLockLocalChunksMut { index: new_start_i, end_index: end_i, array: self.array.clone(), - lock: self.lock.clone(), + // lock: self.lock.clone(), lock_guard: self.lock_guard.clone(), } } @@ -241,7 +243,7 @@ impl LocalLockArray { index: 0, end_index: 0, array: self.clone(), - lock: self.lock.clone(), + // lock: self.lock.clone(), lock_guard: lock, } } @@ -281,7 +283,7 @@ impl LocalLockArray { index: 0, end_index: 0, array: self.clone(), - lock: self.lock.clone(), + // 
lock: self.lock.clone(), lock_guard: lock, } } @@ -311,7 +313,7 @@ impl LocalLockArray { index: 0, end_index: 0, array: self.clone(), - lock: self.lock.clone(), + // lock: self.lock.clone(), lock_guard: lock, } } @@ -352,7 +354,7 @@ impl LocalLockArray { index: 0, end_index: 0, array: self.clone(), - lock: self.lock.clone(), + // lock: self.lock.clone(), lock_guard: lock, } } diff --git a/src/darc.rs b/src/darc.rs index 740049f9..975decb7 100644 --- a/src/darc.rs +++ b/src/darc.rs @@ -78,6 +78,8 @@ pub(crate) mod global_rw_darc; use global_rw_darc::DistRwLock; pub use global_rw_darc::GlobalRwDarc; +pub(crate) mod handle; + static DARC_ID: AtomicUsize = AtomicUsize::new(0); #[repr(u8)] diff --git a/src/darc/global_rw_darc.rs b/src/darc/global_rw_darc.rs index 3f64dacb..45183b0a 100644 --- a/src/darc/global_rw_darc.rs +++ b/src/darc/global_rw_darc.rs @@ -16,6 +16,10 @@ use crate::lamellar_request::LamellarRequest; use crate::lamellar_team::{IntoLamellarTeam, LamellarTeamRT}; use crate::{IdError, LamellarEnv, LamellarTeam}; +use super::handle::{ + GlobalRwDarcCollectiveWriteHandle, GlobalRwDarcReadHandle, GlobalRwDarcWriteHandle, +}; + #[derive(serde::Serialize, serde::Deserialize, Debug)] enum LockType { Read, @@ -255,15 +259,15 @@ impl LamellarAM for UnlockAm { } pub struct GlobalRwDarcReadGuard { - rwlock: Darc>, - marker: PhantomData<&'static mut T>, - local_cnt: Arc, //this allows us to immediately clone the read guard without launching an AM, and will prevent dropping the global guard until local copies are gone + pub(crate) darc: GlobalRwDarc, + pub(crate) marker: PhantomData<&'static mut T>, + pub(crate) local_cnt: Arc, //this allows us to immediately clone the read guard without launching an AM, and will prevent dropping the global guard until local copies are gone } impl Deref for GlobalRwDarcReadGuard { type Target = T; fn deref(&self) -> &T { - unsafe { &*self.rwlock.data.get() } + unsafe { &*self.darc.darc.data.get() } } } @@ -271,7 +275,7 @@ impl Clone for GlobalRwDarcReadGuard { fn clone(&self) -> Self { self.local_cnt.fetch_add(1, Ordering::SeqCst); GlobalRwDarcReadGuard { - rwlock: self.rwlock.clone(), + darc: self.darc.clone(), marker: PhantomData, local_cnt: self.local_cnt.clone(), } @@ -282,7 +286,7 @@ impl Drop for GlobalRwDarcReadGuard { fn drop(&mut self) { // println!("dropping global rwdarc read guard"); if self.local_cnt.fetch_sub(1, Ordering::SeqCst) == 1 { - let inner = self.rwlock.inner(); + let inner = self.darc.inner(); let team = inner.team(); let remote_rwlock_addr = team.lamellae.remote_addr( 0, @@ -306,34 +310,34 @@ impl Drop for GlobalRwDarcReadGuard { //TODO update this so that we print locked if data is locked... 
impl fmt::Debug for GlobalRwDarcReadGuard { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - unsafe { fmt::Debug::fmt(&self.rwlock.data.get().as_ref(), f) } + unsafe { fmt::Debug::fmt(&self.darc.darc.data.get().as_ref(), f) } } } pub struct GlobalRwDarcWriteGuard { - rwlock: Darc>, - marker: PhantomData<&'static mut T>, + pub(crate) darc: GlobalRwDarc, + pub(crate) marker: PhantomData<&'static mut T>, } impl Deref for GlobalRwDarcWriteGuard { type Target = T; #[inline] fn deref(&self) -> &T { - unsafe { &*self.rwlock.data.get() } + unsafe { &*self.darc.darc.data.get() } } } impl DerefMut for GlobalRwDarcWriteGuard { #[inline] fn deref_mut(&mut self) -> &mut T { - unsafe { &mut *self.rwlock.data.get() } + unsafe { &mut *self.darc.darc.data.get() } } } impl Drop for GlobalRwDarcWriteGuard { fn drop(&mut self) { // println!("dropping write guard"); - let inner = self.rwlock.inner(); + let inner = self.darc.inner(); let team = inner.team(); let remote_rwlock_addr = team.lamellae.remote_addr( 0, @@ -355,35 +359,35 @@ impl Drop for GlobalRwDarcWriteGuard { impl fmt::Debug for GlobalRwDarcWriteGuard { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - unsafe { fmt::Debug::fmt(&self.rwlock.data.get().as_ref(), f) } + unsafe { fmt::Debug::fmt(&self.darc.darc.data.get().as_ref(), f) } } } pub struct GlobalRwDarcCollectiveWriteGuard { - rwlock: Darc>, - collective_cnt: usize, - marker: PhantomData<&'static mut T>, + pub(crate) darc: GlobalRwDarc, + pub(crate) collective_cnt: usize, + pub(crate) marker: PhantomData<&'static mut T>, } impl Deref for GlobalRwDarcCollectiveWriteGuard { type Target = T; #[inline] fn deref(&self) -> &T { - unsafe { &*self.rwlock.data.get() } + unsafe { &*self.darc.darc.data.get() } } } impl DerefMut for GlobalRwDarcCollectiveWriteGuard { #[inline] fn deref_mut(&mut self) -> &mut T { - unsafe { &mut *self.rwlock.data.get() } + unsafe { &mut *self.darc.darc.data.get() } } } impl Drop for GlobalRwDarcCollectiveWriteGuard { fn drop(&mut self) { // println!("dropping collective write guard"); - let inner = self.rwlock.inner(); + let inner = self.darc.inner(); let team = inner.team(); let remote_rwlock_addr = team.lamellae.remote_addr( 0, @@ -405,7 +409,7 @@ impl Drop for GlobalRwDarcCollectiveWriteGuard { impl fmt::Debug for GlobalRwDarcCollectiveWriteGuard { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - unsafe { fmt::Debug::fmt(&self.rwlock.data.get().as_ref(), f) } + unsafe { fmt::Debug::fmt(&self.darc.darc.data.get().as_ref(), f) } } } @@ -519,13 +523,11 @@ impl GlobalRwDarc { } #[doc(alias("One-sided", "onesided"))] - /// Launches an active message to gather a global read lock associated with this GlobalRwDarc. - /// - /// The current task will be blocked until the lock has been acquired. - /// - /// This function will not return while any writer currently has access to the lock, but there may be other readers + /// Launches an active message to gather a global read lock associated with this GlobalRwDarc returning a handle representing this operation. 
+ /// The returned handle must either be await'd `.read().await` within an async context + /// or it must be blocked on `.read().block()` in a non async context to actually acquire the lock /// - /// Returns an RAII guard which will drop the read access of the wrlock when dropped + /// After awaiting or blocking on the handle, a RAII guard is returned which will drop the read access of the wrlock when dropped /// /// # One-sided Operation /// The calling PE is responsible for creating and transfering the active message which aquires the lock. @@ -554,18 +556,16 @@ impl GlobalRwDarc { /// /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); + /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); + /// let _ = world.exec_am_all(DarcAm {counter: counter.clone()}).spawn(); + /// let guard = counter.read().block(); + /// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); + /// drop(guard); //release the lock + /// world.wait_all(); // wait for my active message to return + /// world.barrier(); //at this point all updates will have been performed /// - /// world.clone().block_on(async move { - /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); - /// let _ = world.exec_am_all(DarcAm {counter: counter.clone()}).spawn(); - /// let guard = counter.read().await; - /// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); - /// drop(guard); //release the - /// world.wait_all(); // wait for my active message to return - /// world.barrier(); //at this point all updates will have been performed - /// }); ///``` - pub async fn read(&self) -> GlobalRwDarcReadGuard { + pub fn read(&self) -> GlobalRwDarcReadHandle { // println!("async read"); let inner = self.inner(); let team = inner.team(); @@ -573,7 +573,7 @@ impl GlobalRwDarc { 0, inner as *const DarcInner> as *const () as usize, ); - team.exec_am_pe_tg( + let am = team.exec_am_pe_tg( 0, LockAm { rwlock_addr: remote_rwlock_addr, @@ -581,25 +581,19 @@ impl GlobalRwDarc { lock_type: LockType::Read, }, Some(inner.am_counters()), - ) - .await; - // println!("TID: {:?} async got read lock", std::thread::current().id()); - GlobalRwDarcReadGuard { - rwlock: self.darc.clone(), - marker: PhantomData, - local_cnt: Arc::new(AtomicUsize::new(1)), + ); + GlobalRwDarcReadHandle { + darc: self.clone(), + lock_am: am, } - // inner.item().read(remote_rwlock_addr) } #[doc(alias("One-sided", "onesided"))] - /// Launches an active message to gather the global write lock associated with this GlobalRwDarc. - /// - /// The current task will be blocked until the lock has been acquired. - /// - /// This function will not return while another writer or any readers currently have access to the lock + /// Launches an active message to gather a global write lock associated with this GlobalRwDarc returning a handle representing this operation. + /// The returned handle must either be await'd `.write().await` within an async context + /// or it must be blocked on `.write().block()` in a non async context to actually acquire the lock /// - /// Returns an RAII guard which will drop the write access of the wrlock when dropped + /// After awaiting or blocking on the handle, a RAII guard is returned which will drop the write access of the wrlock when dropped /// /// # One-sided Operation /// The calling PE is responsible for creating and transfering the active message which aquires the lock. 
@@ -628,17 +622,15 @@ impl GlobalRwDarc { /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// - /// world.clone().block_on(async move { - /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); - /// let _ = world.exec_am_all(DarcAm {counter: counter.clone()}).spawn(); - /// let mut guard = counter.write().await; - /// *guard += my_pe; - /// drop(guard); //release the - /// world.await_all().await; // wait for my active message to return - /// world.async_barrier().await; //at this point all updates will have been performed - /// }); + /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); + /// let _ = world.exec_am_all(DarcAm {counter: counter.clone()}).spawn(); + /// let mut guard = counter.write().block(); //block until we get the write lock + /// *guard += my_pe; + /// drop(guard); //release the + /// world.wait_all(); // wait for my active message to return + /// world.barrier(); //at this point all updates will have been performed ///``` - pub async fn write(&self) -> GlobalRwDarcWriteGuard { + pub fn write(&self) -> GlobalRwDarcWriteHandle { // println!("async write"); let inner = self.inner(); let team = inner.team(); @@ -647,7 +639,7 @@ impl GlobalRwDarc { inner as *const DarcInner> as *const () as usize, ); - team.exec_am_pe_tg( + let am = team.exec_am_pe_tg( 0, LockAm { rwlock_addr: remote_rwlock_addr, @@ -655,221 +647,19 @@ impl GlobalRwDarc { lock_type: LockType::Write, }, Some(inner.am_counters()), - ) - .await; - GlobalRwDarcWriteGuard { - rwlock: self.darc.clone(), - marker: PhantomData, - } - } - - #[doc(alias("Collective"))] - /// Launches an active message to gather the global collective write lock associated with this GlobalRwDarc. - /// - /// The current task will be blocked until the lock has been acquired. - /// - /// This function will not return while another writer or any readers currently have access to the lock - /// - /// Returns an RAII guard which will drop the write access of the wrlock when dropped - /// - /// # Collective Operation - /// All PEs associated with this GlobalRwDarc must enter the lock call otherwise deadlock may occur. 
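[Editor's note, not part of the patch] Inside an active message the same handles are awaited rather than blocked on, mirroring the `DarcAm` examples elsewhere in this patch; blocking a worker thread on the lock is exactly what the handle API is meant to avoid. The struct name, field, and counter type below are illustrative.

```
use lamellar::active_messaging::prelude::*;
use lamellar::darc::prelude::*;

#[lamellar::AmData(Clone)]
struct CounterAm {
    counter: GlobalRwDarc<usize>,
}

#[lamellar::am]
impl LamellarAm for CounterAm {
    async fn exec(self) {
        // await the write handle; the guard is dropped at the end of exec
        let mut counter = self.counter.write().await;
        *counter += 1; // only the data local to this PE is modified
    }
}

fn main() {
    let world = LamellarWorldBuilder::new().build();
    let counter = GlobalRwDarc::new(&world, 0usize).unwrap();
    let _ = world.exec_am_all(CounterAm { counter: counter.clone() }).spawn();
    world.wait_all(); // wait for my active messages to return
    world.barrier();  // all PEs have applied their updates
    println!("counter on PE{} = {}", world.my_pe(), *counter.read().block());
}
```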
- /// - /// # Examples - /// - ///``` - /// use lamellar::darc::prelude::*; - /// use lamellar::active_messaging::*; - /// - /// #[lamellar::AmData(Clone)] - /// struct DarcAm { - /// counter: GlobalRwDarc, //each pe has a local atomicusize - /// } - /// - /// #[lamellar::am] - /// impl LamellarAm for DarcAm { - /// async fn exec(self) { - /// let mut counter = self.counter.write().await; // await until we get the write lock - /// *counter += 1; // although we have the global lock, we are still only modifying the data local to this PE - /// } - /// } - /// //------------- - /// - /// let world = LamellarWorldBuilder::new().build(); - /// let my_pe = world.my_pe(); - /// - /// world.clone().block_on(async move { - /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); - /// let _ = world.exec_am_all(DarcAm {counter: counter.clone()}).spawn(); - /// let mut guard = counter.collective_write().await; - /// *guard += my_pe; - /// drop(guard); //release the lock - /// world.wait_all(); // wait for my active message to return - /// world.barrier(); //at this point all updates will have been performed - /// }); - ///``` - pub async fn collective_write(&self) -> GlobalRwDarcCollectiveWriteGuard { - // println!("async write"); - let inner = self.inner(); - let team = inner.team(); - let remote_rwlock_addr = team.lamellae.remote_addr( - 0, - inner as *const DarcInner> as *const () as usize, - ); - let collective_cnt = inner.item().collective_cnt.fetch_add(1, Ordering::SeqCst); - team.exec_am_pe_tg( - 0, - LockAm { - rwlock_addr: remote_rwlock_addr, - orig_pe: team.team_pe.expect("darcs cant exist on non team members"), - lock_type: LockType::CollectiveWrite(collective_cnt), - }, - Some(inner.am_counters()), - ) - .await; - GlobalRwDarcCollectiveWriteGuard { - rwlock: self.darc.clone(), - collective_cnt: collective_cnt, - marker: PhantomData, - } - } - - #[doc(alias("One-sided", "onesided"))] - /// Launches an active message to gather a global read lock associated with this GlobalRwDarc. - /// - /// The current THREAD will be blocked until the lock has been acquired. - /// - /// This function will not return while any writer currently has access to the lock, but there may be other readers - /// - /// Returns ared this specific instance of the read lock will only be held by the calling PE (until it is dropped) - /// Other PEs may have separately aquired read locks as well. - /// - /// - /// # Noten RAII guard which will drop the read access of the wrlock when dropped - /// - /// # One-sided Operation - /// The calling PE is responsible for creating and transfering the active message which aquires the lock. - /// Once aqui - /// Do not use this function in an asynchronous context (i.e. a Lamellar Active message), instead use [GlobalRwDarc::read] - /// - /// # Examples - ///``` - /// use lamellar::darc::prelude::*; - /// - /// let world = LamellarWorldBuilder::new().build(); - /// let my_pe = world.my_pe(); - /// - /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); - /// // do interesting work - /// let guard = counter.blocking_read(); //blocks current thread until aquired - /// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); - ///``` - pub fn blocking_read(&self) -> GlobalRwDarcReadGuard { - if std::thread::current().id() != *crate::MAIN_THREAD { - let msg = format!(" - [LAMELLAR WARNING] You are calling `GlobalRwDarc::blocking_read` from within an async context which may lead to deadlock, it is recommended that you use `read().await;` instead! 
- Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() - ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), - } - } - // println!("read"); - - let inner = self.inner(); - let team = inner.team(); - let remote_rwlock_addr = team.lamellae.remote_addr( - 0, - inner as *const DarcInner> as *const () as usize, - ); - team.exec_am_pe_tg( - 0, - LockAm { - rwlock_addr: remote_rwlock_addr, - orig_pe: team.team_pe.expect("darcs cant exist on non team members"), - lock_type: LockType::Read, - }, - Some(inner.am_counters()), - ) - .blocking_wait(); - GlobalRwDarcReadGuard { - rwlock: self.darc.clone(), - marker: PhantomData, - local_cnt: Arc::new(AtomicUsize::new(1)), - } - } - - #[doc(alias("One-sided", "onesided"))] - /// Launches an active message to gather a global write lock associated with this GlobalRwDarc. - /// - /// The current THREAD will be blocked until the lock has been acquired. - /// - /// This function will not return while another writer or any readers currently have access to the lock - /// - /// Returns an RAII guard which will drop the write access of the wrlock when dropped - /// - /// # One-sided Operation - /// The calling PE is responsible for creating and transfering the active message which aquires the lock. - /// Once aquired the lock will only be held by the calling PE (until it is dropped) - /// - /// # Note - /// Do not use this function in an asynchronous context (i.e. a Lamellar Active message), instead use [GlobalRwDarc::write] - /// - /// # Examples - ///``` - /// use lamellar::darc::prelude::*; - /// - /// let world = LamellarWorldBuilder::new().build(); - /// let my_pe = world.my_pe(); - /// - /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); - /// // do interesting work - /// let mut guard = counter.blocking_write(); //blocks current thread until aquired - /// *guard += my_pe; - ///``` - pub fn blocking_write(&self) -> GlobalRwDarcWriteGuard { - if std::thread::current().id() != *crate::MAIN_THREAD { - let msg = format!(" - [LAMELLAR WARNING] You are calling `GlobalRwDarc::blocking_write` from within an async context which may lead to deadlock, it is recommended that you use `write().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() - ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), - } - } - let inner = self.inner(); - let team = inner.team(); - let remote_rwlock_addr = team.lamellae.remote_addr( - 0, - inner as *const DarcInner> as *const () as usize, ); - team.exec_am_pe_tg( - 0, - LockAm { - rwlock_addr: remote_rwlock_addr, - orig_pe: team.team_pe.expect("darcs cant exist on non team members"), - lock_type: LockType::Write, - }, - Some(inner.am_counters()), - ) - .blocking_wait(); - GlobalRwDarcWriteGuard { - rwlock: self.darc.clone(), - marker: PhantomData, + GlobalRwDarcWriteHandle { + darc: self.clone(), + lock_am: am, } - // inner.item().write(remote_rwlock_addr) } #[doc(alias("Collective"))] - /// Launches an active message to gather the global collective write lock associated with this GlobalRwDarc. - /// - /// The current task will be blocked until the lock has been acquired. 
- /// - /// This function will not return while another writer or any readers currently have access to the lock + /// Launches an active message to gather a global collective write lock associated with this GlobalRwDarc returning a handle representing this operation. + /// The returned handle must either be await'd `.collective_write().await` within an async context + /// or it must be blocked on `.collective_write().block()` in a non async context to actually acquire the lock /// - /// Returns an RAII guard which will drop the write access of the wrlock when dropped + /// After awaiting or blocking on the handle, a RAII guard is returned which will drop the write access of the wrlock when dropped /// /// # Collective Operation /// All PEs associated with this GlobalRwDarc must enter the lock call otherwise deadlock may occur. @@ -878,44 +668,16 @@ impl GlobalRwDarc { /// ///``` /// use lamellar::darc::prelude::*; - /// use lamellar::active_messaging::*; - /// - /// #[lamellar::AmData(Clone)] - /// struct DarcAm { - /// counter: GlobalRwDarc, //each pe has a local atomicusize - /// } - /// - /// #[lamellar::am] - /// impl LamellarAm for DarcAm { - /// async fn exec(self) { - /// let mut counter = self.counter.write().await; // await until we get the write lock - /// *counter += 1; // although we have the global lock, we are still only modifying the data local to this PE - /// } - /// } - /// //------------- /// /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); - /// let _ = world.exec_am_all(DarcAm {counter: counter.clone()}).spawn(); - /// let mut guard = counter.blocking_collective_write(); + /// let mut guard = counter.collective_write().block(); // this will block until all PEs have acquired the lock /// *guard += my_pe; - /// drop(guard); //release the lock - /// world.wait_all(); // wait for my active message to return - /// world.barrier(); //at this point all updates will have been performed ///``` - pub fn blocking_collective_write(&self) -> GlobalRwDarcCollectiveWriteGuard { - if std::thread::current().id() != *crate::MAIN_THREAD { - let msg = format!(" - [LAMELLAR WARNING] You are calling `GlobalRwDarc::blocking_collective_write` from within an async context which may lead to deadlock, it is recommended that you use `collective_write().await;` instead! 
- Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() - ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), - } - } + pub fn collective_write(&self) -> GlobalRwDarcCollectiveWriteHandle { + // println!("async write"); let inner = self.inner(); let team = inner.team(); let remote_rwlock_addr = team.lamellae.remote_addr( @@ -923,7 +685,7 @@ impl GlobalRwDarc { inner as *const DarcInner> as *const () as usize, ); let collective_cnt = inner.item().collective_cnt.fetch_add(1, Ordering::SeqCst); - team.exec_am_pe_tg( + let am = team.exec_am_pe_tg( 0, LockAm { rwlock_addr: remote_rwlock_addr, @@ -931,12 +693,11 @@ impl GlobalRwDarc { lock_type: LockType::CollectiveWrite(collective_cnt), }, Some(inner.am_counters()), - ) - .blocking_wait(); - GlobalRwDarcCollectiveWriteGuard { - rwlock: self.darc.clone(), - collective_cnt: collective_cnt, - marker: PhantomData, + ); + GlobalRwDarcCollectiveWriteHandle { + darc: self.clone(), + collective_cnt, + lock_am: am, } } } diff --git a/src/darc/local_rw_darc.rs b/src/darc/local_rw_darc.rs index 78ca4da1..c4c2dbbe 100644 --- a/src/darc/local_rw_darc.rs +++ b/src/darc/local_rw_darc.rs @@ -1,3 +1,4 @@ +use async_lock::futures::ReadArc; // use parking_lot::{ // lock_api::{ArcRwLockReadGuard, RwLockWriteGuardArc}, // RawRwLock, RwLock, @@ -5,9 +6,17 @@ use async_lock::{RwLock, RwLockReadGuardArc, RwLockWriteGuardArc}; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use std::fmt; +use std::marker::PhantomData; use std::ptr::NonNull; use std::sync::atomic::Ordering; use std::sync::Arc; +use std::{ + future::Future, + pin::Pin, + task::{Context, Poll, Waker}, +}; + +use pin_project::pin_project; use crate::active_messaging::RemotePtr; use crate::config; @@ -15,8 +24,75 @@ use crate::darc::global_rw_darc::{DistRwLock, GlobalRwDarc}; use crate::darc::{Darc, DarcInner, DarcMode, WrappedInner, __NetworkDarc}; use crate::lamellae::LamellaeRDMA; use crate::lamellar_team::IntoLamellarTeam; +use crate::scheduler::LamellarTask; use crate::{IdError, LamellarEnv, LamellarTeam}; +use super::handle::{LocalRwDarcReadHandle, LocalRwDarcWriteHandle}; + +#[derive(Debug)] +pub struct LocalRwDarcReadGuard { + pub(crate) darc: LocalRwDarc, + pub(crate) lock: RwLockReadGuardArc, +} + +impl fmt::Display for LocalRwDarcReadGuard { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Display::fmt(&self.lock, f) + } +} + +impl std::ops::Deref for LocalRwDarcReadGuard { + type Target = T; + fn deref(&self) -> &T { + &self.lock + } +} + +// impl RwDarcGuard> for LocalRwDarcReadGuard { +// type Guard = RwLockReadGuardArc; +// fn new(darc: LocalRwDarc, lock_guard: Self::Guard) -> Self { +// LocalRwDarcReadGuard { +// darc, +// lock: lock_guard, +// } +// } +// } + +#[derive(Debug)] +pub struct LocalRwDarcWriteGuard { + pub(crate) darc: LocalRwDarc, + pub(crate) lock: RwLockWriteGuardArc, +} + +impl fmt::Display for LocalRwDarcWriteGuard { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Display::fmt(&self.lock, f) + } +} + +impl std::ops::Deref for LocalRwDarcWriteGuard { + type Target = T; + fn deref(&self) -> &T { + &self.lock + } +} + +impl std::ops::DerefMut for LocalRwDarcWriteGuard { + fn deref_mut(&mut self) -> &mut T { + &mut self.lock + } +} + +// impl RwDarcGuard> for LocalRwDarcWriteGuard { +// type Guard = RwLockWriteGuardArc; +// fn new(darc: LocalRwDarc, 
lock_guard: Self::Guard) -> Self { +// LocalRwDarcWriteGuard { +// darc, +// lock: lock_guard, +// } +// } +// } + /// A local read-write `Darc` /// /// Each PE maintains its own local read-write lock associated with the `LocalRwDarc`. @@ -127,13 +203,11 @@ impl LocalRwDarc { impl LocalRwDarc { #[doc(alias("One-sided", "onesided"))] - /// Aquires a reader lock of this LocalRwDarc local to this PE. - /// - /// The current THREAD will be blocked until the lock has been acquired. - /// - /// This function will not return while any writer currentl has access to the lock - /// - /// Returns an RAII guard which will drop the read access of the wrlock when dropped + /// Creates a handle for aquiring a reader lock of this LocalRwDarc local to this PE. + /// The returned handle must either be await'd `.read().await` within an async context + /// or it must be blocked on `.read().block()` in a non async context to actually acquire the lock + /// + /// After awaiting or blocking on the handle, a RAII guard is returned which will drop the read access of the wrlock when dropped /// /// # One-sided Operation /// The calling PE is only aware of its own local lock and does not require coordination with other PEs @@ -163,83 +237,19 @@ impl LocalRwDarc { /// let my_pe = world.my_pe(); /// let counter = LocalRwDarc::new(&world, 0).unwrap(); /// let _ = world.exec_am_all(DarcAm {counter: counter.clone()}).spawn(); - /// let guard = counter.blocking_read(); + /// let guard = counter.read().block(); //we can also explicitly block on the lock in a non async context /// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); ///``` - pub fn blocking_read(&self) -> RwLockReadGuardArc { - if std::thread::current().id() != *crate::MAIN_THREAD { - let msg = format!(" - [LAMELLAR WARNING] You are calling `LocalRwDarc::blocking_read` from within an async context which may lead to deadlock, it is recommended that you use `read().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() - ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), - } - } - let self_clone: LocalRwDarc = self.clone(); - self.darc - .team() - .block_on(async move { self_clone.darc.read_arc().await }) - } - - #[doc(alias("One-sided", "onesided"))] - /// Aquires a reader lock of this LocalRwDarc local to this PE. - /// - /// The current THREAD will be blocked until the lock has been acquired. 
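[Editor's note, not part of the patch] The `LocalRwDarc` lock is purely local to the calling PE, but after this patch it follows the same handle pattern, and the new `LocalRwDarcReadGuard`/`LocalRwDarcWriteGuard` wrappers deref to the inner value just like the old `RwLock*GuardArc` returns did. A small sketch, with illustrative types and values:

```
use lamellar::darc::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();
    let my_pe = world.my_pe();
    let counter: LocalRwDarc<usize> = LocalRwDarc::new(&world, 0).unwrap();

    // Non async context: block on the write handle, then mutate through the guard.
    let mut guard = counter.write().block();
    *guard += my_pe;
    drop(guard); // release this PE's local write lock

    // Async context: await the read handle; other PEs' locks are unaffected.
    world.clone().block_on(async move {
        let guard = counter.read().await;
        println!("PE{my_pe} local counter = {}", *guard);
    });
}
```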
- /// - /// This function will not return while any writer currentl has access to the lock - /// - /// Returns an RAII guard which will drop the read access of the wrlock when dropped - /// - /// # One-sided Operation - /// The calling PE is only aware of its own local lock and does not require coordination with other PEs - /// - /// # Note - /// the aquired lock is only with respect to this PE, the locks on the other PEs will be in their own states - /// - /// # Examples - /// - ///``` - /// use lamellar::darc::prelude::*; - /// use lamellar::active_messaging::prelude::*; - /// #[lamellar::AmData(Clone)] - /// struct DarcAm { - /// counter: LocalRwDarc, //each pe has a local atomicusize - /// } - /// - /// #[lamellar::am] - /// impl LamellarAm for DarcAm { - /// async fn exec(self) { - /// let counter = self.counter.read().await; //block until we get the write lock - /// println!("the current counter value on pe {} = {}",lamellar::current_pe,counter); - /// } - /// } - /// //------------- - /// let world = LamellarWorldBuilder::new().build(); - /// let my_pe = world.my_pe(); - /// world.clone().block_on(async move { - /// let counter = LocalRwDarc::new(&world, 0).unwrap(); - /// let _ = world.exec_am_all(DarcAm {counter: counter.clone()}).spawn(); - /// let guard = counter.read().await; - /// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); - /// }); - ///``` - pub async fn read(&self) -> RwLockReadGuardArc { - // println!("async trying to get read lock"); - let lock = self.darc.read_arc().await; - // println!("got async read lock"); - lock + pub fn read(&self) -> LocalRwDarcReadHandle { + LocalRwDarcReadHandle::new(self.clone()) } #[doc(alias("One-sided", "onesided"))] - /// Aquires the writer lock of this LocalRwDarc local to this PE. - /// - /// The current THREAD will be blocked until the lock has been acquired. - /// - /// This function will not return while another writer or any readers currently have access to the lock - /// - /// Returns an RAII guard which will drop the write access of the wrlock when dropped + /// Creates a handle for aquiring a writer lock of this LocalRwDarc local to this PE. + /// The returned handle must either be await'd `.write().await` within an async context + /// or it must be blocked on `.write().block()` in a non async context to actually acquire the lock + /// + /// After awaiting or blocking on the handle, a RAII guard is returned which will drop the write access of the wrlock when dropped /// /// # One-sided Operation /// The calling PE is only aware of its own local lock and does not require coordination with other PEs @@ -269,75 +279,12 @@ impl LocalRwDarc { /// let my_pe = world.my_pe(); /// let counter = LocalRwDarc::new(&world, 0).unwrap(); /// let _ = world.exec_am_all(DarcAm {counter: counter.clone()}).spawn(); - /// let mut guard = counter.blocking_write(); + /// let mut guard = counter.write().block(); //we can also explicitly block on the lock in a non async context /// *guard += my_pe; + /// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); ///``` - pub fn blocking_write(&self) -> RwLockWriteGuardArc { - if std::thread::current().id() != *crate::MAIN_THREAD { - let msg = format!(" - [LAMELLAR WARNING] You are calling `LocalRwDarc::blocking_write` from within an async context which may lead to deadlock, it is recommended that you use `write().await;` instead! 
- Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() - ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), - } - } - // println!("trying to get write lock"); - let self_clone: LocalRwDarc = self.clone(); - self.darc - .team() - .block_on(async move { self_clone.darc.write_arc().await }) - } - - #[doc(alias("One-sided", "onesided"))] - /// - /// Aquires the writer lock of this LocalRwDarc local to this PE. - /// - /// The current THREAD will be blocked until the lock has been acquired. - /// - /// This function will not return while another writer or any readers currently have access to the lock - /// - /// Returns an RAII guard which will drop the write access of the wrlock when dropped - /// - /// # One-sided Operation - /// The calling PE is only aware of its own local lock and does not require coordination with other PEs - /// - /// # Note - /// the aquired lock is only with respect to this PE, the locks on the other PEs will be in their own states - /// - /// # Examples - /// - ///``` - /// use lamellar::darc::prelude::*; - /// use lamellar::active_messaging::prelude::*; - /// #[lamellar::AmData(Clone)] - /// struct DarcAm { - /// counter: LocalRwDarc, //each pe has a local atomicusize - /// } - /// - /// #[lamellar::am] - /// impl LamellarAm for DarcAm { - /// async fn exec(self) { - /// let mut counter = self.counter.write().await; //block until we get the write lock - /// *counter += 1; - /// } - /// } - /// //------------- - /// let world = LamellarWorldBuilder::new().build(); - /// let my_pe = world.my_pe(); - /// world.clone().block_on(async move{ - /// let counter = LocalRwDarc::new(&world, 0).unwrap(); - /// let _ = world.exec_am_all(DarcAm {counter: counter.clone()}).spawn(); - /// let mut guard = counter.write().await; - /// *guard += my_pe; - /// }) - ///``` - pub async fn write(&self) -> RwLockWriteGuardArc { - // println!("async trying to get write lock"); - let lock = self.darc.write_arc().await; - // println!("got async write lock"); - lock + pub fn write(&self) -> LocalRwDarcWriteHandle { + LocalRwDarcWriteHandle::new(self.clone()) } } @@ -626,14 +573,7 @@ impl Clone for LocalRwDarc { impl fmt::Display for LocalRwDarc { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let lock: LocalRwDarc = self.clone(); - fmt::Display::fmt( - &self - .darc - .team() - .scheduler - .block_on(async move { lock.read().await }), - f, - ) + fmt::Display::fmt(&lock.read().block(), f) } } diff --git a/src/lamellae/rofi/rofi_comm.rs b/src/lamellae/rofi/rofi_comm.rs index b6025ebc..abee9744 100644 --- a/src/lamellae/rofi/rofi_comm.rs +++ b/src/lamellae/rofi/rofi_comm.rs @@ -53,10 +53,6 @@ impl RofiComm { //#[tracing::instrument(skip_all)] pub(crate) fn new(provider: &str, domain: &str) -> RofiComm { if let Some(size) = config().heap_size { - // if let Ok(size) = std::env::var("LAMELLAR_MEM_SIZE") { - // let size = size - // .parse::() - // .expect("invalid memory size, please supply size in bytes"); ROFI_MEM.store(size, Ordering::SeqCst); } rofi_init(provider, domain).expect("error in rofi init"); @@ -641,7 +637,7 @@ impl Drop for RofiComm { println!("dropping rofi -- memory in use {:?}", self.occupied()); } if self.alloc.read().len() > 1 { - println!("[LAMELLAR INFO] {:?} additional rt memory pools were allocated, performance may be increased using a larger initial pool, set using the LAMELLAR_MEM_SIZE 
envrionment variable. Current initial size = {:?}",self.alloc.read().len()-1, ROFI_MEM.load(Ordering::SeqCst)); + println!("[LAMELLAR INFO] {:?} additional rt memory pools were allocated, performance may be increased using a larger initial pool, set using the LAMELLAR_HEAP_SIZE envrionment variable. Current initial size = {:?}",self.alloc.read().len()-1, ROFI_MEM.load(Ordering::SeqCst)); } // let _lock = self.comm_mutex.write(); rofi_barrier(); diff --git a/src/lamellae/shmem/shmem_comm.rs b/src/lamellae/shmem/shmem_comm.rs index 2b470238..f2c99d6a 100644 --- a/src/lamellae/shmem/shmem_comm.rs +++ b/src/lamellae/shmem/shmem_comm.rs @@ -313,10 +313,6 @@ impl ShmemComm { Err(_e) => 0, }; if let Some(size) = config().heap_size { - //std::env::var("LAMELLAR_MEM_SIZE") { - // let size = size - // .parse::() - // .expect("invalid memory size, please supply size in bytes"); SHMEM_SIZE.store(size, Ordering::SeqCst); } @@ -431,7 +427,7 @@ impl CommOps for ShmemComm { // if let Some(addr) = self.alloc.try_malloc(size) { // Some(addr) // } else { - // println!("[WARNING] out of memory: (work in progress on a scalable solution, as a work around try setting the LAMELLAR_MEM_SIZE envrionment variable (current size = {:?} -- Note: LamellarLocalArrays are currently allocated out of this pool",SHMEM_SIZE.load(Ordering::SeqCst)); + // println!("[WARNING] out of memory: (work in progress on a scalable solution, as a work around try setting the LAMELLAR_HEAP_SIZE envrionment variable (current size = {:?} -- Note: LamellarLocalArrays are currently allocated out of this pool",SHMEM_SIZE.load(Ordering::SeqCst)); // None // } } @@ -593,7 +589,7 @@ impl Drop for ShmemComm { println!("dropping rofi -- memory in use {:?}", self.occupied()); } if self.alloc.read().len() > 1 { - println!("[LAMELLAR INFO] {:?} additional rt memory pools were allocated, performance may be increased using a larger initial pool, set using the LAMELLAR_MEM_SIZE envrionment variable. Current initial size = {:?}",self.alloc.read().len()-1, SHMEM_SIZE.load(Ordering::SeqCst)); + println!("[LAMELLAR INFO] {:?} additional rt memory pools were allocated, performance may be increased using a larger initial pool, set using the LAMELLAR_HEAP_SIZE envrionment variable. Current initial size = {:?}",self.alloc.read().len()-1, SHMEM_SIZE.load(Ordering::SeqCst)); } } } From 13c68332c968c65b54cfab02750ae1202e2150e0 Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Mon, 14 Oct 2024 11:25:01 -0700 Subject: [PATCH 096/116] refactor lock based arrays to utilize handles with a block method, rather than 'blocking_*' apis --- src/array.rs | 6 +- src/array/global_lock_atomic.rs | 321 +++------------ src/array/global_lock_atomic/handle.rs | 404 +++++++++++++++++++ src/array/local_lock_atomic.rs | 221 ++-------- src/array/local_lock_atomic/handle.rs | 322 +++++++++++++++ src/darc/handle.rs | 531 +++++++++++++++++++++++++ src/scheduler.rs | 20 +- src/scheduler/async_std_executor.rs | 7 +- src/scheduler/tokio_executor.rs | 5 +- src/scheduler/work_stealing.rs | 7 +- src/scheduler/work_stealing2.rs | 7 +- src/scheduler/work_stealing3.rs | 7 +- 12 files changed, 1390 insertions(+), 468 deletions(-) create mode 100644 src/array/global_lock_atomic/handle.rs create mode 100644 src/array/local_lock_atomic/handle.rs create mode 100644 src/darc/handle.rs diff --git a/src/array.rs b/src/array.rs index e5bf53c3..b4f68a68 100644 --- a/src/array.rs +++ b/src/array.rs @@ -1456,7 +1456,7 @@ pub trait LamellarArrayGet: LamellarArrayInternalGet { /// let array = LocalLockArray::::new(&world,12,Distribution::Block); /// let _ = array.dist_iter_mut().enumerate().for_each(move |(i,elem)| *elem = my_pe).block(); //we will used this val as completion detection /// array.barrier(); - /// println!("PE{my_pe} array data: {:?}",array.blocking_read_local_data()); + /// println!("PE{my_pe} array data: {:?}",array.read_local_data().block()); /// let index = ((my_pe+1)%num_pes) * array.num_elems_local(); // get first index on PE to the right (with wrap arround) /// let at_req = array.at(index); /// let val = array.block_on(at_req); @@ -1545,14 +1545,14 @@ pub trait LamellarArrayPut: LamellarArrayInternalPut { /// } /// array.wait_all(); /// array.barrier(); - /// println!("PE{my_pe} array data: {:?}",array.blocking_read_local_data()); + /// println!("PE{my_pe} array data: {:?}",array.read_local_data().block()); /// if my_pe == 0 { //only perfrom the transfer from one PE /// array.block_on( unsafe { array.put(0,&buf) } ); /// println!(); /// } /// array.barrier(); //block other PEs until PE0 has finised "putting" the data /// - /// println!("PE{my_pe} array data: {:?}",array.blocking_read_local_data()); + /// println!("PE{my_pe} array data: {:?}",array.read_local_data().block()); /// /// ///``` diff --git a/src/array/global_lock_atomic.rs b/src/array/global_lock_atomic.rs index 705941ed..d722afad 100644 --- a/src/array/global_lock_atomic.rs +++ b/src/array/global_lock_atomic.rs @@ -205,7 +205,7 @@ impl GlobalLockLocalData { /// let my_pe = world.my_pe(); /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); /// - /// let local_data = array.read_local_data.block(); + /// let local_data = array.read_local_data().block(); /// let sub_data = local_data.clone().into_sub_data(10,20); // clone() essentially increases the references to the read lock by 1. /// assert_eq!(local_data[10],sub_data[0]); ///``` @@ -358,52 +358,11 @@ impl GlobalLockArray { } } - // #[doc(alias("One-sided", "onesided"))] - // /// Return a global read lock guard on the calling PE - // /// - // /// this function will block the thread until the lock is acquired - // /// Calling within an asynchronous block may lead to deadlock, use [read_lock](self::GlobalLockArray::read_lock) instead. 
- // /// - // /// # One-sided Operation - // /// Only explictly requires the calling PE, although the global lock may be managed by other PEs - // /// - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); - // /// - // /// let read_lock = array.blocking_read_lock(); - // /// //do interesting work - // /// - // ///``` - // pub fn blocking_read_lock(&self) -> GlobalLockReadGuard { - // if std::thread::current().id() != *crate::MAIN_THREAD { - // if let Some(val) = config().blocking_call_warning { - // if val { - // println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_read_lock` from within an async context which may lead to deadlock, it is recommended that you use `read_lock().await;` instead! - // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); - // } - // } else { - // println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_read_lock` from within an async context which may lead to deadlock, it is recommended that you use `read_lock().await;` instead! - // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); - // } - // } - // let self_clone: GlobalLockArray = self.clone(); - // self.block_on(async move { - // GlobalLockReadGuard { - // array: self_clone.clone(), - // lock_guard: self_clone.lock.read().await, - // } - // }) - // } - #[doc(alias("One-sided", "onesided"))] - /// Return a global read lock guard on the calling PE + /// Return a handle for aquiring a global read lock guard on the calling PE /// - /// this function will block the calling task until the lock is acquired (but not the calling thread) + /// the returned handle must be await'd `.read_lock().await` within an async context or + /// it must be blocked on `.read_lock().block()` in a non async context to actually acquire the lock /// /// # One-sided Operation /// Only explictly requires the calling PE, although the global lock may be managed by other PEs @@ -414,62 +373,23 @@ impl GlobalLockArray { /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); - /// - /// world.block_on(async move { - /// let read_lock = array.read_lock().await; + /// let handle = array.read_lock(); + /// let task = world.spawn(async move { + /// let read_lock = handle.await; /// //do interesting work /// }); + /// array.read_lock().block(); + /// task.block(); ///``` pub fn read_lock(&self) -> GlobalLockReadHandle { GlobalLockReadHandle::new(self.clone()) } - // #[doc(alias("One-sided", "onesided"))] - // /// Return a global write lock guard on the calling PE - // /// - // /// this function will block the thread until the lock is acquired - // /// Calling within an asynchronous block may lead to deadlock, use [write_lock](self::GlobalLockArray::write_lock) instead. 
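[Editor's note, not part of the patch] A guard obtained from the new `read_lock()` handle can also drive the array reductions whose docs are updated later in this patch. A minimal sketch, with the array length and element values chosen only for illustration:

```
use lamellar::array::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();
    let array = GlobalLockArray::<usize>::new(&world, 10, Distribution::Block);

    // initialize the array, then make sure all PEs have finished
    array.block_on(array.dist_iter_mut().enumerate().for_each(|(i, elem)| *elem = i * 2));
    array.barrier();

    // acquire the global read lock on this PE and reduce over the whole array
    let read_guard = array.read_lock().block();
    let sum = array.block_on(read_guard.sum()).expect("array has > 0 elements");
    println!("sum = {sum}");
}
```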
- // /// - // /// # One-sided Operation - // /// Only explictly requires the calling PE, although the global lock may be managed by other PEs - // /// - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); - // /// - // /// let write_lock = array.blocking_write_lock(); - // /// //do interesting work - // /// - // ///``` - // pub fn blocking_write_lock(&self) -> GlobalLockWriteGuard { - // if std::thread::current().id() != *crate::MAIN_THREAD { - // if let Some(val) = config().blocking_call_warning { - // if val { - // println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_write_lock` from within an async context which may lead to deadlock, it is recommended that you use `write_lock().await;` instead! - // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); - // } - // } else { - // println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_write_lock` from within an async context which may lead to deadlock, it is recommended that you use `write_lock().await;` instead! - // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); - // } - // } - // let self_clone: GlobalLockArray = self.clone(); - // self.block_on(async move { - // GlobalLockWriteGuard { - // array: self_clone.clone(), - // lock_guard: self_clone.lock.write().await, - // } - // }) - // } - #[doc(alias("One-sided", "onesided"))] - /// Return a global write lock guard on the calling PE + /// Return a handle for aquiring a global write lock guard on the calling PE /// - /// this function will block the calling task until the lock is acquired (but not the calling thread) + /// The returned handle must be await'd `.write_lock().await` within an async context or + /// it must be blocked on `.write_lock().block()` in a non async context to actually acquire the lock /// /// # One-sided Operation /// Only explictly requires the calling PE, although the global lock may be managed by other PEs @@ -480,67 +400,23 @@ impl GlobalLockArray { /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); - /// - /// world.block_on(async move { - /// let write_lock = array.write_lock().await; + /// let handle = array.write_lock(); + /// let task = world.spawn(async move { + /// let write_lock = handle.await; /// //do interesting work /// }); + /// array.write_lock().block(); + /// task.block(); ///``` pub fn write_lock(&self) -> GlobalLockWriteHandle { GlobalLockWriteHandle::new(self.clone()) } - // #[doc(alias("One-sided", "onesided"))] - // /// Return the calling PE's local data as a [GlobalLockLocalData], which allows safe immutable access to local elements. - // /// - // /// Calling this function will result in a local read lock being captured on the array - // /// - // /// This function is blocking and intended to be called from non asynchronous contexts. - // /// Calling within an asynchronous block may lead to deadlock. 
- // /// - // /// # One-sided Operation - // /// Only returns local data on the calling PE - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); - // /// - // /// let local_data = array.read_local_data.block(); - // /// println!("PE{my_pe} data: {local_data:?}"); - // ///``` - // pub fn blocking_read_local_data(&self) -> GlobalLockLocalData { - // if std::thread::current().id() != *crate::MAIN_THREAD { - // if let Some(val) = config().blocking_call_warning { - // if val { - // println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_read_local_data` from within an async context which may lead to deadlock, it is recommended that you use `read_local_data().await;` instead! - // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); - // } - // } else { - // println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_read_local_data` from within an async context which may lead to deadlock, it is recommended that you use `read_local_data().await;` instead! - // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); - // } - // } - // let self_clone: GlobalLockArray = self.clone(); - // self.block_on(async move { - // GlobalLockLocalData { - // array: self_clone.clone(), - // start_index: 0, - // end_index: self_clone.array.num_elems_local(), - // // lock: self_clone.lock.clone(), - // lock_guard: self_clone.lock.read().await, - // } - // }) - // } - #[doc(alias("One-sided", "onesided"))] - /// Return the calling PE's local data as a [GlobalLockLocalData], which allows safe immutable access to local elements. + /// Return a handle for accessing the calling PE's local data as a [GlobalLockLocalData], which allows safe immutable access to local elements. 
/// - /// Calling this function will result in a local read lock being captured on the array - /// - /// This version is intended to be used within asynchronous contexts to prevent blocking the worker threads + /// The returned handle must be await'd `.read_local_data().await` within an async context or + /// it must be blocked on `.read_local_data().block()` in a non async context to actually acquire the lock /// /// # One-sided Operation /// Only returns local data on the calling PE @@ -550,12 +426,15 @@ impl GlobalLockArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// world.clone().block_on(async move { - /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); /// - /// let local_data = array.read_local_data().await; - /// println!("PE{my_pe} data: {local_data:?}"); + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// let handle = array.read_local_data(); + /// world.spawn(async move { + /// let local_data = handle.await; + /// println!("PE{my_pe} data: {local_data:?}"); /// }); + /// let local_data = array.read_local_data().block(); + /// println!("PE{my_pe} data: {local_data:?}"); ///``` pub fn read_local_data(&self) -> GlobalLockLocalDataHandle { GlobalLockLocalDataHandle { @@ -567,59 +446,11 @@ impl GlobalLockArray { } } - // #[doc(alias("One-sided", "onesided"))] - // /// Return the calling PE's local data as a [GlobalLockMutLocalData], which allows safe mutable access to local elements. - // /// - // /// Calling this function will result in the global write lock being captured on the array. - // ///. - // /// This function is blocking and intended to be called from non asynchronous contexts. - // /// Calling within an asynchronous block may lead to deadlock. - // /// - // /// # One-sided Operation - // /// Only returns (mutable) local data on the calling PE - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); - // /// - // /// let local_data = array.blocking_write_local_data(); - // /// println!("PE{my_pe} data: {local_data:?}"); - // ///``` - // pub fn blocking_write_local_data(&self) -> GlobalLockMutLocalData { - // if std::thread::current().id() != *crate::MAIN_THREAD { - // if let Some(val) = config().blocking_call_warning { - // if val { - // println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_write_local_data` from within an async context which may lead to deadlock, it is recommended that you use `write_local_data().await;` instead! - // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); - // } - // } else { - // println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_write_local_data` from within an async context which may lead to deadlock, it is recommended that you use `write_local_data().await;` instead! 
- // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); - // } - // } - // let self_clone: GlobalLockArray = self.clone(); - // self.block_on(async move { - // let lock = self_clone.lock.write().await; - // let data = GlobalLockMutLocalData { - // array: self_clone.clone(), - // start_index: 0, - // end_index: self_clone.array.num_elems_local(), - // lock_guard: lock, - // }; - // // println!("got lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); - // data - // }) - // } - #[doc(alias("One-sided", "onesided"))] - /// Return the calling PE's local data as a [GlobalLockMutLocalData], which allows safe mutable access to local elements. - /// - /// Calling this function will result in the global write lock being captured on the array + /// Return a handle for accessing the calling PE's local data as a [GlobalLockMutLocalData], which allows safe mutable access to local elements. /// - /// This version is intended to be used within asynchronous contexts to prevent blocking the worker threads + /// The returned handle must be await'd `.write_local_data().await` within an async context or + /// it must be blocked on `.write_local_data().block()` in a non async context to actually acquire the lock /// /// # One-sided Operation /// Only returns (mutable) local data on the calling PE @@ -629,12 +460,15 @@ impl GlobalLockArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// world.clone().block_on(async move { - /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); /// - /// let local_data = array.write_local_data().await; - /// println!("PE{my_pe} data: {local_data:?}"); + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// let handle = array.write_local_data(); + /// world.spawn(async move { + /// let mut local_data = handle.await; + /// local_data.iter_mut().for_each(|elem| *elem += my_pe); /// }); + /// let mut local_data = array.write_local_data().block(); + /// local_data.iter_mut().for_each(|elem| *elem += my_pe); ///``` pub fn write_local_data(&self) -> GlobalLockMutLocalDataHandle { GlobalLockMutLocalDataHandle { @@ -645,55 +479,12 @@ impl GlobalLockArray { } } - // /// Return the calling PE's local data as a [GlobalLockMutLocalData], which allows safe mutable access to local elements. - // /// - // /// Calling this function will result in the collective write lock being captured on the array - // /// - // /// # Collective Operation - // /// All PEs associated with this array must enter the call, otherwise deadlock will occur. - // /// Upon return every PE will hold a special collective write lock so that they can all access their local data simultaneous - // /// This lock prevents any other access from occuring on the array until it is dropped on all the PEs. 
- // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); - // /// - // /// let local_data = array.blocking_collective_write_local_data(); - // /// println!("PE{my_pe} data: {local_data:?}"); - // ///``` - // pub fn blocking_collective_write_local_data(&self) -> GlobalLockCollectiveMutLocalData { - // if std::thread::current().id() != *crate::MAIN_THREAD { - // if let Some(val) = config().blocking_call_warning { - // if val { - // println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_collective_write_local_data` from within an async context which may lead to deadlock, it is recommended that you use `collective_write_local_data().await;` instead! - // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); - // } - // } else { - // println!("[LAMELLAR WARNING] You are calling `GlobalLockArray::blocking_collective_write_local_data` from within an async context which may lead to deadlock, it is recommended that you use `collective_write_local_data().await;` instead! - // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); - // } - // } - // let self_clone: GlobalLockArray = self.clone(); - // self.block_on(async move { - // let lock = self_clone.lock.collective_write().await; - // let data = GlobalLockCollectiveMutLocalData { - // array: self_clone.clone(), - // start_index: 0, - // end_index: self_clone.array.num_elems_local(), - // _lock_guard: lock, - // }; - // // println!("got lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); - // data - // }) - // } - #[doc(alias("Collective"))] - /// Return the calling PE's local data as a [GlobalLockMutLocalData], which allows safe mutable access to local elements. + /// Return a handle for accessing the calling PE's local data as a [GlobalLockMutLocalData], which allows safe mutable access to local elements. + /// All PEs associated with the array must call this function in order to access their own local data simultaneously /// - /// Calling this function will result in the collective write lock being captured on the array + /// The returned handle must be await'd `.collective_write_local_data().await` within an async context or + /// it must be blocked on `.collective_write_local_data().block()` in a non async context to actually acquire the lock /// /// # Collective Operation /// All PEs associated with this array must enter the call, otherwise deadlock will occur. 
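[Editor's note, not part of the patch] For the collective variant, every PE must enter `collective_write_local_data()`, after which each PE holds the collective write guard and can mutate its own local slice simultaneously. A short sketch of that pattern, with the array size and written values purely illustrative:

```
use lamellar::array::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();
    let my_pe = world.my_pe();
    let array = GlobalLockArray::<usize>::new(&world, 100, Distribution::Block);

    // collective: all PEs must make this call or the program deadlocks
    let mut local_data = array.collective_write_local_data().block();
    local_data.iter_mut().for_each(|elem| *elem = my_pe);
    drop(local_data); // release the collective write lock on this PE

    array.barrier();
    println!("PE{my_pe} data: {:?}", array.read_local_data().block());
}
```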
@@ -705,12 +496,16 @@ impl GlobalLockArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// world.clone().block_on(async move { - /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); /// - /// let local_data = array.collective_write_local_data().await; - /// println!("PE{my_pe} data: {local_data:?}"); + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// let handle = array.collective_write_local_data(); + /// world.block_on(async move { + /// let mut local_data = handle.await; + /// local_data.iter_mut().for_each(|elem| *elem += my_pe); /// }); + /// let mut local_data = array.collective_write_local_data().block(); + /// local_data.iter_mut().for_each(|elem| *elem += my_pe); + ///``` ///``` pub fn collective_write_local_data(&self) -> GlobalLockCollectiveMutLocalDataHandle { GlobalLockCollectiveMutLocalDataHandle { @@ -759,7 +554,7 @@ impl GlobalLockArray { /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); /// /// let array1 = array.clone(); - /// let slice = array1.read_local_data.block(); + /// let slice = array1.read_local_data().block(); /// /// // no borrows to this specific instance (array) so it can enter the "into_unsafe" call /// // but array1 will not be dropped until after 'slice' is dropped. @@ -807,7 +602,7 @@ impl GlobalLockArray { /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); /// /// let array1 = array.clone(); - /// let slice = array1.read_local_data.block(); + /// let slice = array1.read_local_data().block(); /// /// // no borrows to this specific instance (array) so it can enter the "into_read_only" call /// // but array1 will not be dropped until after mut_slice is dropped. @@ -851,7 +646,7 @@ impl GlobalLockArray { /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); /// /// let array1 = array.clone(); - /// let slice = array1.read_local_data.block(); + /// let slice = array1.read_local_data().block(); /// /// // no borrows to this specific instance (array) so it can enter the "into_read_only" call /// // but array1 will not be dropped until after mut_slice is dropped. @@ -897,7 +692,7 @@ impl GlobalLockArray { /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); /// /// let array1 = array.clone(); - /// let slice = array1.read_local_data.block(); + /// let slice = array1.read_local_data().block(); /// /// // no borrows to this specific instance (array) so it can enter the "into_atomic" call /// // but array1 will not be dropped until after mut_slice is dropped. @@ -1284,7 +1079,7 @@ impl GlobalLockReadGuard { /// let num_pes = world.num_pes(); /// let array = GlobalLockArray::::new(&world,10,Distribution::Block); /// array.block_on(array.dist_iter_mut().enumerate().for_each(move |(i,elem)| *elem = i*2)); - /// let read_guard = array.blocking_read_lock(); + /// let read_guard = array.read_lock().block(); /// let prod = array.block_on(read_guard.reduce("prod")).expect("array has > 0 elements"); ///``` #[must_use = "this function is lazy and does nothing unless awaited. 
Either await the returned future, or call 'spawn()' or 'block()' on it "] @@ -1317,7 +1112,7 @@ impl GlobalLockReadGuard { /// let num_pes = world.num_pes(); /// let array = GlobalLockArray::::new(&world,10,Distribution::Block); /// array.block_on(array.dist_iter_mut().enumerate().for_each(move |(i,elem)| *elem = i*2)); - /// let read_guard = array.blocking_read_lock(); + /// let read_guard = array.read_lock().block(); /// let sum = array.block_on(read_guard.sum()).expect("array has > 0 elements"); /// ``` #[must_use = "this function is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] @@ -1345,7 +1140,7 @@ impl GlobalLockReadGuard { /// let num_pes = world.num_pes(); /// let array = GlobalLockArray::::new(&world,10,Distribution::Block); /// array.block_on(array.dist_iter_mut().enumerate().for_each(move |(i,elem)| *elem = i+1)); - /// let read_guard = array.blocking_read_lock(); + /// let read_guard = array.read_lock().block(); /// let prod = array.block_on(read_guard.prod()).expect("array has > 0 elements"); /// assert_eq!((1..=array.len()).product::(),prod); ///``` @@ -1375,7 +1170,7 @@ impl GlobalLockReadGuar /// let num_pes = world.num_pes(); /// let array = GlobalLockArray::::new(&world,10,Distribution::Block); /// array.block_on(array.dist_iter_mut().enumerate().for_each(move |(i,elem)| *elem = i*2)); - /// let read_guard = array.blocking_read_lock(); + /// let read_guard = array.read_lock().block(); /// let max = array.block_on(read_guard.max()).expect("array has > 0 elements"); /// assert_eq!((array.len()-1)*2,max); ///``` @@ -1404,7 +1199,7 @@ impl GlobalLockReadGuar /// let num_pes = world.num_pes(); /// let array = GlobalLockArray::::new(&world,10,Distribution::Block); /// array.block_on(array.dist_iter_mut().enumerate().for_each(move |(i,elem)| *elem = i*2)); - /// let read_guard = array.blocking_read_lock(); + /// let read_guard = array.read_lock().block(); /// let min = array.block_on(read_guard.min()).expect("array has > 0 elements"); /// assert_eq!(0,min); ///``` diff --git a/src/array/global_lock_atomic/handle.rs b/src/array/global_lock_atomic/handle.rs new file mode 100644 index 00000000..d15e7da1 --- /dev/null +++ b/src/array/global_lock_atomic/handle.rs @@ -0,0 +1,404 @@ +use std::pin::Pin; +use std::task::{Context, Poll}; + +use crate::config; +use crate::darc::handle::{ + GlobalRwDarcCollectiveWriteHandle, GlobalRwDarcReadHandle, GlobalRwDarcWriteHandle, +}; +use crate::Dist; +use crate::GlobalLockArray; + +use futures_util::Future; +use pin_project::pin_project; + +use super::{ + GlobalLockCollectiveMutLocalData, GlobalLockLocalData, GlobalLockMutLocalData, + GlobalLockReadGuard, GlobalLockWriteGuard, +}; + +#[must_use] +#[pin_project] +/// Handle used to retrieve the aquired read lock of a GlobalLockArray +/// +/// This handle must be awaited or blocked on to acquire the lock +/// +/// Once awaited/blocked the handle will not return while any writer currently has access to the lock, but there may be other readers +/// +/// Returns an RAII guard which will drop the read access of the wrlock when dropped +/// # Examples +///``` +/// use lamellar::array::prelude::*; +/// let world = LamellarWorldBuilder::new().build(); +/// let my_pe = world.my_pe(); +/// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); +/// let handle = array.read_lock(); +/// let task = world.spawn(async move { +/// let read_lock = handle.await; +/// //do interesting work +/// }); +/// 
array.read_lock().block(); +/// task.block(); +///``` +pub struct GlobalLockReadHandle { + pub(crate) array: GlobalLockArray, + #[pin] + pub(crate) lock_handle: GlobalRwDarcReadHandle<()>, +} + +impl GlobalLockReadHandle { + pub(crate) fn new(array: GlobalLockArray) -> Self { + Self { + array: array.clone(), + lock_handle: array.lock.read(), + } + } + /// Handle used to retrieve the aquired read lock of a GlobalLockArray within a non async context + /// + /// Returns an RAII guard which will drop the read access of the wrlock when dropped + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// let handle = array.read_lock(); + /// let guard = handle.block(); + ///``` + pub fn block(self) -> GlobalLockReadGuard { + let msg = format!(" + [LAMELLAR WARNING] You are calling `GlobalLockReadHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() + ); + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } + + self.array.lock.darc.team().scheduler.block_on(self) + } +} + +impl Future for GlobalLockReadHandle { + type Output = GlobalLockReadGuard; + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let this = self.project(); + match this.lock_handle.poll(cx) { + Poll::Ready(val) => Poll::Ready(GlobalLockReadGuard { + array: this.array.clone(), + lock_guard: val, + }), + Poll::Pending => Poll::Pending, + } + } +} + +#[must_use] +#[pin_project] +/// Handle used to retrieve the aquired local data [GlobalLockLocalData] of a GlobalLockArray +/// +/// This handle must be awaited or blocked on to acquire the lock +/// +/// Once awaited/blocked the handle will not return while any writer currently has access to the lock, but there may be other readers +/// +/// Returns an RAII guard which will drop the read access of the wrlock when dropped +/// # Examples +///``` +/// use lamellar::array::prelude::*; +/// let world = LamellarWorldBuilder::new().build(); +/// let my_pe = world.my_pe(); +/// +/// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); +/// let handle = array.read_local_data(); +/// world.spawn(async move { +/// let local_data = handle.await; +/// println!("PE{my_pe}, local_data: {:?}", local_data); +/// }); +/// let mut local_data = array.read_local_data().block(); +/// println!("PE{my_pe}, local_data: {:?}", local_data); +///``` +pub struct GlobalLockLocalDataHandle { + pub(crate) array: GlobalLockArray, + pub(crate) start_index: usize, + pub(crate) end_index: usize, + #[pin] + pub(crate) lock_handle: GlobalRwDarcReadHandle<()>, +} + +impl GlobalLockLocalDataHandle { + /// Handle used to retrieve the aquired local data [GlobalLockLocalData] of a GlobalLockArray within a non async context + /// + /// Returns an RAII guard which will drop the write access of the wrlock when dropped + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// let handle = array.read_local_data(); + /// 
let local_data = handle.block(); + /// println!("local data: {:?}",local_data); + ///``` + pub fn block(self) -> GlobalLockLocalData { + let msg = format!(" + [LAMELLAR WARNING] You are calling `GlobalLockLocalDataHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() + ); + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } + + self.array.lock.darc.team().scheduler.block_on(self) + } +} + +impl Future for GlobalLockLocalDataHandle { + type Output = GlobalLockLocalData; + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let this = self.project(); + match this.lock_handle.poll(cx) { + Poll::Ready(val) => Poll::Ready(GlobalLockLocalData { + array: this.array.clone(), + start_index: *this.start_index, + end_index: *this.end_index, + // lock: self.lock.clone(), + lock_guard: val, + }), + Poll::Pending => Poll::Pending, + } + } +} + +#[must_use] +#[pin_project] +/// Handle used to retrieve the aquired write lock of a GlobalLockArray +/// +/// This handle must be awaited or blocked on to acquire the lock +/// +/// Once awaited/blocked the handle will not return while any readers or writer currently has access to the lock +/// +/// Returns an RAII guard which will drop the read access of the wrlock when dropped +/// # Examples +///``` +/// use lamellar::array::prelude::*; +/// let world = LamellarWorldBuilder::new().build(); +/// let my_pe = world.my_pe(); +/// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); +/// let handle = array.write_lock(); +/// let task = world.spawn(async move { +/// let write_lock = handle.await; +/// //do interesting work +/// }); +/// array.write_lock().block(); +/// task.block(); +///``` +pub struct GlobalLockWriteHandle { + pub(crate) array: GlobalLockArray, + #[pin] + pub(crate) lock_handle: GlobalRwDarcWriteHandle<()>, +} + +impl GlobalLockWriteHandle { + pub(crate) fn new(array: GlobalLockArray) -> Self { + Self { + array: array.clone(), + lock_handle: array.lock.write(), + } + } + /// Handle used to retrieve the aquired write lock of a GlobalLockArray within a non async context + /// + /// Returns an RAII guard which will drop the write access of the wrlock when dropped + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// let handle = array.write_lock(); + /// handle.block(); + ///``` + pub fn block(self) -> GlobalLockWriteGuard { + let msg = format!(" + [LAMELLAR WARNING] You are calling `GlobalLockWriteHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! 
+ Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() + ); + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } + + self.array.lock.darc.team().scheduler.block_on(self) + } +} + +impl Future for GlobalLockWriteHandle { + type Output = GlobalLockWriteGuard; + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let this = self.project(); + match this.lock_handle.poll(cx) { + Poll::Ready(val) => Poll::Ready(GlobalLockWriteGuard { + array: this.array.clone(), + lock_guard: val, + }), + Poll::Pending => Poll::Pending, + } + } +} + +#[must_use] +#[pin_project] +/// Handle used to retrieve the aquired mutable local data [GlobalLockMutLocalData] of a GlobalLockArray +/// +/// This handle must be awaited or blocked on to acquire the lock +/// +/// Once awaited/blocked the handle will not return while any readers or writer currently has access to the lock +/// +/// Returns an RAII guard which will drop the read access of the wrlock when dropped +/// # Examples +///``` +/// use lamellar::array::prelude::*; +/// let world = LamellarWorldBuilder::new().build(); +/// let my_pe = world.my_pe(); +/// +/// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); +/// let handle = array.write_local_data(); +/// world.spawn(async move { +/// let mut local_data = handle.await; +/// local_data.iter_mut().for_each(|elem| *elem += my_pe); +/// }); +/// let mut local_data = array.write_local_data().block(); +/// local_data.iter_mut().for_each(|elem| *elem += my_pe); +///``` +pub struct GlobalLockMutLocalDataHandle { + pub(crate) array: GlobalLockArray, + pub(crate) start_index: usize, + pub(crate) end_index: usize, + #[pin] + pub(crate) lock_handle: GlobalRwDarcWriteHandle<()>, +} + +impl GlobalLockMutLocalDataHandle { + /// Handle used to retrieve the aquired mutable local data [GlobalLockMutLocalData] of a GlobalLockArray within a non async context + /// + /// Returns an RAII guard which will drop the write access of the wrlock when dropped + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// let handle = array.write_local_data(); + /// let mut local_data = handle.block(); + /// local_data.iter_mut().for_each(|elem| *elem += my_pe); + ///``` + pub fn block(self) -> GlobalLockMutLocalData { + let msg = format!(" + [LAMELLAR WARNING] You are calling `GlobalLockLocalDataHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! 
+ Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() + ); + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } + + self.array.lock.darc.team().scheduler.block_on(self) + } +} + +impl Future for GlobalLockMutLocalDataHandle { + type Output = GlobalLockMutLocalData; + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let this = self.project(); + match this.lock_handle.poll(cx) { + Poll::Ready(val) => Poll::Ready(GlobalLockMutLocalData { + array: this.array.clone(), + start_index: *this.start_index, + end_index: *this.end_index, + // lock: self.lock.clone(), + lock_guard: val, + }), + Poll::Pending => Poll::Pending, + } + } +} + +#[must_use] +#[pin_project] +/// Handle used to retrieve the aquired mutable local data [GlobalLockMutLocalData] of a GlobalLockArray with all PEs collectively accessing their local data +/// +/// This handle must be awaited or blocked on to acquire the lock +/// +/// Once awaited/blocked the handle will not return while any readers or writer currently has access to the lock +/// +/// Returns an RAII guard which will drop the read access of the wrlock when dropped +/// # Examples +///``` +/// use lamellar::array::prelude::*; +/// let world = LamellarWorldBuilder::new().build(); +/// let my_pe = world.my_pe(); +/// +/// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); +/// let handle = array.collective_write_local_data(); +/// world.block_on(async move { +/// let mut local_data = handle.await; +/// local_data.iter_mut().for_each(|elem| *elem += my_pe); +/// }); +/// let mut local_data = array.collective_write_local_data().block(); +/// local_data.iter_mut().for_each(|elem| *elem += my_pe); +///``` +pub struct GlobalLockCollectiveMutLocalDataHandle { + pub(crate) array: GlobalLockArray, + pub(crate) start_index: usize, + pub(crate) end_index: usize, + #[pin] + pub(crate) lock_handle: GlobalRwDarcCollectiveWriteHandle<()>, +} + +impl GlobalLockCollectiveMutLocalDataHandle { + /// Handle used to retrieve the aquired mutable local data [GlobalLockMutLocalData] of a GlobalLockArray within a non async context + /// with all PEs collectively accessing their local data + /// + /// Returns an RAII guard which will drop the write access of the wrlock when dropped + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// let handle = array.collective_write_local_data(); + /// let mut local_data = handle.block(); + /// local_data.iter_mut().for_each(|elem| *elem += my_pe); + ///``` + pub fn block(self) -> GlobalLockCollectiveMutLocalData { + let msg = format!(" + [LAMELLAR WARNING] You are calling `GlobalLockCollectiveMutLocalDataHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! 
+ Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() + ); + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } + + self.array.lock.darc.team().scheduler.block_on(self) + } +} + +impl Future for GlobalLockCollectiveMutLocalDataHandle { + type Output = GlobalLockCollectiveMutLocalData; + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let this = self.project(); + match this.lock_handle.poll(cx) { + Poll::Ready(val) => Poll::Ready(GlobalLockCollectiveMutLocalData { + array: this.array.clone(), + start_index: *this.start_index, + end_index: *this.end_index, + // lock: self.lock.clone(), + _lock_guard: val, + }), + Poll::Pending => Poll::Pending, + } + } +} diff --git a/src/array/local_lock_atomic.rs b/src/array/local_lock_atomic.rs index ddeb6bbe..f13d8515 100644 --- a/src/array/local_lock_atomic.rs +++ b/src/array/local_lock_atomic.rs @@ -357,51 +357,11 @@ impl LocalLockArray { } } - // #[doc(alias("One-sided", "onesided"))] - // /// Return the calling PE's local read lock guard - // /// - // /// this function will block the thread until the lock is acquired - // /// Calling within an asynchronous block may lead to deadlock, use [read_lock](self::LocalLockArray::read_lock) instead. - // /// - // /// # One-sided Operation - // /// Only explictly requires the calling PE - // /// - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let array:LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); - // /// - // /// let read_lock = array.read_lock().block(); - // /// //do interesting work - // /// - // pub fn blocking_read_lock(&self) -> LocalLockReadGuard { - // if std::thread::current().id() != *crate::MAIN_THREAD { - // if let Some(val) = config().blocking_call_warning { - // if val { - // println!("[LAMELLAR WARNING] You are calling `LocalLockArray::blocking_read_lock` from within an async context which may lead to deadlock, it is recommended that you use `read_lock().await;` instead! - // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); - // } - // } else { - // println!("[LAMELLAR WARNING] You are calling `LocalLockArray::blocking_read_lock` from within an async context which may lead to deadlock, it is recommended that you use `read_lock().await;` instead! 
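Putting the handle types above together, a minimal usage sketch for `GlobalLockArray` (the array size, element type, and distribution here are illustrative, mirroring the doc tests above):

```
use lamellar::array::prelude::*;

let world = LamellarWorldBuilder::new().build();
let my_pe = world.my_pe();
let array: GlobalLockArray<usize> = GlobalLockArray::new(&world, 100, Distribution::Block);

// non-async context: block on the handle to acquire the guard
let mut local_data = array.write_local_data().block();
local_data.iter_mut().for_each(|elem| *elem = my_pe);
drop(local_data); // release the write lock before taking other locks

// async context: await the handle instead of blocking
let handle = array.read_local_data();
world.block_on(async move {
    let local_data = handle.await;
    println!("PE{my_pe} local data: {:?}", local_data);
});

// the collective write handle requires every PE to participate
let mut local_data = array.collective_write_local_data().block();
local_data.iter_mut().for_each(|elem| *elem += my_pe);
drop(local_data);
array.barrier();
```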
- // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); - // } - // } - // let self_clone: LocalLockArray = self.clone(); - // self.block_on(async move { - // LocalLockReadGuard { - // array: self_clone.clone(), - // lock_guard: Arc::new(self_clone.lock.read().await), - // } - // }) - // } - #[doc(alias("One-sided", "onesided"))] - /// Return the calling PE's local read lock + /// Return a handle for aquiring a local read lock guard on the calling PE /// - /// this function will block the calling task until the lock is acquired (but not the calling thread) + /// the returned handle must be await'd `.read_lock().await` within an async context or + /// it must be blocked on `.read_lock().block()` in a non async context to actually acquire the lock /// /// # One-sided Operation /// Only explictly requires the calling PE @@ -412,61 +372,22 @@ impl LocalLockArray { /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); - /// - /// world.block_on(async move { - /// let read_lock = array.read_lock().await; + /// let handle = array.read_lock(); + /// world.spawn(async move { + /// let read_lock = handle.await; /// //do interesting work /// }); + /// array.read_lock().block(); ///``` pub fn read_lock(&self) -> LocalLockReadHandle { LocalLockReadHandle::new(self.clone()) } - // #[doc(alias("One-sided", "onesided"))] - // /// Return the calling PE's local write lock guard - // /// - // /// this function will block the thread until the lock is acquired - // /// Calling within an asynchronous block may lead to deadlock, use [write_lock](self::LocalLockArray::write_lock) instead. - // /// - // /// # One-sided Operation - // /// Only explictly requires the calling PE - // /// - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let array:LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); - // /// - // /// let write_lock = array.blocking_write_lock(); - // /// //do interesting work - // /// - // pub fn blocking_write_lock(&self) -> LocalLockWriteGuard { - // if std::thread::current().id() != *crate::MAIN_THREAD { - // if let Some(val) = config().blocking_call_warning { - // if val { - // println!("[LAMELLAR WARNING] You are calling `LocalLockArray::blocking_write_lock` from within an async context which may lead to deadlock, it is recommended that you use `write_lock().await;` instead! - // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); - // } - // } else { - // println!("[LAMELLAR WARNING] You are calling `LocalLockArray::blocking_write_lock` from within an async context which may lead to deadlock, it is recommended that you use `write_lock().await;` instead! 
- // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); - // } - // } - // let self_clone: LocalLockArray = self.clone(); - // self.block_on(async move { - // LocalLockWriteGuard { - // array: self_clone.clone(), - // lock_guard: self_clone.lock.write().await, - // } - // }) - // } - #[doc(alias("One-sided", "onesided"))] - /// Return the calling PE's local write lock + /// Return a handle for aquiring a local write lock guard on the calling PE /// - /// this function will block the calling task until the lock is acquired (but not the calling thread) + /// The returned handle must be await'd `.write_lock().await` within an async context or + /// it must be blocked on `.write_lock().block()` in a non async context to actually acquire the lock /// /// # One-sided Operation /// Only explictly requires the calling PE @@ -477,64 +398,22 @@ impl LocalLockArray { /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); - /// let array_clone = array.clone(); + /// let handle = array.write_lock(); /// world.spawn(async move { - /// let write_lock = array_clone.write_lock().await; + /// let write_lock = handle.await; /// //do interesting work /// }); /// array.write_lock().block(); - /// //do interesting work ///``` pub fn write_lock(&self) -> LocalLockWriteHandle { LocalLockWriteHandle::new(self.clone()) } - // #[doc(alias("One-sided", "onesided"))] - // /// Return the calling PE's local data as a [LocalLockLocalData], which allows safe immutable access to local elements. - // /// - // /// Calling this function will result in a local read lock being captured on the array - // /// - // /// # One-sided Operation - // /// Only returns local data on the calling PE - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); - // /// - // /// let local_data = array.read_local_data().block(); - // /// println!("PE{my_pe} data: {local_data:?}"); - // ///``` - // pub fn blocking_read_local_data(&self) -> LocalLockLocalData { - // if std::thread::current().id() != *crate::MAIN_THREAD { - // if let Some(val) = config().blocking_call_warning { - // if val { - // println!("[LAMELLAR WARNING] You are calling `LocalLockArray::blocking_read_local_data` from within an async context which may lead to deadlock, it is recommended that you use `read_local_data().await;` instead! - // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); - // } - // } else { - // println!("[LAMELLAR WARNING] You are calling `LocalLockArray::blocking_read_local_data` from within an async context which may lead to deadlock, it is recommended that you use `read_local_data().await;` instead! 
- // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); - // } - // } - // let self_clone: LocalLockArray = self.clone(); - // self.block_on(async move { - // LocalLockLocalData { - // array: self_clone.clone(), - // // lock: self_clone.lock.clone(), - // start_index: 0, - // end_index: self_clone.num_elems_local(), - // lock_guard: Arc::new(self_clone.lock.read().await), - // } - // }) - // } - #[doc(alias("One-sided", "onesided"))] - /// Return the calling PE's local data as a [LocalLockLocalData], which allows safe immutable access to local elements. + /// Return a handle for accessing the calling PE's local data as a [LocalLockLocalData], which allows safe immutable access to local elements. /// - /// Calling this function will result in a local read lock being captured on the array + /// The returned handle must be await'd `.read_local_data().await` within an async context or + /// it must be blocked on `.read_local_data().block()` in a non async context to actually acquire the lock /// /// # One-sided Operation /// Only returns local data on the calling PE @@ -544,12 +423,15 @@ impl LocalLockArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// world.clone().block_on(async move { - /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); /// - /// let local_data = array.read_local_data().await; + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// let handle = array.read_local_data(); + /// world.spawn(async move { + /// let local_data = handle.await; /// println!("PE{my_pe} data: {local_data:?}"); /// }); + /// let local_data = array.read_local_data().block(); + /// println!("PE{my_pe} data: {local_data:?}"); ///``` pub fn read_local_data(&self) -> LocalLockLocalDataHandle { LocalLockLocalDataHandle { @@ -560,55 +442,11 @@ impl LocalLockArray { } } - // #[doc(alias("One-sided", "onesided"))] - // /// Return the calling PE's local data as a [LocalLockMutLocalData], which allows safe mutable access to local elements. - // /// - // /// Calling this function will result in the local write lock being captured on the array - // /// - // /// # One-sided Operation - // /// Only returns (mutable) local data on the calling PE - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// let world = LamellarWorldBuilder::new().build(); - // /// let my_pe = world.my_pe(); - // /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); - // /// - // /// let local_data = array.blocking_write_local_data(); - // /// println!("PE{my_pe} data: {local_data:?}"); - // ///``` - // pub fn blocking_write_local_data(&self) -> LocalLockMutLocalData { - // if std::thread::current().id() != *crate::MAIN_THREAD { - // if let Some(val) = config().blocking_call_warning { - // if val { - // println!("[LAMELLAR WARNING] You are calling `LocalLockArray::blocking_write_local_data` from within an async context which may lead to deadlock, it is recommended that you use `write_local_data().await;` instead! 
- // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); - // } - // } else { - // println!("[LAMELLAR WARNING] You are calling `LocalLockArray::blocking_write_local_data` from within an async context which may lead to deadlock, it is recommended that you use `write_local_data().await;` instead! - // Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); - // } - // } - // let self_clone: LocalLockArray = self.clone(); - // self.block_on(async move { - // let lock = self_clone.lock.write().await; - // let data = LocalLockMutLocalData { - // array: self_clone.clone(), - // start_index: 0, - // end_index: self_clone.num_elems_local(), - // lock_guard: lock, - // }; - // // println!("got lock! {:?} {:?}",std::thread::current().id(),std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH)); - // data - // }) - // } - #[doc(alias("One-sided", "onesided"))] - /// TODO: UPDATE - /// Return the calling PE's local data as a [LocalLockMutLocalData], which allows safe mutable access to local elements. + /// Return a handle for accessing the calling PE's local data as a [LocalLockMutLocalData], which allows safe mutable access to local elements. /// - /// Calling this function will result in the local write lock being captured on the array + /// The returned handle must be await'd `.write_local_data().await` within an async context or + /// it must be blocked on `.write_local_data().block()` in a non async context to actually acquire the lock /// /// # One-sided Operation /// Only returns (mutable) local data on the calling PE @@ -618,12 +456,15 @@ impl LocalLockArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// world.clone().block_on(async move { - /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); /// - /// let local_data = array.write_local_data().await; - /// println!("PE{my_pe} data: {local_data:?}"); + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// let handle = array.write_local_data(); + /// world.spawn(async move { + /// let mut local_data = handle.await; + /// local_data.iter_mut().for_each(|elem| *elem += my_pe); /// }); + /// let mut local_data = array.write_local_data().block(); + /// local_data.iter_mut().for_each(|elem| *elem += my_pe); ///``` pub fn write_local_data(&self) -> LocalLockMutLocalDataHandle { LocalLockMutLocalDataHandle { diff --git a/src/array/local_lock_atomic/handle.rs b/src/array/local_lock_atomic/handle.rs new file mode 100644 index 00000000..23111fae --- /dev/null +++ b/src/array/local_lock_atomic/handle.rs @@ -0,0 +1,322 @@ +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; + +use crate::config; +use crate::darc::handle::{LocalRwDarcReadHandle, LocalRwDarcWriteHandle}; +use crate::Dist; +use crate::LocalLockArray; + +use futures_util::Future; +use pin_project::pin_project; + +use super::{LocalLockLocalData, LocalLockMutLocalData, LocalLockReadGuard, LocalLockWriteGuard}; + +#[must_use] +#[pin_project] +/// Handle used to retrieve the aquired read lock of a LocalLockArray +/// +/// This handle must be awaited or blocked on to acquire the lock +/// +/// Once awaited/blocked the handle will not return while any writer currently has access to the lock, 
but there may be other readers +/// +/// Returns an RAII guard which will drop the read access of the wrlock when dropped +/// # Examples +///``` +/// use lamellar::array::prelude::*; +/// let world = LamellarWorldBuilder::new().build(); +/// let my_pe = world.my_pe(); +/// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); +/// let handle = array.read_lock(); +/// let task = world.spawn(async move { +/// let read_lock = handle.await; +/// //do interesting work +/// }); +/// array.read_lock().block(); +/// task.block(); +///``` +pub struct LocalLockReadHandle { + pub(crate) array: LocalLockArray, + #[pin] + pub(crate) lock_handle: LocalRwDarcReadHandle<()>, +} + +impl LocalLockReadHandle { + pub(crate) fn new(array: LocalLockArray) -> Self { + Self { + array: array.clone(), + lock_handle: array.lock.read(), + } + } + + /// Handle used to retrieve the aquired read lock of a LocalLockArray within a non async context + /// + /// Returns an RAII guard which will drop the read access of the wrlock when dropped + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// let handle = array.read_lock(); + /// let guard = handle.block(); + ///``` + pub fn block(self) -> LocalLockReadGuard { + let msg = format!(" + [LAMELLAR WARNING] You are calling `LocalLockReadHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() + ); + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } + + self.array.lock.darc.team().scheduler.block_on(self) + } +} + +impl Future for LocalLockReadHandle { + type Output = LocalLockReadGuard; + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let this = self.project(); + match this.lock_handle.poll(cx) { + Poll::Ready(val) => Poll::Ready(LocalLockReadGuard { + array: this.array.clone(), + lock_guard: Arc::new(val), + }), + Poll::Pending => Poll::Pending, + } + } +} + +#[must_use] +#[pin_project] +/// Handle used to retrieve the aquired local data [LocalLockLocalData] of a LocalLockArray +/// +/// This handle must be awaited or blocked on to acquire the lock +/// +/// Once awaited/blocked the handle will not return while any writer currently has access to the lock, but there may be other readers +/// +/// Returns an RAII guard which will drop the read access of the wrlock when dropped +/// # Examples +///``` +/// use lamellar::array::prelude::*; +/// let world = LamellarWorldBuilder::new().build(); +/// let my_pe = world.my_pe(); +/// +/// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); +/// let handle = array.read_local_data(); +/// world.spawn(async move { +/// let local_data = handle.await; +/// println!("PE{my_pe}, local_data: {:?}", local_data); +/// }); +/// let mut local_data = array.read_local_data().block(); +/// println!("PE{my_pe}, local_data: {:?}", local_data); +///``` +pub struct LocalLockLocalDataHandle { + pub(crate) array: LocalLockArray, + pub(crate) start_index: usize, + pub(crate) end_index: usize, + #[pin] + pub(crate) lock_handle: LocalRwDarcReadHandle<()>, +} + +impl LocalLockLocalDataHandle { + /// Handle 
used to retrieve the aquired local data [LocalLockLocalData] of a LocalLockArray within a non async context + /// + /// Returns an RAII guard which will drop the write access of the wrlock when dropped + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// let handle = array.read_local_data(); + /// let local_data = handle.block(); + /// println!("local data: {:?}",local_data); + ///``` + pub fn block(self) -> LocalLockLocalData { + let msg = format!(" + [LAMELLAR WARNING] You are calling `LocalLockLocalDataHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() + ); + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } + + self.array.lock.darc.team().scheduler.block_on(self) + } +} + +impl Future for LocalLockLocalDataHandle { + type Output = LocalLockLocalData; + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let this = self.project(); + match this.lock_handle.poll(cx) { + Poll::Ready(val) => Poll::Ready(LocalLockLocalData { + array: this.array.clone(), + start_index: *this.start_index, + end_index: *this.end_index, + // lock: self.lock.clone(), + lock_guard: Arc::new(val), + }), + Poll::Pending => Poll::Pending, + } + } +} + +#[must_use] +#[pin_project] +/// Handle used to retrieve the aquired write lock of a LocalLockArray +/// +/// This handle must be awaited or blocked on to acquire the lock +/// +/// Once awaited/blocked the handle will not return while any readers or writer currently has access to the lock +/// +/// Returns an RAII guard which will drop the read access of the wrlock when dropped +/// # Examples +///``` +/// use lamellar::array::prelude::*; +/// let world = LamellarWorldBuilder::new().build(); +/// let my_pe = world.my_pe(); +/// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); +/// let handle = array.write_lock(); +/// let task = world.spawn(async move { +/// let write_lock = handle.await; +/// //do interesting work +/// }); +/// array.write_lock().block(); +/// task.block(); +///``` +pub struct LocalLockWriteHandle { + pub(crate) array: LocalLockArray, + #[pin] + pub(crate) lock_handle: LocalRwDarcWriteHandle<()>, +} + +impl LocalLockWriteHandle { + pub(crate) fn new(array: LocalLockArray) -> Self { + Self { + array: array.clone(), + lock_handle: array.lock.write(), + } + } + /// Handle used to retrieve the aquired write lock of a LocalLockArray within a non async context + /// + /// Returns an RAII guard which will drop the write access of the wrlock when dropped + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// let handle = array.write_lock(); + /// handle.block(); + ///``` + pub fn block(self) -> LocalLockWriteGuard { + let msg = format!(" + [LAMELLAR WARNING] You are calling `LocalLockWriteHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! 
+ Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() + ); + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } + + self.array.lock.darc.team().scheduler.block_on(self) + } +} + +impl Future for LocalLockWriteHandle { + type Output = LocalLockWriteGuard; + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let this = self.project(); + match this.lock_handle.poll(cx) { + Poll::Ready(val) => Poll::Ready(LocalLockWriteGuard { + array: this.array.clone(), + lock_guard: val, + }), + Poll::Pending => Poll::Pending, + } + } +} + +#[must_use] +#[pin_project] +/// Handle used to retrieve the aquired mutable local data [LocalLockMutLocalData] of a LocalLockArray +/// +/// This handle must be awaited or blocked on to acquire the lock +/// +/// Once awaited/blocked the handle will not return while any readers or writer currently has access to the lock +/// +/// Returns an RAII guard which will drop the read access of the wrlock when dropped +/// # Examples +///``` +/// use lamellar::array::prelude::*; +/// let world = LamellarWorldBuilder::new().build(); +/// let my_pe = world.my_pe(); +/// +/// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); +/// let handle = array.write_local_data(); +/// world.spawn(async move { +/// let mut local_data = handle.await; +/// local_data.iter_mut().for_each(|elem| *elem += my_pe); +/// }); +/// let mut local_data = array.write_local_data().block(); +/// local_data.iter_mut().for_each(|elem| *elem += my_pe); +///``` +pub struct LocalLockMutLocalDataHandle { + pub(crate) array: LocalLockArray, + pub(crate) start_index: usize, + pub(crate) end_index: usize, + #[pin] + pub(crate) lock_handle: LocalRwDarcWriteHandle<()>, +} + +impl LocalLockMutLocalDataHandle { + /// Handle used to retrieve the aquired mutable local data [LocalLockMutLocalData] of a LocalLockArray within a non async context + /// + /// Returns an RAII guard which will drop the write access of the wrlock when dropped + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// let handle = array.write_local_data(); + /// let mut local_data = handle.block(); + /// local_data.iter_mut().for_each(|elem| *elem += my_pe); + ///``` + pub fn block(self) -> LocalLockMutLocalData { + let msg = format!(" + [LAMELLAR WARNING] You are calling `LocalLockLocalDataHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! 
+ Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() + ); + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } + + self.array.lock.darc.team().scheduler.block_on(self) + } +} + +impl Future for LocalLockMutLocalDataHandle { + type Output = LocalLockMutLocalData; + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let this = self.project(); + match this.lock_handle.poll(cx) { + Poll::Ready(val) => Poll::Ready(LocalLockMutLocalData { + array: this.array.clone(), + start_index: *this.start_index, + end_index: *this.end_index, + // lock: self.lock.clone(), + lock_guard: val, + }), + Poll::Pending => Poll::Pending, + } + } +} diff --git a/src/darc/handle.rs b/src/darc/handle.rs new file mode 100644 index 00000000..db1b4274 --- /dev/null +++ b/src/darc/handle.rs @@ -0,0 +1,531 @@ +use std::marker::PhantomData; +use std::pin::Pin; +use std::sync::atomic::AtomicUsize; +use std::sync::Arc; +use std::task::{Context, Poll}; + +use crate::darc::local_rw_darc::{LocalRwDarc, LocalRwDarcReadGuard}; +use crate::lamellar_request::LamellarRequest; +use crate::AmHandle; +use crate::{config, GlobalRwDarc}; + +use async_lock::{RwLockReadGuardArc, RwLockWriteGuardArc}; +use futures_util::{ready, Future}; +use pin_project::pin_project; + +use super::global_rw_darc::{ + GlobalRwDarcCollectiveWriteGuard, GlobalRwDarcReadGuard, GlobalRwDarcWriteGuard, +}; +use super::local_rw_darc::LocalRwDarcWriteGuard; + +#[pin_project(project = StateProj)] +enum State { + Init, + TryingRead(#[pin] Pin> + Send + 'static>>), + TryingWrite(#[pin] Pin> + Send + 'static>>), +} + +#[must_use] +#[pin_project] +/// Handle used to retrieve the aquired read lock from a LocalRwDarc +/// +/// This handle must be awaited or blocked on to acquire the lock +/// +/// Once awaited/blocked the handle will not return while any writer currently has access to the lock, but there may be other readers +/// +/// Returns an RAII guard which will drop the read access of the wrlock when dropped +/// # Examples +/// +///``` +/// use lamellar::darc::prelude::*; +/// use lamellar::active_messaging::*; +/// +/// #[lamellar::AmData(Clone)] +/// struct DarcAm { +/// counter: LocalRwDarc, //each pe has a local atomicusize +/// } +/// +/// #[lamellar::am] +/// impl LamellarAm for DarcAm { +/// async fn exec(self) { +/// let counter_handle = self.counter.read(); +/// let counter = counter_handle.await; // await until we get the read lock +/// println!("the current counter value on pe {} = {}",lamellar::current_pe,*counter); +/// } +/// } +/// //------------- +/// +/// let world = LamellarWorldBuilder::new().build(); +/// let my_pe = world.my_pe(); +/// let counter = LocalRwDarc::new(&world, 0).unwrap(); +/// let _ = world.exec_am_all(DarcAm {counter: counter.clone()}).spawn(); +/// let handle = counter.read(); +/// let guard = handle.block(); //block until we get the read lock +/// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); +/// drop(guard); //release the lock +/// world.wait_all(); // wait for my active message to return +/// world.barrier(); //at this point all updates will have been performed +/// +///``` +pub struct LocalRwDarcReadHandle { + darc: LocalRwDarc, + #[pin] + state: State, +} + +impl LocalRwDarcReadHandle { + pub(crate) fn new(darc: LocalRwDarc) -> Self { + Self { + darc, + state: State::Init, + } + } + /// Used to retrieve the 
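The `LocalLockArray` handles follow the same pattern; a short sketch combining the lock handles and the local-data handles (setup mirrors the doc tests above):

```
use lamellar::array::prelude::*;

let world = LamellarWorldBuilder::new().build();
let my_pe = world.my_pe();
let array: LocalLockArray<usize> = LocalLockArray::new(&world, 100, Distribution::Cyclic);

// a read lock handle can be awaited on a spawned task or blocked on directly
let handle = array.read_lock();
let task = world.spawn(async move {
    let _read_lock = handle.await;
    // do interesting work while holding the local read lock
});
let guard = array.read_lock().block();
drop(guard); // release our read lock
task.block(); // wait for the spawned task to finish

// mutable access to local elements goes through the write-lock-backed handle
let mut local_data = array.write_local_data().block();
local_data.iter_mut().for_each(|elem| *elem += my_pe);
```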
aquired read lock from a LocalRwDarc within a non async context + /// + /// Returns an RAII guard which will drop the read access of the wrlock when dropped + /// # Examples + /// + ///``` + /// use lamellar::darc::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// let counter = LocalRwDarc::new(&world, 0).unwrap(); + /// let handle = counter.read(); + /// let guard = handle.block(); //block until we get the read lock + /// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); + /// + ///``` + pub fn block(self) -> LocalRwDarcReadGuard { + let msg = format!(" + [LAMELLAR WARNING] You are calling `LocalRwDarcReadHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() + ); + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } + + let inner_darc = self.darc.darc.clone(); + + let guard = self + .darc + .darc + .team() + .clone() + .block_on(async move { inner_darc.read_arc().await }); + LocalRwDarcReadGuard { + darc: self.darc, + lock: guard, + } + } +} + +impl Future for LocalRwDarcReadHandle { + type Output = LocalRwDarcReadGuard; + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let inner_darc = self.darc.darc.clone(); + let mut this = self.project(); + match this.state.as_mut().project() { + StateProj::Init => { + let lock = Box::pin(async move { inner_darc.read_arc().await }); + *this.state = State::TryingRead(lock); + cx.waker().wake_by_ref(); + Poll::Pending + } + StateProj::TryingRead(lock) => { + let guard = ready!(lock.poll(cx)); + Poll::Ready(LocalRwDarcReadGuard { + darc: this.darc.clone(), + lock: guard, + }) + } + _ => unreachable!(), + } + } +} + +#[must_use] +#[pin_project] +/// Handle used to retrieve the aquired write lock from a LocalRwDarc +/// +/// This handle must be awaited or blocked on to acquire the lock +/// +/// Once awaited/blocked the handle will not return while any readers or writer currently has access to the lock +/// +/// Returns an RAII guard which will drop the write access of the wrlock when dropped +/// # Examples +/// +///``` +/// use lamellar::darc::prelude::*; +/// use lamellar::active_messaging::*; +/// +/// #[lamellar::AmData(Clone)] +/// struct DarcAm { +/// counter: LocalRwDarc, //each pe has a local atomicusize +/// } +/// +/// #[lamellar::am] +/// impl LamellarAm for DarcAm { +/// async fn exec(self) { +/// let counter_handle = self.counter.write(); +/// let mut counter = counter_handle.await; // await until we get the write lock +/// *counter += 1; +/// } +/// } +/// //------------- +/// +/// let world = LamellarWorldBuilder::new().build(); +/// let my_pe = world.my_pe(); +/// let counter = LocalRwDarc::new(&world, 0).unwrap(); +/// let _ = world.exec_am_all(DarcAm {counter: counter.clone()}).spawn(); +/// let handle = counter.write(); +/// let mut guard = handle.block(); //block until we get the write lock +/// *guard += my_pe; +/// drop(guard); //release the lock +/// world.wait_all(); // wait for my active message to return +/// world.barrier(); //at this point all updates will have been performed +///``` +pub struct LocalRwDarcWriteHandle { + darc: LocalRwDarc, + #[pin] + state: State, +} + +impl LocalRwDarcWriteHandle { + pub(crate) fn new(darc: 
LocalRwDarc) -> Self { + Self { + darc, + state: State::Init, + } + } + /// used to retrieve the aquired write lock from a LocalRwDarc within a non async context + /// + /// Returns an RAII guard which will drop the write access of the wrlock when dropped + /// # Examples + /// + ///``` + /// use lamellar::darc::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// let counter = LocalRwDarc::new(&world, 0).unwrap(); + /// let handle = counter.write(); + /// let mut guard = handle.block(); //block until we get the write lock + /// *guard += my_pe; + ///``` + pub fn block(self) -> LocalRwDarcWriteGuard { + let msg = format!(" + [LAMELLAR WARNING] You are calling `LocalRwDarcWriteHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() + ); + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } + + let inner_darc = self.darc.darc.clone(); + + let guard = self + .darc + .darc + .team() + .clone() + .block_on(async move { inner_darc.write_arc().await }); + LocalRwDarcWriteGuard { + darc: self.darc, + lock: guard, + } + } +} + +impl Future for LocalRwDarcWriteHandle { + type Output = LocalRwDarcWriteGuard; + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let inner_darc = self.darc.darc.clone(); + let mut this = self.project(); + match this.state.as_mut().project() { + StateProj::Init => { + let lock = Box::pin(async move { inner_darc.write_arc().await }); + *this.state = State::TryingWrite(lock); + cx.waker().wake_by_ref(); + Poll::Pending + } + StateProj::TryingWrite(lock) => { + let guard = ready!(lock.poll(cx)); + Poll::Ready(LocalRwDarcWriteGuard { + darc: this.darc.clone(), + lock: guard, + }) + } + _ => unreachable!(), + } + } +} + +#[must_use] +#[pin_project] +/// Handle used to retrieve the aquired read lock from a GlobalRwDarc +/// +/// This handle must be awaited or blocked on to acquire the lock +/// +/// Once awaited/blocked the handle will not return while any writer currently has access to the lock, but there may be other readers +/// +/// Returns an RAII guard which will drop the read access of the wrlock when dropped +/// # Examples +/// +///``` +/// use lamellar::darc::prelude::*; +/// use lamellar::active_messaging::*; +/// +/// #[lamellar::AmData(Clone)] +/// struct DarcAm { +/// counter: GlobalRwDarc, //each pe has a local atomicusize +/// } +/// +/// #[lamellar::am] +/// impl LamellarAm for DarcAm { +/// async fn exec(self) { +/// let counter_handle = self.counter.read(); +/// let counter = counter_handle.await; // await until we get the write lock +/// println!("the current counter value on pe {} = {}",lamellar::current_pe,*counter); +/// } +/// } +/// //------------- +/// +/// let world = LamellarWorldBuilder::new().build(); +/// let my_pe = world.my_pe(); +/// let counter = GlobalRwDarc::new(&world, 0).unwrap(); +/// let _ = world.exec_am_all(DarcAm {counter: counter.clone()}).spawn(); +/// let handle = counter.read(); +/// let guard = handle.block(); //block until we get the write lock +/// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); +/// drop(guard); //release the lock +/// world.wait_all(); // wait for my active message to return +/// world.barrier(); //at this point all updates 
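For the `LocalRwDarc` handles above, a condensed sketch of acquiring the read and write guards from both a blocking and an async context (the counter value is illustrative):

```
use lamellar::darc::prelude::*;

let world = LamellarWorldBuilder::new().build();
let my_pe = world.my_pe();
let counter = LocalRwDarc::new(&world, 0).unwrap();

// blocking acquisition on the main thread
let mut guard = counter.write().block();
*guard += my_pe;
drop(guard); // release the write lock

// awaiting the handle inside an async context
let handle = counter.read();
world.block_on(async move {
    let guard = handle.await;
    println!("counter on pe {my_pe} = {}", *guard);
});
world.barrier();
```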
will have been performed +/// +///``` +pub struct GlobalRwDarcReadHandle { + pub(crate) darc: GlobalRwDarc, + #[pin] + pub(crate) lock_am: AmHandle<()>, +} + +impl GlobalRwDarcReadHandle { + /// Used to retrieve the aquired read lock from a GlobalRwDarc within a non async context + /// + /// Returns an RAII guard which will drop the read access of the wrlock when dropped + /// # Examples + /// + ///``` + /// use lamellar::darc::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); + /// let handle = counter.read(); + /// let guard = handle.block(); //block until we get the write lock + /// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); + ///``` + pub fn block(self) -> GlobalRwDarcReadGuard { + let msg = format!(" + [LAMELLAR WARNING] You are calling `GlobalRwDarcReadHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() + ); + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } + + let _ = self.lock_am.blocking_wait(); + GlobalRwDarcReadGuard { + darc: self.darc.clone(), + marker: PhantomData, + local_cnt: Arc::new(AtomicUsize::new(1)), + } + } +} + +impl Future for GlobalRwDarcReadHandle { + type Output = GlobalRwDarcReadGuard; + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let this = self.project(); + ready!(this.lock_am.poll(cx)); + Poll::Ready(GlobalRwDarcReadGuard { + darc: this.darc.clone(), + marker: PhantomData, + local_cnt: Arc::new(AtomicUsize::new(1)), + }) + } +} + +#[must_use] +#[pin_project] +/// Handle used to retrieve the aquired write lock from a GlobalRwDarc +/// +/// This handle must be awaited or blocked on to acquire the lock +/// +/// Once awaited/blocked the handle will not return while any readers orwriter currently has access to the lock +/// +/// Returns an RAII guard which will drop the write access of the wrlock when dropped +/// # Examples +/// +///``` +/// use lamellar::darc::prelude::*; +/// use lamellar::active_messaging::*; +/// +/// #[lamellar::AmData(Clone)] +/// struct DarcAm { +/// counter: GlobalRwDarc, //each pe has a local atomicusize +/// } +/// +/// #[lamellar::am] +/// impl LamellarAm for DarcAm { +/// async fn exec(self) { +/// let counter_handle = self.counter.write(); +/// let mut counter = counter_handle.await; // await until we get the write lock +/// *counter += 1; +/// } +/// } +/// //------------- +/// +/// let world = LamellarWorldBuilder::new().build(); +/// let my_pe = world.my_pe(); +/// let counter = GlobalRwDarc::new(&world, 0).unwrap(); +/// let _ = world.exec_am_all(DarcAm {counter: counter.clone()}).spawn(); +/// let handle = counter.write(); +/// let mut guard = handle.block(); //block until we get the write lock +/// *guard += my_pe; +/// drop(guard); //release the lock +/// world.wait_all(); // wait for my active message to return +/// world.barrier(); //at this point all updates will have been performed +/// +///``` +pub struct GlobalRwDarcWriteHandle { + pub(crate) darc: GlobalRwDarc, + #[pin] + pub(crate) lock_am: AmHandle<()>, +} + +impl GlobalRwDarcWriteHandle { + /// Used to retrieve the aquired write lock from a GlobalRwDarc within a non async context + /// + /// Returns an 
RAII guard which will drop the read access of the wrlock when dropped + /// # Examples + /// + ///``` + /// use lamellar::darc::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); + /// let handle = counter.write(); + /// let mut guard = handle.block(); //block until we get the write lock + /// *guard += my_pe; + ///``` + pub fn block(self) -> GlobalRwDarcWriteGuard { + let msg = format!(" + [LAMELLAR WARNING] You are calling `GlobalRwDarcWriteHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() + ); + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } + + let _ = self.lock_am.blocking_wait(); + GlobalRwDarcWriteGuard { + darc: self.darc.clone(), + marker: PhantomData, + } + } +} + +impl Future for GlobalRwDarcWriteHandle { + type Output = GlobalRwDarcWriteGuard; + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let this = self.project(); + ready!(this.lock_am.poll(cx)); + Poll::Ready(GlobalRwDarcWriteGuard { + darc: this.darc.clone(), + marker: PhantomData, + }) + } +} + +#[must_use] +#[pin_project] +/// Handle used to retrieve the aquired collective write lock from a GlobalRwDarc +/// +/// This handle must be awaited or blocked on to actually acquire the lock +/// +/// Once awaited/blocked the handle will not return while any readers or non collective writer currently has access to the lock. +/// Further the handle will not return until all PEs have acquired the lock +/// +/// Returns an RAII guard which will drop the collective write access of the wrlock when dropped +/// # Examples +/// +///``` +/// use lamellar::darc::prelude::*; +/// +/// let world = LamellarWorldBuilder::new().build(); +/// let my_pe = world.my_pe(); +/// +/// let counter = GlobalRwDarc::new(&world, 0).unwrap(); +/// let handle = counter.collective_write(); +/// let mut guard = handle.block(); // this will block until all PEs have acquired the lock +/// *guard += my_pe; +///``` +pub struct GlobalRwDarcCollectiveWriteHandle { + pub(crate) darc: GlobalRwDarc, + pub(crate) collective_cnt: usize, + #[pin] + pub(crate) lock_am: AmHandle<()>, +} + +impl GlobalRwDarcCollectiveWriteHandle { + /// Used to retrieve the aquired collective write lock from a GlobalRwDarc within a non async context + /// + /// Returns an RAII guard which will drop the read access of the wrlock when dropped + /// # Examples + /// + ///``` + /// use lamellar::darc::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); + /// let handle = counter.collective_write(); + /// let mut guard = handle.block(); //block until we get the write lock + /// *guard += my_pe; + pub fn block(self) -> GlobalRwDarcCollectiveWriteGuard { + let msg = format!(" + [LAMELLAR WARNING] You are calling `GlobalRwDarcCollectiveWriteHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! 
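A sketch contrasting the single-PE write lock with the collective write lock described above, both acquired by blocking on their handles outside of an async context:

```
use lamellar::darc::prelude::*;

let world = LamellarWorldBuilder::new().build();
let my_pe = world.my_pe();
let counter = GlobalRwDarc::new(&world, 0).unwrap();

// exclusive write access, acquired by one PE at a time
let mut guard = counter.write().block();
*guard += my_pe;
drop(guard); // release the global write lock

// the collective write handle does not return until every PE has acquired it
let mut guard = counter.collective_write().block();
*guard += my_pe;
drop(guard);
world.barrier();
```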
+ Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() + ); + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } + + let _ = self.lock_am.blocking_wait(); + GlobalRwDarcCollectiveWriteGuard { + darc: self.darc.clone(), + collective_cnt: self.collective_cnt, + marker: PhantomData, + } + } +} + +impl Future for GlobalRwDarcCollectiveWriteHandle { + type Output = GlobalRwDarcCollectiveWriteGuard; + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let this = self.project(); + ready!(this.lock_am.poll(cx)); + Poll::Ready(GlobalRwDarcCollectiveWriteGuard { + darc: this.darc.clone(), + collective_cnt: *this.collective_cnt, + marker: PhantomData, + }) + } +} diff --git a/src/scheduler.rs b/src/scheduler.rs index 00794cfd..cffaecd9 100755 --- a/src/scheduler.rs +++ b/src/scheduler.rs @@ -98,6 +98,22 @@ pub enum ExecutorType { pub struct LamellarTask { #[pin] task: LamellarTaskInner, + executor: Arc, +} + +impl LamellarTask { + pub fn block(self) -> T { + if std::thread::current().id() != *crate::MAIN_THREAD { + println!( + "[LAMELLAR WARNING] trying to call block on within a worker thread {:?} this may result in deadlock. + Typically this means you are running within an async context. If you have something like: + world.block_on(my_future) you can simply change to my_future.await. If this is not the case, + please file an issue on github.", + std::backtrace::Backtrace::capture() + ) + } + self.executor.clone().block_on(self) + } } impl Future for LamellarTask { @@ -156,7 +172,7 @@ impl Future for LamellarTaskInner { #[enum_dispatch] pub(crate) trait LamellarExecutor { - fn spawn_task(&self, future: F) -> LamellarTask + fn spawn_task(&self, future: F, executor: Arc) -> LamellarTask where F: Future + Send + 'static, F::Output: Send; @@ -384,7 +400,7 @@ impl Scheduler { num_tasks.fetch_sub(1, Ordering::Relaxed); result }; - self.executor.spawn_task(future) + self.executor.spawn_task(future, self.executor.clone()) } pub(crate) fn submit_task(&self, task: F) diff --git a/src/scheduler/async_std_executor.rs b/src/scheduler/async_std_executor.rs index 3067cc78..28968a73 100644 --- a/src/scheduler/async_std_executor.rs +++ b/src/scheduler/async_std_executor.rs @@ -1,4 +1,6 @@ -use crate::scheduler::{LamellarExecutor, LamellarTask, LamellarTaskInner}; +use std::sync::Arc; + +use crate::scheduler::{Executor, LamellarExecutor, LamellarTask, LamellarTaskInner}; use async_std::task; @@ -10,7 +12,7 @@ pub(crate) struct AsyncStdRt { } impl LamellarExecutor for AsyncStdRt { - fn spawn_task(&self, task: F) -> LamellarTask + fn spawn_task(&self, task: F, executor: Arc) -> LamellarTask where F: Future + Send + 'static, F::Output: Send, @@ -19,6 +21,7 @@ impl LamellarExecutor for AsyncStdRt { let task = task::spawn(task); LamellarTask { task: LamellarTaskInner::AsyncStdTask(task), + executor, } // }) } diff --git a/src/scheduler/tokio_executor.rs b/src/scheduler/tokio_executor.rs index 4a43bd0b..cbe80de3 100644 --- a/src/scheduler/tokio_executor.rs +++ b/src/scheduler/tokio_executor.rs @@ -1,4 +1,4 @@ -use crate::scheduler::{LamellarExecutor, LamellarTask, LamellarTaskInner}; +use crate::scheduler::{Executor, LamellarExecutor, LamellarTask, LamellarTaskInner}; use tokio::runtime::Runtime; @@ -11,7 +11,7 @@ pub(crate) struct TokioRt { } impl LamellarExecutor for TokioRt { - fn spawn_task(&self, task: F) -> LamellarTask + fn 
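On the scheduler side, the refactor above threads the owning executor through `spawn_task` so a `LamellarTask` can later be blocked on; a minimal sketch of the resulting user-facing flow (the computed value is illustrative):

```
use lamellar::array::prelude::*;

let world = LamellarWorldBuilder::new().build();

// spawn on whichever executor backend the world was built with
let task = world.spawn(async move {
    // do interesting work
    1 + 1
});

// either `.await` the task in an async context, or block on it here
let result = task.block();
assert_eq!(2, result);
```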
spawn_task(&self, task: F, executor: Arc) -> LamellarTask where F: Future + Send + 'static, F::Output: Send, @@ -20,6 +20,7 @@ impl LamellarExecutor for TokioRt { let task = self.rt.spawn(task); LamellarTask { task: LamellarTaskInner::TokioTask(task), + executor, } // }) } diff --git a/src/scheduler/work_stealing.rs b/src/scheduler/work_stealing.rs index 4b0ca01b..264d359b 100644 --- a/src/scheduler/work_stealing.rs +++ b/src/scheduler/work_stealing.rs @@ -1,5 +1,7 @@ use crate::env_var::config; -use crate::scheduler::{LamellarExecutor, LamellarTask, LamellarTaskInner, SchedulerStatus}; +use crate::scheduler::{ + Executor, LamellarExecutor, LamellarTask, LamellarTaskInner, SchedulerStatus, +}; //use tracing::*; @@ -130,7 +132,7 @@ pub(crate) struct WorkStealing { } impl LamellarExecutor for WorkStealing { - fn spawn_task(&self, task: F) -> LamellarTask + fn spawn_task(&self, task: F, executor: Arc) -> LamellarTask where F: Future + Send + 'static, F::Output: Send, @@ -145,6 +147,7 @@ impl LamellarExecutor for WorkStealing { runnable.schedule(); LamellarTask { task: LamellarTaskInner::LamellarTask(Some(task)), + executor, } // }); } diff --git a/src/scheduler/work_stealing2.rs b/src/scheduler/work_stealing2.rs index 965ca933..26cd0a59 100644 --- a/src/scheduler/work_stealing2.rs +++ b/src/scheduler/work_stealing2.rs @@ -1,5 +1,7 @@ use crate::env_var::config; -use crate::scheduler::{LamellarExecutor, LamellarTask, LamellarTaskInner, SchedulerStatus}; +use crate::scheduler::{ + Executor, LamellarExecutor, LamellarTask, LamellarTaskInner, SchedulerStatus, +}; use crate::MAIN_THREAD; //use tracing::*; @@ -222,7 +224,7 @@ pub(crate) struct WorkStealing2 { } impl LamellarExecutor for WorkStealing2 { - fn spawn_task(&self, task: F) -> LamellarTask + fn spawn_task(&self, task: F, executor: Arc) -> LamellarTask where F: Future + Send + 'static, F::Output: Send, @@ -236,6 +238,7 @@ impl LamellarExecutor for WorkStealing2 { runnable.schedule(); LamellarTask { task: LamellarTaskInner::LamellarTask(Some(task)), + executor, } } fn submit_task(&self, task: F) diff --git a/src/scheduler/work_stealing3.rs b/src/scheduler/work_stealing3.rs index 5264c710..53538cf0 100644 --- a/src/scheduler/work_stealing3.rs +++ b/src/scheduler/work_stealing3.rs @@ -1,5 +1,7 @@ use crate::env_var::config; -use crate::scheduler::{LamellarExecutor, LamellarTask, LamellarTaskInner, SchedulerStatus}; +use crate::scheduler::{ + Executor, LamellarExecutor, LamellarTask, LamellarTaskInner, SchedulerStatus, +}; use crate::MAIN_THREAD; //use tracing::*; @@ -142,7 +144,7 @@ pub(crate) struct WorkStealing3 { } impl LamellarExecutor for WorkStealing3 { - fn spawn_task(&self, task: F) -> LamellarTask + fn spawn_task(&self, task: F, executor: Arc) -> LamellarTask where F: Future + Send + 'static, F::Output: Send, @@ -163,6 +165,7 @@ impl LamellarExecutor for WorkStealing3 { runnable.schedule(); LamellarTask { task: LamellarTaskInner::LamellarTask(Some(task)), + executor, } // }); } From c55d5fed676e9d96fba381962b6daa43223f077a Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Mon, 14 Oct 2024 11:25:30 -0700 Subject: [PATCH 097/116] commit build.rs --- build.rs | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 build.rs diff --git a/build.rs b/build.rs new file mode 100644 index 00000000..d04f36d0 --- /dev/null +++ b/build.rs @@ -0,0 +1,20 @@ +#[cfg(feature = "enable-rofi-shared")] +use std::env; +#[cfg(feature = "enable-rofi-shared")] +use std::path::PathBuf; + +fn main() { + #[cfg(feature = "enable-rofi-shared")] + { + if let Ok(rofi_lib_dir) = env::var("DEP_ROFI_ROOT") { + let lib_path = PathBuf::from(rofi_lib_dir).join("lib"); + println!("cargo:rustc-link-search=native={}", lib_path.display()); + println!("cargo:rustc-link-arg=-Wl,-rpath,{}", lib_path.display()); + } else { + panic!( + "unable to set rofi backend, recompile with 'enable-rofi' feature {:?}", + env::vars() + ) + } + } +} From fa0c64f063c2df13515e61e12991c8e1dda9f07e Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Wed, 16 Oct 2024 10:36:02 -0700 Subject: [PATCH 098/116] convert darc into methods to handle based rather than explict blocking apis --- examples/active_message_examples/am_local.rs | 8 +- .../active_message_examples/recursive_am.rs | 2 +- examples/array_examples/global_lock_array.rs | 9 +- .../global_lock_atomic_array_put_bw.rs | 2 +- .../local_lock_atomic_array_put_bw.rs | 2 +- examples/darc_examples/darc.rs | 6 +- examples/kernels/am_flops.rs | 2 +- examples/kernels/cached_am_gemm.rs | 2 +- examples/kernels/dft_proxy.rs | 6 +- examples/misc/dist_hashmap.rs | 2 +- src/array.rs | 7 +- src/darc.rs | 204 +++---------- src/darc/global_rw_darc.rs | 194 +++--------- src/darc/handle.rs | 277 +++++++++++++++++- src/darc/local_rw_darc.rs | 238 +++------------ src/scheduler/tokio_executor.rs | 1 + tests/array/arithmetic_ops/add_test.rs | 6 +- tests/array/arithmetic_ops/div_test.rs | 2 +- tests/array/arithmetic_ops/fetch_add_test.rs | 9 +- tests/array/arithmetic_ops/fetch_div_test.rs | 2 +- tests/array/arithmetic_ops/fetch_mul_test.rs | 2 +- tests/array/arithmetic_ops/fetch_rem_test.rs | 2 +- tests/array/arithmetic_ops/fetch_sub_test.rs | 2 +- tests/array/arithmetic_ops/mul_test.rs | 2 +- tests/array/arithmetic_ops/rem_test.rs | 2 +- tests/array/arithmetic_ops/sub_test.rs | 2 +- tests/array/atomic_ops/load_store_test.rs | 2 +- tests/array/bitwise_ops/and_test.rs | 2 +- tests/array/bitwise_ops/fetch_and_test.rs | 2 +- tests/array/bitwise_ops/fetch_or_test.rs | 2 +- tests/array/bitwise_ops/fetch_xor_test.rs | 2 +- tests/array/bitwise_ops/or_test.rs | 2 +- tests/array/bitwise_ops/xor_test.rs | 2 +- tests/array/rdma/put_test.rs | 2 +- 34 files changed, 433 insertions(+), 576 deletions(-) diff --git a/examples/active_message_examples/am_local.rs b/examples/active_message_examples/am_local.rs index f7ca8240..10f2150d 100644 --- a/examples/active_message_examples/am_local.rs +++ b/examples/active_message_examples/am_local.rs @@ -113,18 +113,18 @@ fn main() { println!("-----------------------------------"); // println!("---------------------------------------------------------------"); // println!("Testing local am no return"); - // let res = world.exec_am_pe(my_pe, am.clone()).blocking_wait(); + // let res = world.exec_am_pe(my_pe, am.clone()).block(); // assert_eq!(res, None); // println!("no return result: {:?}", res); // println!("-----------------------------------"); // println!("Testing remote am no return"); - // let res = world.exec_am_pe(num_pes - 1, am.clone()).blocking_wait(); + // let res = world.exec_am_pe(num_pes - 1, am.clone()).block(); 
// assert_eq!(res, None); // println!("no return result: {:?}", res); // println!("-----------------------------------"); // println!("Testing all am no return"); // println!("[{:?}] exec on all", my_pe); - // let res = world.exec_am_all(am.clone()).blocking_wait(); + // let res = world.exec_am_all(am.clone()).block(); // assert!(res.iter().all(|x| x.is_none())); // println!("no return result: {:?}", res); // println!("---------------------------------------------------------------"); @@ -132,7 +132,7 @@ fn main() { // println!("---------------------------------------------------------------"); // println!("Testing ring pattern am no return"); - // let res = world.exec_am_pe((my_pe + 1) % num_pes, am.clone()).blocking_wait(); + // let res = world.exec_am_pe((my_pe + 1) % num_pes, am.clone()).block(); // assert_eq!(res, None); // println!("no return result: {:?}", res); // println!("-----------------------------------"); diff --git a/examples/active_message_examples/recursive_am.rs b/examples/active_message_examples/recursive_am.rs index d6bdbc85..49e8f1c4 100644 --- a/examples/active_message_examples/recursive_am.rs +++ b/examples/active_message_examples/recursive_am.rs @@ -43,7 +43,7 @@ impl LamellarAM for RecursiveAM { orig: self.orig, }, ); - // let mut res = next.blocking_wait().expect("error returning from am"); // this will cause deadlock + // let mut res = next.block().expect("error returning from am"); // this will cause deadlock let mut res = next.await; res.push(hostname::get().unwrap().into_string().unwrap()); //append my host name to list returned from previous call res diff --git a/examples/array_examples/global_lock_array.rs b/examples/array_examples/global_lock_array.rs index 58483778..dc18ee1f 100644 --- a/examples/array_examples/global_lock_array.rs +++ b/examples/array_examples/global_lock_array.rs @@ -9,7 +9,7 @@ fn main() { let array = GlobalLockArray::::new(&world, 100, Distribution::Block); let s = Instant::now(); - let local_data = array.blocking_read_local_data(); + let local_data = array.read_local_data().block(); println!( "PE{my_pe} time: {:?} {:?}", s.elapsed().as_secs_f64(), @@ -19,7 +19,7 @@ fn main() { drop(local_data); //release the lock world.barrier(); - let mut local_data = array.blocking_write_local_data(); + let mut local_data = array.write_local_data().block(); println!( "PE{my_pe} time: {:?} got write lock", s.elapsed().as_secs_f64() @@ -31,7 +31,7 @@ fn main() { array.print(); println!("PE{my_pe} time: {:?} done", s.elapsed().as_secs_f64()); - let mut local_data = array.blocking_collective_write_local_data(); + let mut local_data = array.collective_write_local_data().block(); println!( "PE{my_pe} time: {:?} got collective write lock", s.elapsed().as_secs_f64() @@ -48,7 +48,8 @@ fn main() { println!("PE{my_pe} time: {:?} done", s.elapsed().as_secs_f64()); array - .blocking_read_lock() + .read_lock() + .block() .dist_iter() .enumerate() .for_each(move |(i, elem)| { diff --git a/examples/bandwidths/global_lock_atomic_array_put_bw.rs b/examples/bandwidths/global_lock_atomic_array_put_bw.rs index 919521f2..248b57f7 100644 --- a/examples/bandwidths/global_lock_atomic_array_put_bw.rs +++ b/examples/bandwidths/global_lock_atomic_array_put_bw.rs @@ -67,7 +67,7 @@ fn main() { let cur_t = timer.elapsed().as_secs_f64(); if my_pe == 0 { for j in (0..2_u64.pow(exp) as usize).step_by(num_bytes as usize) { - let local_data = array.blocking_read_local_data(); + let local_data = array.read_local_data().block(); while *(&local_data[(j + num_bytes as usize) - 1]) == 255 
as u8 { println!( "this should not happen {:?}", diff --git a/examples/bandwidths/local_lock_atomic_array_put_bw.rs b/examples/bandwidths/local_lock_atomic_array_put_bw.rs index 673aa22a..1b857e1f 100644 --- a/examples/bandwidths/local_lock_atomic_array_put_bw.rs +++ b/examples/bandwidths/local_lock_atomic_array_put_bw.rs @@ -67,7 +67,7 @@ fn main() { let cur_t = timer.elapsed().as_secs_f64(); if my_pe == num_pes - 1 { for j in (0..2_u64.pow(exp) as usize).step_by(num_bytes as usize) { - let local_data = array.blocking_read_local_data(); + let local_data = array.read_local_data().block(); while *(&local_data[(j + num_bytes as usize) - 1]) == 255 as u8 { println!( "this should not happen {:?}", diff --git a/examples/darc_examples/darc.rs b/examples/darc_examples/darc.rs index 08beb185..3a9215b8 100644 --- a/examples/darc_examples/darc.rs +++ b/examples/darc_examples/darc.rs @@ -56,10 +56,10 @@ fn main() { )); let global_darc = GlobalRwDarc::new(world.team(), 0).unwrap(); - let read_lock = global_darc.blocking_read(); + let read_lock = global_darc.read().block(); println!("I have the read lock!!!! {:?}", my_pe); drop(read_lock); - let write_lock = global_darc.blocking_write(); + let write_lock = global_darc.write().block(); println!("I have the write lock!!!! {:?}", my_pe); std::thread::sleep(std::time::Duration::from_secs(1)); drop(write_lock); @@ -100,7 +100,7 @@ fn main() { tg.add_am_all(darc_am); team.block_on(tg.exec()); } else { - *local_darc.blocking_write() += 1; + *local_darc.write().block() += 1; } } // -------- diff --git a/examples/kernels/am_flops.rs b/examples/kernels/am_flops.rs index 7783f39b..2e975cc8 100644 --- a/examples/kernels/am_flops.rs +++ b/examples/kernels/am_flops.rs @@ -150,7 +150,7 @@ fn main() { // let cur_t = timer.elapsed().as_secs_f64(); // let tot_flop: usize = reqs // .iter() - // .map(|r| r.blocking_wait().iter().map(|r| r.unwrap()).sum::()) + // .map(|r| r.block().iter().map(|r| r.unwrap()).sum::()) // .sum(); // let task_granularity = ((cur_t * 24f64) / num_tasks as f64) * 1000.0f64; // if my_pe == 0 { diff --git a/examples/kernels/cached_am_gemm.rs b/examples/kernels/cached_am_gemm.rs index b0c9c7d5..4e5c7567 100644 --- a/examples/kernels/cached_am_gemm.rs +++ b/examples/kernels/cached_am_gemm.rs @@ -255,7 +255,7 @@ fn main() { tasks += 1; } // for req in reqs { - // req.blocking_wait(); + // req.block(); // } } diff --git a/examples/kernels/dft_proxy.rs b/examples/kernels/dft_proxy.rs index e6b56040..3980f7ad 100644 --- a/examples/kernels/dft_proxy.rs +++ b/examples/kernels/dft_proxy.rs @@ -786,7 +786,7 @@ fn main() { // println!( // "{:?} array sum: {:?} time: {:?}", // my_pe, - // full_spectrum_array.sum().blocking_wait(), + // full_spectrum_array.sum().block(), // time // ); // } @@ -807,7 +807,7 @@ fn main() { // println!( // "{:?} array sum: {:?} time: {:?}", // my_pe, - // full_spectrum_array.sum().blocking_wait(), + // full_spectrum_array.sum().block(), // time // ); // } @@ -857,7 +857,7 @@ fn main() { // println!( // "{:?} array sum: {:?} time: {:?}", // my_pe, - // full_spectrum_array.sum().blocking_wait(), + // full_spectrum_array.sum().block(), // time // ); // } diff --git a/examples/misc/dist_hashmap.rs b/examples/misc/dist_hashmap.rs index e04d93c8..f442a1f4 100644 --- a/examples/misc/dist_hashmap.rs +++ b/examples/misc/dist_hashmap.rs @@ -112,6 +112,6 @@ fn main() { world.barrier(); println!( "[{my_pe}] local data: {:?}", - distributed_map.data.blocking_read() + distributed_map.data.read().block() ); } diff --git a/src/array.rs 
b/src/array.rs index b4f68a68..63e3ab12 100644 --- a/src/array.rs +++ b/src/array.rs @@ -194,8 +194,9 @@ crate::inventory::collect!(ReduceKey); lamellar_impl::generate_reductions_for_type_rt!(true, u8, usize, isize); lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, usize, isize); -lamellar_impl::generate_reductions_for_type_rt!(false, f32); -lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32); +// lamellar_impl::generate_reductions_for_type_rt!(false, f32); +// lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32); + // lamellar_impl::generate_reductions_for_type_rt!(false, u128); // lamellar_impl::generate_ops_for_type_rt!(true, false, true, u128); @@ -212,7 +213,7 @@ lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32); // lamellar_impl::generate_reductions_for_type_rt!(false, f32, f64); // lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32, f64); -lamellar_impl::generate_ops_for_bool_rt!(); +// lamellar_impl::generate_ops_for_bool_rt!(); impl Dist for Option {} impl ArrayOps for Option {} diff --git a/src/darc.rs b/src/darc.rs index 975decb7..0bcd1d60 100644 --- a/src/darc.rs +++ b/src/darc.rs @@ -78,6 +78,8 @@ pub(crate) mod global_rw_darc; use global_rw_darc::DistRwLock; pub use global_rw_darc::GlobalRwDarc; +use self::handle::{IntoGlobalRwDarcHandle, IntoLocalRwDarcHandle}; + pub(crate) mod handle; static DARC_ID: AtomicUsize = AtomicUsize::new(0); @@ -949,7 +951,7 @@ impl Darc { // the_darc.print(); weak } - fn inner(&self) -> &DarcInner { + pub(crate) fn inner(&self) -> &DarcInner { unsafe { self.inner.as_ref().expect("invalid darc inner ptr") } } fn inner_mut(&self) -> &mut DarcInner { @@ -1378,13 +1380,14 @@ impl Darc { #[doc(alias = "Collective")] /// Converts this Darc into a [LocalRwDarc] /// - /// This is a blocking collective call amongst all PEs in the Darc's team, only returning once every PE in the team has completed the call. + /// This returns a handle (which is Future) thats needs to be `awaited` or `blocked` on to perform the operation. + /// Awaiting/blocking on the handle is a blocking collective call amongst all PEs in the Darc's team, only returning once every PE in the team has completed the call. /// - /// Furthermore, this call will block while any additional references outside of the one making this call exist on each PE. It is not possible for the + /// Furthermore, the handle will not return while any additional references outside of the one making this call exist on each PE. It is not possible for the /// pointed to object to wrapped by both a Darc and a LocalRwDarc simultaneously (on any PE). /// /// # Collective Operation - /// Requires all PEs associated with the `darc` to enter the call otherwise deadlock will occur (i.e. team barriers are being called internally) + /// Requires all PEs associated with the `darc` to await/block the handle otherwise deadlock will occur (i.e. 
team barriers are being called internally) /// /// # Examples /// ``` @@ -1395,100 +1398,32 @@ impl Darc { /// let five = Darc::new(&world,5).expect("PE in world team"); /// let five_as_localdarc = world.block_on(async move {five.into_localrw().await}); /// ``` - pub async fn into_localrw(self) -> LocalRwDarc { - let inner = self.inner(); - let _cur_pe = inner.team().world_pe; - DarcInner::block_on_outstanding( - WrappedInner { - inner: NonNull::new(self.inner as *mut DarcInner).expect("invalid darc pointer"), - }, - DarcMode::LocalRw, - 0, - ) - .await; - inner.local_cnt.fetch_add(1, Ordering::SeqCst); //we add this here because to account for moving inner into d - inner.total_local_cnt.fetch_add(1, Ordering::SeqCst); - // println! {"[{:?}] darc[{:?}] into_localrw {:?} {:?} {:?}",std::thread::current().id(),self.inner().id,self.inner,self.inner().local_cnt.load(Ordering::SeqCst),self.inner().total_local_cnt.load(Ordering::SeqCst)}; - let item = unsafe { *Box::from_raw(inner.item as *mut T) }; - - let d = Darc { - inner: self.inner as *mut DarcInner>>, - src_pe: self.src_pe, + pub fn into_localrw(self) -> IntoLocalRwDarcHandle { + + let wrapped_inner = WrappedInner { + inner: NonNull::new(self.inner as *mut DarcInner).expect("invalid darc pointer"), }; - d.inner_mut() - .update_item(Box::into_raw(Box::new(Arc::new(RwLock::new(item))))); - // d.print(); - LocalRwDarc { darc: d } - } - - #[doc(alias = "Collective")] - /// Converts this Darc into a [LocalRwDarc] - /// - /// This is a blocking collective call amongst all PEs in the Darc's team, only returning once every PE in the team has completed the call. - /// - /// Furthermore, this call will block while any additional references outside of the one making this call exist on each PE. It is not possible for the - /// pointed to object to wrapped by both a Darc and a LocalRwDarc simultaneously (on any PE). - /// - /// # Collective Operation - /// Requires all PEs associated with the `darc` to enter the call otherwise deadlock will occur (i.e. team barriers are being called internally) - /// - /// # Examples - /// ``` - /// use lamellar::darc::prelude::*; - /// - /// let world = LamellarWorldBuilder::new().build(); - /// - /// let five = Darc::new(&world,5).expect("PE in world team"); - /// let five_as_localdarc = five.blocking_into_localrw(); - /// ``` - pub fn blocking_into_localrw(self) -> LocalRwDarc { - if std::thread::current().id() != *crate::MAIN_THREAD { - let msg = format!(" - [LAMELLAR WARNING] You are calling `Darc::blocking_into_localrw` from within an async context which may lead to deadlock, it is recommended that you use `into_localrw().await;` instead! 
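
Editor's note: a short sketch of how the handle returned by `into_localrw` is expected to be used once this change lands. The variable names and value are illustrative; the conversion is collective, so every PE in the Darc's team must drive the handle.

```
// Sketch of the handle-based conversion and lock APIs shown in this patch;
// names and values are illustrative. The conversion is collective, so every
// PE in the Darc's team must drive the handle (await or block).
use lamellar::darc::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();
    let five = Darc::new(&world, 5).expect("PE in world team");

    // Collective: all PEs block on (or await) the conversion handle.
    let five_rw = five.into_localrw().block();

    // The local reader/writer locks are handle based as well.
    *five_rw.write().block() += 1;
    println!("local value: {:?}", five_rw.read().block());
}
```
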
- Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() - ); - if let Some(val) = config().blocking_call_warning { - if val { - println!("{msg}"); - } - } else { - println!("{msg}"); - } + let team = self.inner().team().clone(); + IntoLocalRwDarcHandle { + darc: self.into(), + team, + outstanding_future: Box::pin(async move { + DarcInner::block_on_outstanding(wrapped_inner, DarcMode::LocalRw, 0).await; + }), } - let inner = self.inner(); - let _cur_pe = inner.team().world_pe; - inner.team().block_on(DarcInner::block_on_outstanding( - WrappedInner { - inner: NonNull::new(self.inner as *mut DarcInner).expect("invalid darc pointer"), - }, - DarcMode::LocalRw, - 0, - )); - inner.local_cnt.fetch_add(1, Ordering::SeqCst); //we add this here because to account for moving inner into d - inner.total_local_cnt.fetch_add(1, Ordering::SeqCst); - // println! {"[{:?}] darc[{:?}] into_localrw {:?} {:?} {:?}",std::thread::current().id(),self.inner().id,self.inner,self.inner().local_cnt.load(Ordering::SeqCst),self.inner().total_local_cnt.load(Ordering::SeqCst)}; - let item = unsafe { *Box::from_raw(inner.item as *mut T) }; - - let d = Darc { - inner: self.inner as *mut DarcInner>>, - src_pe: self.src_pe, - }; - d.inner_mut() - .update_item(Box::into_raw(Box::new(Arc::new(RwLock::new(item))))); - // d.print(); - LocalRwDarc { darc: d } } #[doc(alias = "Collective")] /// Converts this Darc into a [GlobalRwDarc] /// - /// This is a blocking collective call amongst all PEs in the Darc's team, only returning once every PE in the team has completed the call. + /// This returns a handle (which is Future) thats needs to be `awaited` or `blocked` on to perform the operation. + /// Awaiting/blocking on the handle is a blocking collective call amongst all PEs in the Darc's team, only returning once every PE in the team has completed the call. /// - /// Furthermore, this call will block while any additional references outside of the one making this call exist on each PE. It is not possible for the - /// pointed to object to wrapped by both a GlobalRwDarc and a Darc simultaneously (on any PE). + /// Furthermore, the handle will not return while any additional references outside of the one making this call exist on each PE. It is not possible for the + /// pointed to object to wrapped by both a Darc and a LocalRwDarc simultaneously (on any PE). /// /// # Collective Operation - /// Requires all PEs associated with the `darc` to enter the call otherwise deadlock will occur (i.e. team barriers are being called internally) + /// Requires all PEs associated with the `darc` to await/block the handle otherwise deadlock will occur (i.e. 
team barriers are being called internally) /// /// # Examples /// ``` @@ -1497,91 +1432,20 @@ impl Darc { /// let world = LamellarWorldBuilder::new().build(); /// /// let five = Darc::new(&world,5).expect("PE in world team"); - /// let five_as_globaldarc = world.block_on(async move {five.into_globalrw().await}); + /// let five_as_globaldarc = five.into_globalrw().block(); /// ``` - pub async fn into_globalrw(self) -> GlobalRwDarc { - let inner = self.inner(); - let _cur_pe = inner.team().world_pe; - DarcInner::block_on_outstanding( - WrappedInner { - inner: NonNull::new(self.inner as *mut DarcInner).expect("invalid darc pointer"), - }, - DarcMode::GlobalRw, - 0, - ) - .await; - inner.local_cnt.fetch_add(1, Ordering::SeqCst); //we add this here because to account for moving inner into d - inner.total_local_cnt.fetch_add(1, Ordering::SeqCst); - // println! {"[{:?}] darc[{:?}] into_globalrw {:?} {:?} {:?}",std::thread::current().id(),self.inner().id,self.inner,self.inner().local_cnt.load(Ordering::SeqCst),self.inner().total_local_cnt.load(Ordering::SeqCst)}; - - let item = unsafe { Box::from_raw(inner.item as *mut T) }; - let d = Darc { - inner: self.inner as *mut DarcInner>, - src_pe: self.src_pe, + pub fn into_globalrw(self) -> IntoGlobalRwDarcHandle { + let wrapped_inner = WrappedInner { + inner: NonNull::new(self.inner as *mut DarcInner).expect("invalid darc pointer"), }; - d.inner_mut() - .update_item(Box::into_raw(Box::new(DistRwLock::new( - *item, - self.inner().team(), - )))); - GlobalRwDarc { darc: d } - } - - #[doc(alias = "Collective")] - /// Converts this Darc into a [GlobalRwDarc] - /// - /// This is a blocking collective call amongst all PEs in the Darc's team, only returning once every PE in the team has completed the call. - /// - /// Furthermore, this call will block while any additional references outside of the one making this call exist on each PE. It is not possible for the - /// pointed to object to wrapped by both a GlobalRwDarc and a Darc simultaneously (on any PE). - /// - /// # Collective Operation - /// Requires all PEs associated with the `darc` to enter the call otherwise deadlock will occur (i.e. team barriers are being called internally) - /// - /// # Examples - /// ``` - /// use lamellar::darc::prelude::*; - /// - /// let world = LamellarWorldBuilder::new().build(); - /// - /// let five = Darc::new(&world,5).expect("PE in world team"); - /// let five_as_globaldarc = five.blocking_into_globalrw(); - /// ``` - pub fn blocking_into_globalrw(self) -> GlobalRwDarc { - if std::thread::current().id() != *crate::MAIN_THREAD { - let msg = format!(" - [LAMELLAR WARNING] You are calling `Darc::blocking_into_globalrw` from within an async context which may lead to deadlock, it is recommended that you use `into_globalrw().await;` instead! 
- Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() - ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), - } + let team = self.inner().team().clone(); + IntoGlobalRwDarcHandle { + darc: self.into(), + team, + outstanding_future: Box::pin(async move { + DarcInner::block_on_outstanding(wrapped_inner, DarcMode::GlobalRw, 0).await; + }), } - let inner = self.inner(); - let _cur_pe = inner.team().world_pe; - inner.team().block_on(DarcInner::block_on_outstanding( - WrappedInner { - inner: NonNull::new(self.inner as *mut DarcInner).expect("invalid darc pointer"), - }, - DarcMode::GlobalRw, - 0, - )); - inner.local_cnt.fetch_add(1, Ordering::SeqCst); //we add this here because to account for moving inner into d - inner.total_local_cnt.fetch_add(1, Ordering::SeqCst); - // println! {"[{:?}] darc[{:?}] into_globalrw {:?} {:?} {:?}",std::thread::current().id(),self.inner().id,self.inner,self.inner().local_cnt.load(Ordering::SeqCst),self.inner().total_local_cnt.load(Ordering::SeqCst)}; - - let item = unsafe { Box::from_raw(inner.item as *mut T) }; - let d = Darc { - inner: self.inner as *mut DarcInner>, - src_pe: self.src_pe, - }; - d.inner_mut() - .update_item(Box::into_raw(Box::new(DistRwLock::new( - *item, - self.inner().team(), - )))); - GlobalRwDarc { darc: d } } } diff --git a/src/darc/global_rw_darc.rs b/src/darc/global_rw_darc.rs index 45183b0a..895cfc40 100644 --- a/src/darc/global_rw_darc.rs +++ b/src/darc/global_rw_darc.rs @@ -18,6 +18,7 @@ use crate::{IdError, LamellarEnv, LamellarTeam}; use super::handle::{ GlobalRwDarcCollectiveWriteHandle, GlobalRwDarcReadHandle, GlobalRwDarcWriteHandle, + IntoDarcHandle, IntoLocalRwDarcHandle, }; #[derive(serde::Serialize, serde::Deserialize, Debug)] @@ -62,7 +63,7 @@ impl DistRwLock { data: std::cell::UnsafeCell::new(data), } } - fn into_inner(self) -> T { + pub(crate) fn into_inner(self) -> T { self.data.into_inner() } } @@ -482,7 +483,7 @@ impl crate::active_messaging::DarcSerde for GlobalRwDarc { } impl GlobalRwDarc { - fn inner(&self) -> &DarcInner> { + pub(crate) fn inner(&self) -> &DarcInner> { self.darc.inner() } @@ -751,13 +752,14 @@ impl GlobalRwDarc { #[doc(alias = "Collective")] /// Converts this GlobalRwDarc into a regular [Darc] /// - /// This is a blocking collective call amongst all PEs in the GlobalRwDarc's team, only returning once every PE in the team has completed the call. + /// This returns a handle (which is Future) thats needs to be `awaited` or `blocked` on to perform the operation. + /// Awaiting/blocking on the handle is a blocking collective call amongst all PEs in the Darc's team, only returning once every PE in the team has completed the call. /// - /// Furthermore, this call will block while any additional references outside of the one making this call exist on each PE. It is not possible for the - /// pointed to object to wrapped by both a Darc and a GlobalRwDarc simultaneously (on any PE). + /// Furthermore, the handle will not return while any additional references outside of the one making this call exist on each PE. It is not possible for the + /// pointed to object to wrapped by both a Darc and a LocalRwDarc simultaneously (on any PE). /// /// # Collective Operation - /// Requires all PEs associated with the `darc` to enter the call otherwise deadlock will occur (i.e. 
team barriers are being called internally) + /// Requires all PEs associated with the `darc` to await/block the handle otherwise deadlock will occur (i.e. team barriers are being called internally) /// /// # Examples /// ``` @@ -766,94 +768,34 @@ impl GlobalRwDarc { /// let world = LamellarWorldBuilder::new().build(); /// /// let five = GlobalRwDarc::new(&world,5).expect("PE in world team"); - /// let five_as_darc = world.block_on(async move {five.into_darc()}); + /// let five_as_darc = five.into_darc().block(); /// ``` - pub async fn into_darc(self) -> Darc { - let inner = self.inner(); - // println!("into_darc"); - // self.print(); - DarcInner::block_on_outstanding( - WrappedInner { - inner: NonNull::new(self.darc.inner as *mut DarcInner) - .expect("invalid darc pointer"), - }, - DarcMode::Darc, - 0, - ) - .await; - inner.local_cnt.fetch_add(1, Ordering::SeqCst); //we add this here because to account for moving inner into d - let item = unsafe { Box::from_raw(inner.item as *mut DistRwLock).into_inner() }; - let d = Darc { - inner: self.darc.inner as *mut DarcInner, - src_pe: self.darc.src_pe, - // phantom: PhantomData, + pub fn into_darc(self) -> IntoDarcHandle { + let wrapped_inner = WrappedInner { + inner: NonNull::new(self.darc.inner as *mut DarcInner) + .expect("invalid darc pointer"), }; - d.inner_mut().update_item(Box::into_raw(Box::new(item))); - d - } - #[doc(alias = "Collective")] - /// Converts this GlobalRwDarc into a regular [Darc] - /// - /// This is a blocking collective call amongst all PEs in the GlobalRwDarc's team, only returning once every PE in the team has completed the call. - /// - /// Furthermore, this call will block while any additional references outside of the one making this call exist on each PE. It is not possible for the - /// pointed to object to wrapped by both a Darc and a GlobalRwDarc simultaneously (on any PE). - /// - /// # Collective Operation - /// Requires all PEs associated with the `darc` to enter the call otherwise deadlock will occur (i.e. team barriers are being called internally) - /// - /// # Examples - /// ``` - /// use lamellar::darc::prelude::*; - /// - /// let world = LamellarWorldBuilder::new().build(); - /// - /// let five = GlobalRwDarc::new(&world,5).expect("PE in world team"); - /// let five_as_darc = five.into_darc(); - /// ``` - pub fn blocking_into_darc(self) -> Darc { - if std::thread::current().id() != *crate::MAIN_THREAD { - let msg = format!(" - [LAMELLAR WARNING] You are calling `GlobalRwDarc::blocking_into_darc` from within an async context which may lead to deadlock, it is recommended that you use `into_darc().await;` instead! 
- Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() - ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), - } + let team = self.darc.inner().team().clone(); + IntoDarcHandle { + darc: self.into(), + team, + outstanding_future: Box::pin(async move { + DarcInner::block_on_outstanding(wrapped_inner, DarcMode::Darc, 0).await; + }), } - let inner = self.inner(); - // println!("into_darc"); - // self.print(); - inner.team().block_on(DarcInner::block_on_outstanding( - WrappedInner { - inner: NonNull::new(self.darc.inner as *mut DarcInner) - .expect("invalid darc pointer"), - }, - DarcMode::Darc, - 0, - )); - inner.local_cnt.fetch_add(1, Ordering::SeqCst); //we add this here because to account for moving inner into d - let item = unsafe { Box::from_raw(inner.item as *mut DistRwLock).into_inner() }; - let d = Darc { - inner: self.darc.inner as *mut DarcInner, - src_pe: self.darc.src_pe, - // phantom: PhantomData, - }; - d.inner_mut().update_item(Box::into_raw(Box::new(item))); - d } #[doc(alias = "Collective")] /// Converts this GlobalRwDarc into a [LocalRwDarc] /// - /// This is a blocking collective call amongst all PEs in the GlobalRwDarc's team, only returning once every PE in the team has completed the call. + /// This returns a handle (which is Future) thats needs to be `awaited` or `blocked` on to perform the operation. + /// Awaiting/blocking on the handle is a blocking collective call amongst all PEs in the Darc's team, only returning once every PE in the team has completed the call. /// - /// Furthermore, this call will block while any additional references outside of the one making this call exist on each PE. It is not possible for the - /// pointed to object to wrapped by both a GlobalRwDarc and a LocalRwDarc simultaneously (on any PE). + /// Furthermore, the handle will not return while any additional references outside of the one making this call exist on each PE. It is not possible for the + /// pointed to object to wrapped by both a Darc and a LocalRwDarc simultaneously (on any PE). /// /// # Collective Operation - /// Requires all PEs associated with the `darc` to enter the call otherwise deadlock will occur (i.e. team barriers are being called internally) + /// Requires all PEs associated with the `darc` to await/block the handle otherwise deadlock will occur (i.e. 
team barriers are being called internally) /// /// # Examples /// ``` @@ -864,83 +806,19 @@ impl GlobalRwDarc { /// let five = GlobalRwDarc::new(&world,5).expect("PE in world team"); /// let five_as_localdarc = world.block_on(async move {five.into_localrw()}); /// ``` - pub async fn into_localrw(self) -> LocalRwDarc { - let inner = self.inner(); - // println!("into_localrw"); - // self.print(); - DarcInner::block_on_outstanding( - WrappedInner { - inner: NonNull::new(self.darc.inner as *mut DarcInner) - .expect("invalid darc pointer"), - }, - DarcMode::LocalRw, - 0, - ) - .await; - inner.local_cnt.fetch_add(1, Ordering::SeqCst); //we add this here because to account for moving inner into d - let item = unsafe { Box::from_raw(inner.item as *mut DistRwLock).into_inner() }; - let d = Darc { - inner: self.darc.inner as *mut DarcInner>>, - src_pe: self.darc.src_pe, - // phantom: PhantomData, + pub fn into_localrw(self) -> IntoLocalRwDarcHandle { + let wrapped_inner = WrappedInner { + inner: NonNull::new(self.darc.inner as *mut DarcInner) + .expect("invalid darc pointer"), }; - d.inner_mut() - .update_item(Box::into_raw(Box::new(Arc::new(RwLock::new(item))))); - LocalRwDarc { darc: d } - } - - #[doc(alias = "Collective")] - /// Converts this GlobalRwDarc into a [LocalRwDarc] - /// - /// This is a blocking collective call amongst all PEs in the GlobalRwDarc's team, only returning once every PE in the team has completed the call. - /// - /// Furthermore, this call will block while any additional references outside of the one making this call exist on each PE. It is not possible for the - /// pointed to object to wrapped by both a GlobalRwDarc and a LocalRwDarc simultaneously (on any PE). - /// - /// # Collective Operation - /// Requires all PEs associated with the `darc` to enter the call otherwise deadlock will occur (i.e. team barriers are being called internally) - /// - /// # Examples - /// ``` - /// use lamellar::darc::prelude::*; - /// - /// let world = LamellarWorldBuilder::new().build(); - /// - /// let five = GlobalRwDarc::new(&world,5).expect("PE in world team"); - /// let five_as_localdarc = five.into_localrw(); - /// ``` - pub fn blocking_into_localrw(self) -> LocalRwDarc { - if std::thread::current().id() != *crate::MAIN_THREAD { - let msg = format!(" - [LAMELLAR WARNING] You are calling `GlobalRwDarc::blocking_into_localrw` from within an async context which may lead to deadlock, it is recommended that you use `into_localrw().await;` instead! 
- Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() - ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), - } + let team = self.darc.inner().team().clone(); + IntoLocalRwDarcHandle { + darc: self.into(), + team, + outstanding_future: Box::pin(async move { + DarcInner::block_on_outstanding(wrapped_inner, DarcMode::LocalRw, 0).await; + }), } - let inner = self.inner(); - // println!("into_localrw"); - // self.print(); - inner.team().block_on(DarcInner::block_on_outstanding( - WrappedInner { - inner: NonNull::new(self.darc.inner as *mut DarcInner) - .expect("invalid darc pointer"), - }, - DarcMode::LocalRw, - 0, - )); - inner.local_cnt.fetch_add(1, Ordering::SeqCst); //we add this here because to account for moving inner into d - let item = unsafe { Box::from_raw(inner.item as *mut DistRwLock).into_inner() }; - let d = Darc { - inner: self.darc.inner as *mut DarcInner>>, - src_pe: self.darc.src_pe, - // phantom: PhantomData, - }; - d.inner_mut() - .update_item(Box::into_raw(Box::new(Arc::new(RwLock::new(item))))); - LocalRwDarc { darc: d } } } diff --git a/src/darc/handle.rs b/src/darc/handle.rs index db1b4274..46acb340 100644 --- a/src/darc/handle.rs +++ b/src/darc/handle.rs @@ -6,17 +6,18 @@ use std::task::{Context, Poll}; use crate::darc::local_rw_darc::{LocalRwDarc, LocalRwDarcReadGuard}; use crate::lamellar_request::LamellarRequest; -use crate::AmHandle; -use crate::{config, GlobalRwDarc}; +use crate::{config, darc, GlobalRwDarc, LamellarTeamRT}; +use crate::{AmHandle, Darc}; -use async_lock::{RwLockReadGuardArc, RwLockWriteGuardArc}; +use async_lock::{RwLock, RwLockReadGuardArc, RwLockWriteGuardArc}; use futures_util::{ready, Future}; use pin_project::pin_project; use super::global_rw_darc::{ - GlobalRwDarcCollectiveWriteGuard, GlobalRwDarcReadGuard, GlobalRwDarcWriteGuard, + DistRwLock, GlobalRwDarcCollectiveWriteGuard, GlobalRwDarcReadGuard, GlobalRwDarcWriteGuard, }; use super::local_rw_darc::LocalRwDarcWriteGuard; +use super::DarcInner; #[pin_project(project = StateProj)] enum State { @@ -529,3 +530,271 @@ impl Future for GlobalRwDarcCollectiveWriteHandle { }) } } + +pub(crate) enum OrigDarc { + Darc(Darc), + LocalRw(LocalRwDarc), + GlobalRw(GlobalRwDarc), +} + +impl From> for OrigDarc { + fn from(darc: Darc) -> Self { + OrigDarc::Darc(darc) + } +} + +impl From> for OrigDarc { + fn from(darc: LocalRwDarc) -> Self { + OrigDarc::LocalRw(darc) + } +} + +impl From> for OrigDarc { + fn from(darc: GlobalRwDarc) -> Self { + OrigDarc::GlobalRw(darc) + } +} + +impl OrigDarc { + fn inc_local_cnt(&self) { + match self { + OrigDarc::Darc(darc) => darc.inc_local_cnt(1), + OrigDarc::LocalRw(darc) => darc.darc.inc_local_cnt(1), + OrigDarc::GlobalRw(darc) => darc.darc.inc_local_cnt(1), + } + } + fn inner(&self) -> *mut DarcInner { + match self { + OrigDarc::Darc(darc) => darc.inner_mut() as *mut _ as *mut DarcInner, + OrigDarc::LocalRw(darc) => darc.darc.inner_mut() as *mut _ as *mut DarcInner, + OrigDarc::GlobalRw(darc) => darc.darc.inner_mut() as *mut _ as *mut DarcInner, + } + } + fn src_pe(&self) -> usize { + match self { + OrigDarc::Darc(darc) => darc.src_pe, + OrigDarc::LocalRw(darc) => darc.darc.src_pe, + OrigDarc::GlobalRw(darc) => darc.darc.src_pe, + } + } + unsafe fn get_item(&self) -> T { + match self { + OrigDarc::Darc(darc) => *Box::from_raw(darc.inner().item as *mut T), + OrigDarc::LocalRw(darc) => { + let mut 
arc_item = + (*Box::from_raw(darc.inner().item as *mut Arc>)).clone(); + let item: T = loop { + arc_item = match Arc::try_unwrap(arc_item) { + Ok(item) => break item.into_inner(), + Err(arc_item) => arc_item, + }; + std::thread::yield_now(); + }; + item + } + OrigDarc::GlobalRw(darc) => { + Box::from_raw(darc.inner().item as *mut DistRwLock).into_inner() + } + } + } +} + +#[must_use] +#[pin_project] +#[doc(alias = "Collective")] +/// This is a handle representing the operation of changing from a [LocalRwDarc] or [GlobalRwDarc] into a regular [Darc]. +/// This handled must either be awaited in an async context or blocked on in a non-async context for the operation to be performed. +/// Awaiting/blocking on the handle is a blocking collective call amongst all PEs in the Darc's team, only returning once every PE in the team has completed the call. +/// +/// Furthermore, the handle will not return while any additional references outside of the one making this call exist on each PE. It is not possible for the +/// pointed to object to wrapped by both a Darc and a LocalRwDarc simultaneously (on any PE). +/// +/// # Collective Operation +/// Requires all PEs associated with the `darc` to await/block the handle otherwise deadlock will occur (i.e. team barriers are being called internally) +/// +/// # Examples +/// ``` +/// use lamellar::darc::prelude::*; +/// +/// let world = LamellarWorldBuilder::new().build(); +/// +/// let five = LocalRwDarc::new(&world,5).expect("PE in world team"); +/// let five_as_darc = five.into_darc().block(); +/// /* alternatively something like the following is valid as well +/// let five_as_darc = world.block_on(async move{ +/// five.into_darc().await; +/// }) +/// */ +/// ``` +pub struct IntoDarcHandle { + pub(crate) darc: OrigDarc, + pub(crate) team: Pin>, + #[pin] + pub(crate) outstanding_future: Pin + Send>>, +} + +impl IntoDarcHandle { + /// Used to drive to conversion of a [LocalRwDarc] or [GlobalRwDarc] into a [Darc] + /// # Examples + /// + ///``` + /// use lamellar::darc::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let five = LocalRwDarc::new(&world,5).expect("PE in world team"); + /// let five_as_darc = five.into_darc().block(); + pub fn block(self) -> Darc { + self.team.clone().block_on(self) + } +} + +impl Future for IntoDarcHandle { + type Output = Darc; + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let mut this = self.project(); + ready!(this.outstanding_future.as_mut().poll(cx)); + this.darc.inc_local_cnt(); + let item = unsafe { this.darc.get_item() }; + let darc: Darc = Darc { + inner: this.darc.inner(), + src_pe: this.darc.src_pe(), + }; + darc.inner_mut().update_item(Box::into_raw(Box::new(item))); + Poll::Ready(darc) + } +} + +#[must_use] +#[pin_project] +#[doc(alias = "Collective")] +/// This is a handle representing the operation of changing from a [Darc] or [GlobalRwDarc] into a [LocalRwDarc]. +/// This handled must either be awaited in an async context or blocked on in a non-async context for the operation to be performed. +/// Awaiting/blocking on the handle is a blocking collective call amongst all PEs in the Darc's team, only returning once every PE in the team has completed the call. +/// +/// Furthermore, the handle will not return while any additional references outside of the one making this call exist on each PE. It is not possible for the +/// pointed to object to wrapped by both a Darc and a LocalRwDarc simultaneously (on any PE). 
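
Editor's note: the conversion handles introduced here all share one shape: a boxed future that performs the collective bookkeeping, plus a `block` helper that hands the handle to the owning team's `block_on`. A condensed, self-contained sketch of that shape follows; `ConversionHandle` is illustrative, not the crate's actual type.

```
// Condensed sketch of the shape used by IntoDarcHandle, IntoLocalRwDarcHandle
// and IntoGlobalRwDarcHandle. `ConversionHandle` is illustrative only.
use std::future::Future;
use std::pin::Pin;
use std::task::{Context, Poll};

use futures_util::ready;

#[must_use]
struct ConversionHandle<T> {
    // Collective bookkeeping (waiting on outstanding references, barriers, ...)
    // lives in a boxed future; Pin<Box<_>> is Unpin, so no projection is needed.
    outstanding: Pin<Box<dyn Future<Output = ()> + Send>>,
    result: Option<T>,
}

impl<T: Unpin> Future for ConversionHandle<T> {
    type Output = T;
    fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<T> {
        let this = self.get_mut();
        // First drive the collective part to completion on this PE.
        ready!(this.outstanding.as_mut().poll(cx));
        // Then hand back the converted value; the real handles rebuild the
        // appropriate Darc variant here instead of storing it up front.
        Poll::Ready(this.result.take().expect("handle polled after completion"))
    }
}
// The crate's `block()` helpers simply pass the handle to the owning team's
// `block_on`, which is why they can be called from non-async code.
```
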
+/// +/// # Collective Operation +/// Requires all PEs associated with the `darc` to await/block the handle otherwise deadlock will occur (i.e. team barriers are being called internally) +/// +/// # Examples +/// ``` +/// use lamellar::darc::prelude::*; +/// +/// let world = LamellarWorldBuilder::new().build(); +/// +/// let five = GlobalRwDarc::new(&world,5).expect("PE in world team"); +/// let five_as_localrw = five.into_localrw().block(); +/// /* alternatively something like the following is valid as well +/// let five_as_localrw = world.block_on(async move{ +/// five.into_localrw().await; +/// }) +/// */ +/// ``` +pub struct IntoLocalRwDarcHandle { + pub(crate) darc: OrigDarc, + pub(crate) team: Pin>, + #[pin] + pub(crate) outstanding_future: Pin + Send>>, +} + +impl IntoLocalRwDarcHandle { + /// Used to drive to conversion of a [Darc] or [GlobalRwDarc] into a [LocalRwDarc] + /// # Examples + /// + ///``` + /// use lamellar::darc::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let five = GlobalRwDarc::new(&world,5).expect("PE in world team"); + /// let five_as_localrw = five.into_localrw().block(); + pub fn block(self) -> LocalRwDarc { + self.team.clone().block_on(self) + } +} + +impl Future for IntoLocalRwDarcHandle { + type Output = LocalRwDarc; + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let mut this = self.project(); + ready!(this.outstanding_future.as_mut().poll(cx)); + this.darc.inc_local_cnt(); + let item = unsafe { this.darc.get_item() }; + let darc: Darc>> = Darc { + inner: this.darc.inner(), + src_pe: this.darc.src_pe(), + }; + darc.inner_mut() + .update_item(Box::into_raw(Box::new(Arc::new(RwLock::new(item))))); + Poll::Ready(LocalRwDarc { darc }) + } +} + +#[must_use] +#[pin_project] +#[doc(alias = "Collective")] +/// This is a handle representing the operation of changing from a [Darc] or [LocalRwDarc] into a [GlobalRwDarc]. +/// This handled must either be awaited in an async context or blocked on in a non-async context for the operation to be performed. +/// Awaiting/blocking on the handle is a blocking collective call amongst all PEs in the Darc's team, only returning once every PE in the team has completed the call. +/// +/// Furthermore, the handle will not return while any additional references outside of the one making this call exist on each PE. It is not possible for the +/// pointed to object to wrapped by both a Darc and a LocalRwDarc simultaneously (on any PE). +/// +/// # Collective Operation +/// Requires all PEs associated with the `darc` to await/block the handle otherwise deadlock will occur (i.e. 
team barriers are being called internally) +/// +/// # Examples +/// ``` +/// use lamellar::darc::prelude::*; +/// +/// let world = LamellarWorldBuilder::new().build(); +/// +/// let five = LocalRwDarc::new(&world,5).expect("PE in world team"); +/// let five_as_globalrw = five.into_globalrw().block(); +/// /* alternatively something like the following is valid as well +/// let five_as_globalrw = world.block_on(async move{ +/// five.into_globalrw().await; +/// }) +/// */ +/// ``` +pub struct IntoGlobalRwDarcHandle { + pub(crate) darc: OrigDarc, + pub(crate) team: Pin>, + #[pin] + pub(crate) outstanding_future: Pin + Send>>, +} + +impl IntoGlobalRwDarcHandle { + /// Used to drive to conversion of a [Darc] or [LocalRwDarc] into a [GlobalRwDarc] + /// # Examples + /// + ///``` + /// use lamellar::darc::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let five = LocalRwDarc::new(&world,5).expect("PE in world team"); + /// let five_as_globalrw = five.into_globalrw().block(); + pub fn block(self) -> GlobalRwDarc { + self.team.clone().block_on(self) + } +} + +impl Future for IntoGlobalRwDarcHandle { + type Output = GlobalRwDarc; + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let mut this = self.project(); + ready!(this.outstanding_future.as_mut().poll(cx)); + this.darc.inc_local_cnt(); + let item = unsafe { this.darc.get_item() }; + let darc: Darc> = Darc { + inner: this.darc.inner(), + src_pe: this.darc.src_pe(), + }; + darc.inner_mut() + .update_item(Box::into_raw(Box::new(DistRwLock::new( + item, + this.team.clone(), + )))); + Poll::Ready(GlobalRwDarc { darc }) + } +} diff --git a/src/darc/local_rw_darc.rs b/src/darc/local_rw_darc.rs index c4c2dbbe..a315d9e4 100644 --- a/src/darc/local_rw_darc.rs +++ b/src/darc/local_rw_darc.rs @@ -27,7 +27,9 @@ use crate::lamellar_team::IntoLamellarTeam; use crate::scheduler::LamellarTask; use crate::{IdError, LamellarEnv, LamellarTeam}; -use super::handle::{LocalRwDarcReadHandle, LocalRwDarcWriteHandle}; +use super::handle::{ + IntoDarcHandle, IntoGlobalRwDarcHandle, LocalRwDarcReadHandle, LocalRwDarcWriteHandle, +}; #[derive(Debug)] pub struct LocalRwDarcReadGuard { @@ -160,7 +162,7 @@ impl crate::active_messaging::DarcSerde for LocalRwDarc { } impl LocalRwDarc { - fn inner(&self) -> &DarcInner>> { + pub(crate) fn inner(&self) -> &DarcInner>> { self.darc.inner() } @@ -206,7 +208,7 @@ impl LocalRwDarc { /// Creates a handle for aquiring a reader lock of this LocalRwDarc local to this PE. /// The returned handle must either be await'd `.read().await` within an async context /// or it must be blocked on `.read().block()` in a non async context to actually acquire the lock - /// + /// /// After awaiting or blocking on the handle, a RAII guard is returned which will drop the read access of the wrlock when dropped /// /// # One-sided Operation @@ -248,7 +250,7 @@ impl LocalRwDarc { /// Creates a handle for aquiring a writer lock of this LocalRwDarc local to this PE. 
/// The returned handle must either be await'd `.write().await` within an async context /// or it must be blocked on `.write().block()` in a non async context to actually acquire the lock - /// + /// /// After awaiting or blocking on the handle, a RAII guard is returned which will drop the write access of the wrlock when dropped /// /// # One-sided Operation @@ -327,13 +329,14 @@ impl LocalRwDarc { #[doc(alias = "Collective")] /// Converts this LocalRwDarc into a [GlobalRwDarc] /// - /// This is a blocking collective call amongst all PEs in the LocalRwDarc's team, only returning once every PE in the team has completed the call. + /// This returns a handle (which is Future) thats needs to be `awaited` or `blocked` on to perform the operation. + /// Awaiting/blocking on the handle is a blocking collective call amongst all PEs in the Darc's team, only returning once every PE in the team has completed the call. /// - /// Furthermore, this call will block while any additional references outside of the one making this call exist on each PE. It is not possible for the - /// pointed to object to wrapped by both a GlobalRwDarc and a LocalRwDarc simultaneously (on any PE). + /// Furthermore, the handle will not return while any additional references outside of the one making this call exist on each PE. It is not possible for the + /// pointed to object to wrapped by both a Darc and a LocalRwDarc simultaneously (on any PE). /// /// # Collective Operation - /// Requires all PEs associated with the `darc` to enter the call otherwise deadlock will occur (i.e. team barriers are being called internally) + /// Requires all PEs associated with the `darc` to await/block the handle otherwise deadlock will occur (i.e. team barriers are being called internally) /// /// # Examples /// ``` @@ -344,103 +347,21 @@ impl LocalRwDarc { /// let five = LocalRwDarc::new(&world,5).expect("PE in world team"); /// let five_as_globaldarc = world.block_on(async move {five.into_globalrw().await}); /// ``` - pub async fn into_globalrw(self) -> GlobalRwDarc { - let inner = self.inner(); - // println!("into_darc"); - // self.print(); - DarcInner::block_on_outstanding( - WrappedInner { - inner: NonNull::new(self.darc.inner as *mut DarcInner) - .expect("invalid darc pointer"), - }, - DarcMode::GlobalRw, - 0, - ) - .await; - // println!("after block on outstanding"); - inner.local_cnt.fetch_add(1, Ordering::SeqCst); //we add this here because to account for moving inner into d - let mut arc_item = unsafe { (*Box::from_raw(inner.item as *mut Arc>)).clone() }; - let item: T = loop { - arc_item = match Arc::try_unwrap(arc_item) { - Ok(item) => break item.into_inner(), - Err(arc_item) => arc_item, - }; - }; - let d = Darc { - inner: self.darc.inner as *mut DarcInner>, - src_pe: self.darc.src_pe, - // phantom: PhantomData, + pub fn into_globalrw(self) -> IntoGlobalRwDarcHandle { + let wrapped_inner = WrappedInner { + inner: NonNull::new(self.darc.inner as *mut DarcInner) + .expect("invalid darc pointer"), }; - d.inner_mut() - .update_item(Box::into_raw(Box::new(DistRwLock::new( - item, - self.inner().team(), - )))); - GlobalRwDarc { darc: d } - } - - #[doc(alias = "Collective")] - /// Converts this LocalRwDarc into a [GlobalRwDarc] - /// - /// This is a blocking collective call amongst all PEs in the LocalRwDarc's team, only returning once every PE in the team has completed the call. - /// - /// Furthermore, this call will block while any additional references outside of the one making this call exist on each PE. 
It is not possible for the - /// pointed to object to wrapped by both a GlobalRwDarc and a LocalRwDarc simultaneously (on any PE). - /// - /// # Collective Operation - /// Requires all PEs associated with the `darc` to enter the call otherwise deadlock will occur (i.e. team barriers are being called internally) - /// - /// # Examples - /// ``` - /// use lamellar::darc::prelude::*; - /// - /// let world = LamellarWorldBuilder::new().build(); - /// - /// let five = LocalRwDarc::new(&world,5).expect("PE in world team"); - /// let five_as_globaldarc = five.blocking_into_globalrw(); - /// ``` - pub fn blocking_into_globalrw(self) -> GlobalRwDarc { - if std::thread::current().id() != *crate::MAIN_THREAD { - let msg = format!(" - [LAMELLAR WARNING] You are calling `LocalRwDarc::blocking_into_globalrw` from within an async context which may lead to deadlock, it is recommended that you use `into_globalrw().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() - ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), - } + let team = self.darc.inner().team().clone(); + IntoGlobalRwDarcHandle { + darc: self.into(), + team, + outstanding_future: Box::pin(DarcInner::block_on_outstanding( + wrapped_inner, + DarcMode::GlobalRw, + 0, + )), } - let inner = self.inner(); - // println!("into_darc"); - // self.print(); - inner.team().block_on(DarcInner::block_on_outstanding( - WrappedInner { - inner: NonNull::new(self.darc.inner as *mut DarcInner) - .expect("invalid darc pointer"), - }, - DarcMode::GlobalRw, - 0, - )); - // println!("after block on outstanding"); - inner.local_cnt.fetch_add(1, Ordering::SeqCst); //we add this here because to account for moving inner into d - let mut arc_item = unsafe { (*Box::from_raw(inner.item as *mut Arc>)).clone() }; - let item: T = loop { - arc_item = match Arc::try_unwrap(arc_item) { - Ok(item) => break item.into_inner(), - Err(arc_item) => arc_item, - }; - }; - let d = Darc { - inner: self.darc.inner as *mut DarcInner>, - src_pe: self.darc.src_pe, - // phantom: PhantomData, - }; - d.inner_mut() - .update_item(Box::into_raw(Box::new(DistRwLock::new( - item, - self.inner().team(), - )))); - GlobalRwDarc { darc: d } } } @@ -448,13 +369,14 @@ impl LocalRwDarc { #[doc(alias = "Collective")] /// Converts this LocalRwDarc into a regular [Darc] /// - /// This is a blocking collective call amongst all PEs in the LocalRwDarc's team, only returning once every PE in the team has completed the call. + /// This returns a handle (which is Future) thats needs to be `awaited` or `blocked` on to perform the operation. + /// Awaiting/blocking on the handle is a blocking collective call amongst all PEs in the Darc's team, only returning once every PE in the team has completed the call. /// - /// Furthermore, this call will block while any additional references outside of the one making this call exist on each PE. It is not possible for the + /// Furthermore, the handle will not return while any additional references outside of the one making this call exist on each PE. It is not possible for the /// pointed to object to wrapped by both a Darc and a LocalRwDarc simultaneously (on any PE). /// /// # Collective Operation - /// Requires all PEs associated with the `darc` to enter the call otherwise deadlock will occur (i.e. 
team barriers are being called internally) + /// Requires all PEs associated with the `darc` to await/block the handle otherwise deadlock will occur (i.e. team barriers are being called internally) /// /// # Examples /// ``` @@ -463,101 +385,21 @@ impl LocalRwDarc { /// let world = LamellarWorldBuilder::new().build(); /// /// let five = LocalRwDarc::new(&world,5).expect("PE in world team"); - /// let five_as_darc = world.block_on(async move {five.into_darc()}); + /// let five_as_darc = five.into_darc().block(); /// ``` - pub async fn into_darc(self) -> Darc { - let inner = self.inner(); - // println!("into_darc"); - // self.print(); - DarcInner::block_on_outstanding( - WrappedInner { - inner: NonNull::new(self.darc.inner as *mut DarcInner) - .expect("invalid darc pointer"), - }, - DarcMode::Darc, - 0, - ) - .await; - // println!("after block on outstanding"); - inner.local_cnt.fetch_add(1, Ordering::SeqCst); //we add this here because to account for moving inner into d - // let item = unsafe { Box::from_raw(inner.item as *mut Arc>).into_inner() }; - let mut arc_item = unsafe { (*Box::from_raw(inner.item as *mut Arc>)).clone() }; - - let item: T = loop { - arc_item = match Arc::try_unwrap(arc_item) { - Ok(item) => break item.into_inner(), - Err(arc_item) => arc_item, - }; - }; - let d = Darc { - inner: self.darc.inner as *mut DarcInner, - src_pe: self.darc.src_pe, - // phantom: PhantomData, + pub fn into_darc(self) -> IntoDarcHandle { + let wrapped_inner = WrappedInner { + inner: NonNull::new(self.darc.inner as *mut DarcInner) + .expect("invalid darc pointer"), }; - d.inner_mut().update_item(Box::into_raw(Box::new(item))); //the darc will free this approriately - d - } - - #[doc(alias = "Collective")] - /// Converts this LocalRwDarc into a regular [Darc] - /// - /// This is a blocking collective call amongst all PEs in the LocalRwDarc's team, only returning once every PE in the team has completed the call. - /// - /// Furthermore, this call will block while any additional references outside of the one making this call exist on each PE. It is not possible for the - /// pointed to object to wrapped by both a Darc and a LocalRwDarc simultaneously (on any PE). - /// - /// # Collective Operation - /// Requires all PEs associated with the `darc` to enter the call otherwise deadlock will occur (i.e. team barriers are being called internally) - /// - /// # Examples - /// ``` - /// use lamellar::darc::prelude::*; - /// - /// let world = LamellarWorldBuilder::new().build(); - /// - /// let five = LocalRwDarc::new(&world,5).expect("PE in world team"); - /// let five_as_darc = five.blocking_into_darc(); - /// ``` - pub fn blocking_into_darc(self) -> Darc { - if std::thread::current().id() != *crate::MAIN_THREAD { - let msg = format!(" - [LAMELLAR WARNING] You are calling `LocalRwDarc::blocking_into_darc` from within an async context which may lead to deadlock, it is recommended that you use `into_darc().await;` instead! 
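
Editor's note: taken together, the handle-based conversions compose into a simple round trip. A hedged sketch, assuming the `.block()` forms shown in the updated doc examples; names and values are illustrative.

```
// Illustrative round trip through the handle-based conversions; names and
// values are not from this patch. Each conversion is collective, so all PEs
// must drive each handle.
use lamellar::darc::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();

    let counter = LocalRwDarc::new(&world, 0usize).expect("PE in world team");
    *counter.write().block() += 1;

    // Upgrade to a distributed reader/writer lock...
    let global = counter.into_globalrw().block();
    println!("global value: {:?}", *global.read().block());

    // ...then settle into a plain read-only Darc.
    let plain: Darc<usize> = global.into_darc().block();
    println!("final value: {:?}", *plain);
}
```
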
- Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() - ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), - } + let team = self.darc.inner().team().clone(); + IntoDarcHandle { + darc: self.into(), + team, + outstanding_future: Box::pin(async move { + DarcInner::block_on_outstanding(wrapped_inner, DarcMode::Darc, 0).await; + }), } - let inner = self.inner(); - // println!("into_darc"); - // self.print(); - inner.team().block_on(DarcInner::block_on_outstanding( - WrappedInner { - inner: NonNull::new(self.darc.inner as *mut DarcInner) - .expect("invalid darc pointer"), - }, - DarcMode::Darc, - 0, - )); - // println!("after block on outstanding"); - inner.local_cnt.fetch_add(1, Ordering::SeqCst); //we add this here because to account for moving inner into d - // let item = unsafe { Box::from_raw(inner.item as *mut Arc>).into_inner() }; - let mut arc_item = unsafe { (*Box::from_raw(inner.item as *mut Arc>)).clone() }; - - let item: T = loop { - arc_item = match Arc::try_unwrap(arc_item) { - Ok(item) => break item.into_inner(), - Err(arc_item) => arc_item, - }; - }; - let d = Darc { - inner: self.darc.inner as *mut DarcInner, - src_pe: self.darc.src_pe, - // phantom: PhantomData, - }; - d.inner_mut().update_item(Box::into_raw(Box::new(item))); //the darc will free this approriately - d } } diff --git a/src/scheduler/tokio_executor.rs b/src/scheduler/tokio_executor.rs index cbe80de3..fe358edf 100644 --- a/src/scheduler/tokio_executor.rs +++ b/src/scheduler/tokio_executor.rs @@ -3,6 +3,7 @@ use crate::scheduler::{Executor, LamellarExecutor, LamellarTask, LamellarTaskInn use tokio::runtime::Runtime; use futures_util::Future; +use std::sync::Arc; #[derive(Debug)] pub(crate) struct TokioRt { diff --git a/tests/array/arithmetic_ops/add_test.rs b/tests/array/arithmetic_ops/add_test.rs index f98f640b..387f44fa 100644 --- a/tests/array/arithmetic_ops/add_test.rs +++ b/tests/array/arithmetic_ops/add_test.rs @@ -64,7 +64,7 @@ macro_rules! check_val { macro_rules! onesided_iter { (GlobalLockArray,$array:ident) => { - $array.blocking_read_lock().onesided_iter() + $array.read_lock().block().onesided_iter() }; ($arraytype:ident,$array:ident) => { $array.onesided_iter() @@ -475,7 +475,7 @@ macro_rules! input_test{ // check_results!($array,array,num_pes,"LocalLockArray"); // LocalLockArray------------------------------ #[allow(unused_unsafe)] - let _ = unsafe{ array.batch_add(&input_array.blocking_read_local_data(),1).spawn()}; + let _ = unsafe{ array.batch_add(&input_array.read_local_data().block(),1).spawn()}; check_results!($array,array,num_pes,"&LocalLockArray"); println!("passed &LocalLockArray"); @@ -485,7 +485,7 @@ macro_rules! input_test{ // check_results!($array,array,num_pes,"GlobalLockArray"); // GlobalLockArray------------------------------ #[allow(unused_unsafe)] - let _ = unsafe{ array.batch_add(&input_array.blocking_read_local_data(),1).spawn()}; + let _ = unsafe{ array.batch_add(&input_array.read_local_data().block(),1).spawn()}; check_results!($array,array,num_pes,"&GlobalLockArray"); println!("passed &GlobalLockArray"); } diff --git a/tests/array/arithmetic_ops/div_test.rs b/tests/array/arithmetic_ops/div_test.rs index 2f7bdae9..29d88632 100644 --- a/tests/array/arithmetic_ops/div_test.rs +++ b/tests/array/arithmetic_ops/div_test.rs @@ -68,7 +68,7 @@ macro_rules! max_updates { macro_rules! 
onesided_iter { (GlobalLockArray,$array:ident) => { - $array.blocking_read_lock().onesided_iter() + $array.read_lock().block().onesided_iter() }; ($arraytype:ident,$array:ident) => { $array.onesided_iter() diff --git a/tests/array/arithmetic_ops/fetch_add_test.rs b/tests/array/arithmetic_ops/fetch_add_test.rs index 0aeb3a16..c14438ff 100644 --- a/tests/array/arithmetic_ops/fetch_add_test.rs +++ b/tests/array/arithmetic_ops/fetch_add_test.rs @@ -92,7 +92,7 @@ macro_rules! max_updates { macro_rules! onesided_iter { (GlobalLockArray,$array:ident) => { - $array.blocking_read_lock().onesided_iter() + $array.read_lock().block().onesided_iter() }; ($arraytype:ident,$array:ident) => { $array.onesided_iter() @@ -102,7 +102,8 @@ macro_rules! onesided_iter { macro_rules! buffered_onesided_iter { (GlobalLockArray,$array:ident) => { $array - .blocking_read_lock() + .read_lock() + .block() .buffered_onesided_iter($array.len()) }; ($arraytype:ident,$array:ident) => { @@ -586,7 +587,7 @@ macro_rules! input_test{ // check_results!($array,array,num_pes,reqs,"LocalLockArray"); // LocalLockArray------------------------------ let mut reqs = vec![]; - let local_data = input_array.blocking_read_local_data(); + let local_data = input_array.read_local_data().block(); // println!("local lock array len: {:?}", local_data.deref()); #[allow(unused_unsafe)] reqs.push(unsafe{array.batch_fetch_add(&local_data,1)}); @@ -602,7 +603,7 @@ macro_rules! input_test{ // GlobalLockArray------------------------------ let mut reqs = vec![]; #[allow(unused_unsafe)] - reqs.push(unsafe{array.batch_fetch_add(&input_array.blocking_read_local_data(),1)}); + reqs.push(unsafe{array.batch_fetch_add(&input_array.read_local_data().block(),1)}); check_results!($array,array,num_pes,reqs,"&GlobalLockArray"); } } diff --git a/tests/array/arithmetic_ops/fetch_div_test.rs b/tests/array/arithmetic_ops/fetch_div_test.rs index d539dace..427915a9 100644 --- a/tests/array/arithmetic_ops/fetch_div_test.rs +++ b/tests/array/arithmetic_ops/fetch_div_test.rs @@ -98,7 +98,7 @@ macro_rules! max_updates { macro_rules! onesided_iter { (GlobalLockArray,$array:ident) => { - $array.blocking_read_lock().onesided_iter() + $array.read_lock().block().onesided_iter() }; ($arraytype:ident,$array:ident) => { $array.onesided_iter() diff --git a/tests/array/arithmetic_ops/fetch_mul_test.rs b/tests/array/arithmetic_ops/fetch_mul_test.rs index 21c0b41d..985954e9 100644 --- a/tests/array/arithmetic_ops/fetch_mul_test.rs +++ b/tests/array/arithmetic_ops/fetch_mul_test.rs @@ -93,7 +93,7 @@ macro_rules! max_updates { macro_rules! onesided_iter { (GlobalLockArray,$array:ident) => { - $array.blocking_read_lock().onesided_iter() + $array.read_lock().block().onesided_iter() }; ($arraytype:ident,$array:ident) => { $array.onesided_iter() diff --git a/tests/array/arithmetic_ops/fetch_rem_test.rs b/tests/array/arithmetic_ops/fetch_rem_test.rs index 888572d2..83b47839 100644 --- a/tests/array/arithmetic_ops/fetch_rem_test.rs +++ b/tests/array/arithmetic_ops/fetch_rem_test.rs @@ -98,7 +98,7 @@ macro_rules! max_updates { macro_rules! 
onesided_iter { (GlobalLockArray,$array:ident) => { - $array.blocking_read_lock().onesided_iter() + $array.read_lock().block().onesided_iter() }; ($arraytype:ident,$array:ident) => { $array.onesided_iter() diff --git a/tests/array/arithmetic_ops/fetch_sub_test.rs b/tests/array/arithmetic_ops/fetch_sub_test.rs index 864fa5ec..68ea6d04 100644 --- a/tests/array/arithmetic_ops/fetch_sub_test.rs +++ b/tests/array/arithmetic_ops/fetch_sub_test.rs @@ -88,7 +88,7 @@ macro_rules! max_updates { macro_rules! onesided_iter { (GlobalLockArray,$array:ident) => { - $array.blocking_read_lock().onesided_iter() + $array.read_lock().block().onesided_iter() }; ($arraytype:ident,$array:ident) => { $array.onesided_iter() diff --git a/tests/array/arithmetic_ops/mul_test.rs b/tests/array/arithmetic_ops/mul_test.rs index 5d937e3d..4f76ddb9 100644 --- a/tests/array/arithmetic_ops/mul_test.rs +++ b/tests/array/arithmetic_ops/mul_test.rs @@ -76,7 +76,7 @@ macro_rules! max_updates { macro_rules! onesided_iter { (GlobalLockArray,$array:ident) => { - $array.blocking_read_lock().onesided_iter() + $array.read_lock().block().onesided_iter() }; ($arraytype:ident,$array:ident) => { $array.onesided_iter() diff --git a/tests/array/arithmetic_ops/rem_test.rs b/tests/array/arithmetic_ops/rem_test.rs index 0b95f5c7..daf07dde 100644 --- a/tests/array/arithmetic_ops/rem_test.rs +++ b/tests/array/arithmetic_ops/rem_test.rs @@ -68,7 +68,7 @@ macro_rules! max_updates { macro_rules! onesided_iter { (GlobalLockArray,$array:ident) => { - $array.blocking_read_lock().onesided_iter() + $array.read_lock().block().onesided_iter() }; ($arraytype:ident,$array:ident) => { $array.onesided_iter() diff --git a/tests/array/arithmetic_ops/sub_test.rs b/tests/array/arithmetic_ops/sub_test.rs index 6d409f1a..2cd8382b 100644 --- a/tests/array/arithmetic_ops/sub_test.rs +++ b/tests/array/arithmetic_ops/sub_test.rs @@ -72,7 +72,7 @@ macro_rules! max_updates { macro_rules! onesided_iter { (GlobalLockArray,$array:ident) => { - $array.blocking_read_lock().onesided_iter() + $array.read_lock().block().onesided_iter() }; ($arraytype:ident,$array:ident) => { $array.onesided_iter() diff --git a/tests/array/atomic_ops/load_store_test.rs b/tests/array/atomic_ops/load_store_test.rs index 90ab7b51..de4993eb 100644 --- a/tests/array/atomic_ops/load_store_test.rs +++ b/tests/array/atomic_ops/load_store_test.rs @@ -57,7 +57,7 @@ macro_rules! check_val { // macro_rules! onesided_iter{ // (GlobalLockArray,$array:ident) => { -// $array.blocking_read_lock().onesided_iter() +// $array.read_lock().block().onesided_iter() // }; // ($arraytype:ident,$array:ident) => { // $array.onesided_iter() diff --git a/tests/array/bitwise_ops/and_test.rs b/tests/array/bitwise_ops/and_test.rs index 0e920739..82105b15 100644 --- a/tests/array/bitwise_ops/and_test.rs +++ b/tests/array/bitwise_ops/and_test.rs @@ -59,7 +59,7 @@ macro_rules! check_val { macro_rules! onesided_iter { (GlobalLockArray,$array:ident) => { - $array.blocking_read_lock().onesided_iter() + $array.read_lock().block().onesided_iter() }; ($arraytype:ident,$array:ident) => { $array.onesided_iter() diff --git a/tests/array/bitwise_ops/fetch_and_test.rs b/tests/array/bitwise_ops/fetch_and_test.rs index 9a15290e..25fcdce1 100644 --- a/tests/array/bitwise_ops/fetch_and_test.rs +++ b/tests/array/bitwise_ops/fetch_and_test.rs @@ -59,7 +59,7 @@ macro_rules! check_val { macro_rules! 
onesided_iter { (GlobalLockArray,$array:ident) => { - $array.blocking_read_lock().onesided_iter() + $array.read_lock().block().onesided_iter() }; ($arraytype:ident,$array:ident) => { $array.onesided_iter() diff --git a/tests/array/bitwise_ops/fetch_or_test.rs b/tests/array/bitwise_ops/fetch_or_test.rs index f635dacb..22739189 100644 --- a/tests/array/bitwise_ops/fetch_or_test.rs +++ b/tests/array/bitwise_ops/fetch_or_test.rs @@ -59,7 +59,7 @@ macro_rules! check_val { macro_rules! onesided_iter { (GlobalLockArray,$array:ident) => { - $array.blocking_read_lock().onesided_iter() + $array.read_lock().block().onesided_iter() }; ($arraytype:ident,$array:ident) => { $array.onesided_iter() diff --git a/tests/array/bitwise_ops/fetch_xor_test.rs b/tests/array/bitwise_ops/fetch_xor_test.rs index 8302a766..99e4a45f 100644 --- a/tests/array/bitwise_ops/fetch_xor_test.rs +++ b/tests/array/bitwise_ops/fetch_xor_test.rs @@ -59,7 +59,7 @@ macro_rules! check_val { macro_rules! onesided_iter { (GlobalLockArray,$array:ident) => { - $array.blocking_read_lock().onesided_iter() + $array.read_lock().block().onesided_iter() }; ($arraytype:ident,$array:ident) => { $array.onesided_iter() diff --git a/tests/array/bitwise_ops/or_test.rs b/tests/array/bitwise_ops/or_test.rs index 5c697397..43c7bc01 100644 --- a/tests/array/bitwise_ops/or_test.rs +++ b/tests/array/bitwise_ops/or_test.rs @@ -59,7 +59,7 @@ macro_rules! check_val { macro_rules! onesided_iter { (GlobalLockArray,$array:ident) => { - $array.blocking_read_lock().onesided_iter() + $array.read_lock().block().onesided_iter() }; ($arraytype:ident,$array:ident) => { $array.onesided_iter() diff --git a/tests/array/bitwise_ops/xor_test.rs b/tests/array/bitwise_ops/xor_test.rs index 4bbe6472..24754011 100644 --- a/tests/array/bitwise_ops/xor_test.rs +++ b/tests/array/bitwise_ops/xor_test.rs @@ -57,7 +57,7 @@ macro_rules! check_val { macro_rules! onesided_iter { (GlobalLockArray,$array:ident) => { - $array.blocking_read_lock().onesided_iter() + $array.read_lock().block().onesided_iter() }; ($arraytype:ident,$array:ident) => { $array.onesided_iter() diff --git a/tests/array/rdma/put_test.rs b/tests/array/rdma/put_test.rs index f5643207..c7c62e90 100644 --- a/tests/array/rdma/put_test.rs +++ b/tests/array/rdma/put_test.rs @@ -46,7 +46,7 @@ macro_rules! initialize_array { macro_rules! onesided_iter { (GlobalLockArray,$array:ident) => { - $array.blocking_read_lock().onesided_iter() + $array.read_lock().block().onesided_iter() }; ($arraytype:ident,$array:ident) => { $array.onesided_iter() From 6ddccb2549023487dbe80baa7d65d301eb4fce60 Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Thu, 17 Oct 2024 13:32:18 -0700 Subject: [PATCH 099/116] updated local chunks to utilize handles instead of blocking api --- Cargo.toml | 6 +- examples/array_examples/global_lock_array.rs | 14 +- examples/array_examples/onesided_iteration.rs | 253 +++++++++--------- examples/darc_examples/darc.rs | 2 +- run_examples.sh | 8 +- src/array.rs | 27 +- src/array/global_lock_atomic.rs | 6 +- src/array/global_lock_atomic/handle.rs | 50 ++-- src/array/iterator/local_iterator.rs | 4 +- src/array/local_lock_atomic.rs | 4 +- src/array/local_lock_atomic/handle.rs | 227 ++++++++++++++-- src/array/local_lock_atomic/local_chunks.rs | 167 ++++-------- src/darc/global_rw_darc.rs | 34 ++- src/darc/handle.rs | 72 +++-- 14 files changed, 530 insertions(+), 344 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 52c206dd..b316d1dc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -366,9 +366,9 @@ path="examples/array_examples/global_lock_array.rs" name="histo" path="examples/array_examples/histo.rs" -[[example]] -name="single_pe_array" -path="examples/array_examples/single_pe_array.rs" +#[[example]] +#name="single_pe_array" +#path="examples/array_examples/single_pe_array.rs" ##------------ RDMA Examples -----------------## [[example]] diff --git a/examples/array_examples/global_lock_array.rs b/examples/array_examples/global_lock_array.rs index dc18ee1f..82cbe5b6 100644 --- a/examples/array_examples/global_lock_array.rs +++ b/examples/array_examples/global_lock_array.rs @@ -11,7 +11,7 @@ fn main() { let s = Instant::now(); let local_data = array.read_local_data().block(); println!( - "PE{my_pe} time: {:?} {:?}", + "0. PE{my_pe} time: {:?} {:?}", s.elapsed().as_secs_f64(), local_data ); @@ -21,7 +21,7 @@ fn main() { world.barrier(); let mut local_data = array.write_local_data().block(); println!( - "PE{my_pe} time: {:?} got write lock", + "1. PE{my_pe} time: {:?} got write lock", s.elapsed().as_secs_f64() ); local_data.iter_mut().for_each(|elem| *elem = my_pe); @@ -29,23 +29,23 @@ fn main() { drop(local_data); array.print(); - println!("PE{my_pe} time: {:?} done", s.elapsed().as_secs_f64()); + println!("2 .PE{my_pe} time: {:?} done", s.elapsed().as_secs_f64()); - let mut local_data = array.collective_write_local_data().block(); + let mut local_data = world.block_on(array.collective_write_local_data()); println!( - "PE{my_pe} time: {:?} got collective write lock", + "3. PE{my_pe} time: {:?} got collective write lock", s.elapsed().as_secs_f64() ); local_data.iter_mut().for_each(|elem| *elem += my_pe); std::thread::sleep(Duration::from_secs(1)); drop(local_data); println!( - "PE{my_pe} time: {:?} dropped collective write lock", + "4. PE{my_pe} time: {:?} dropped collective write lock", s.elapsed().as_secs_f64() ); array.print(); - println!("PE{my_pe} time: {:?} done", s.elapsed().as_secs_f64()); + println!("5. 
PE{my_pe} time: {:?} done", s.elapsed().as_secs_f64()); array .read_lock() diff --git a/examples/array_examples/onesided_iteration.rs b/examples/array_examples/onesided_iteration.rs index 1ffb25bf..f38a6b55 100644 --- a/examples/array_examples/onesided_iteration.rs +++ b/examples/array_examples/onesided_iteration.rs @@ -1,136 +1,139 @@ -// use lamellar::array::prelude::*; -// const ARRAY_LEN: usize = 100; - -// fn main() { -// let world = lamellar::LamellarWorldBuilder::new().build(); -// let my_pe = world.my_pe(); -// let _num_pes = world.num_pes(); -// let block_array = AtomicArray::::new(world.team(), ARRAY_LEN, Distribution::Block); -// let cyclic_array = AtomicArray::::new(world.team(), ARRAY_LEN, Distribution::Cyclic); - -// //we are going to initialize the data on each PE by directly accessing its local data - -// block_array -// .mut_local_data() -// .iter() -// .for_each(|e| e.store(my_pe)); -// cyclic_array -// .mut_local_data() -// .iter() -// .for_each(|e| e.store(my_pe)); - -// // In this example we will make use of a onesided iterator which -// // enables us to iterate over the entire array on a single PE. -// // The runtime will manage transferring data from remote PEs. -// // Note that for UnsafeArrays, AtomicArrays, and LocalLockArrays, -// // there is no guarantee that by the time the transferred data -// // as arrived to the calling PE it has remained the same on the remote PE. -// // we do not currently provide a mutable one sided iterator. - -// if my_pe == 0 { -// println!("Here"); -// for elem in block_array.onesided_iter().into_iter() { -// //we can convert from a oneside iterator into a rust iterator -// print!("{:?} ", elem); -// } -// println!(""); -// println!("Here2"); -// for elem in cyclic_array.onesided_iter().into_iter() { -// print!("{:?} ", elem); -// } -// println!(""); -// } -// println!("Here3"); -// println!("--------------------------------------------------------"); - -// // The lamellar array iterator used above is lazy, meaning that it only accesses and returns a value as its used, -// // while this is generally efficent and results in low overhead, because an elem may actually exists on a remote node -// // latencies to retrieve the next value in the iterator are dependent on the location of the data, as a result of -// // the need to get the data. Further impacting performance is that typically the transfer of a single element will -// // likely be small, thus inefficiently utilizing network resources. -// // to address these issues, we have provided a buffered iterator, which will transfer "get" and store a block of data -// // into a buffer, from with the iterated values are returned. More effectively using network resources. From the users -// // standpoint the only thing that changes is the instatiation of the iterator. - -// if my_pe == 0 { -// for elem in block_array.buffered_onesided_iter(10).into_iter() { -// print!("{:?} ", elem); -// } -// println!(""); - -// for elem in cyclic_array.buffered_onesided_iter(10).into_iter() { -// print!("{:?} ", elem); -// } -// println!(""); -// } - -// println!("--------------------------------------------------------"); - -// // in addition to the buffered iters we also provide a method to iterate over chunks of a lamellar array, via -// // the chunks() method. 
Called on a OneSidedIterator this creates a chunk sized OneSidedMemoryRegion, -// // and then puts the appropriate date based on the iteration index into that region - -// if my_pe == 0 { -// for chunk in block_array.onesided_iter().chunks(10).skip(4).into_iter() { -// println!("{:?}", unsafe { chunk.as_slice() }); -// } -// println!("-----"); -// for chunk in cyclic_array.onesided_iter().chunks(10).into_iter() { -// println!("{:?}", unsafe { chunk.as_slice() }); -// } - -// println!("-----"); -// for (i, (a, b)) in cyclic_array -// .onesided_iter() -// .zip(block_array.onesided_iter()) -// .into_iter() -// .enumerate() -// { -// println!("{:?}: {:?} {:?}", i, a, b); -// } -// println!("-----"); -// for (a, b) in cyclic_array -// .onesided_iter() -// .chunks(10) -// .zip(block_array.onesided_iter().chunks(10)) -// .into_iter() -// { -// unsafe { -// println!("{:?} {:?}", a.as_slice(), b.as_slice()); -// } -// } -// } - -// println!("--------------------------------------------------------"); - -// // let block_array = UnsafeArray::::new(world.team(), ARRAY_LEN, Distribution::Block); -// // for elem in block_onesided_iter!($array,array).into_iter().step_by(4) {...} -// // for elem in block_array.buffered_onesided_iter(10) {...} - -// // //rust step_by pseudo code -// // fn step_by(&mut self, n: usize) -> Result{ -// // let val = self.next(); //grab val based on index -// // self.index += n; -// // val -// // } - -// // //-------------- -// // for elem in block_array.onesided_iter().step_by(4).into_iter() {...} -// } - use futures_util::stream::StreamExt; use lamellar::array::prelude::*; + +const ARRAY_LEN: usize = 100; + fn main() { - let world = LamellarWorldBuilder::new().build(); - let array = LocalLockArray::::new(&world, 8, Distribution::Block); + let world = lamellar::LamellarWorldBuilder::new().build(); let my_pe = world.my_pe(); let num_pes = world.num_pes(); - array.dist_iter_mut().for_each(move |e| *e = my_pe); //initialize array using a distributed iterator - array.wait_all(); + let block_array = AtomicArray::::new(world.team(), ARRAY_LEN, Distribution::Block); + let cyclic_array = AtomicArray::::new(world.team(), ARRAY_LEN, Distribution::Cyclic); + + //we are going to initialize the data on each PE by directly accessing its local data + + block_array + .mut_local_data() + .iter() + .for_each(|e| e.store(my_pe)); + cyclic_array + .mut_local_data() + .iter() + .for_each(|e| e.store(my_pe)); + + // In this example we will make use of a onesided iterator which + // enables us to iterate over the entire array on a single PE. + // The runtime will manage transferring data from remote PEs. + // Note that for UnsafeArrays, AtomicArrays, and LocalLockArrays, + // there is no guarantee that by the time the transferred data + // as arrived to the calling PE it has remained the same on the remote PE. + // we do not currently provide a mutable one sided iterator. 
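As an aside to the comments above: once `into_iter()` converts a one-sided iterator into a standard Rust iterator, the usual `std` adapters compose with it. A minimal sketch of that composition (illustrative only, not part of this patch; it assumes, as in the surrounding example, that the yielded items implement `Debug`):

    use lamellar::array::prelude::*;

    let world = lamellar::LamellarWorldBuilder::new().build();
    let block_array = AtomicArray::<usize>::new(world.team(), 100, Distribution::Block);
    if world.my_pe() == 0 {
        // take() and collect() are plain std Iterator adapters applied after into_iter()
        let first_ten: Vec<_> = block_array.onesided_iter().into_iter().take(10).collect();
        println!("first ten elements: {first_ten:?}");
    }
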
+ + if my_pe == 0 { + println!("Here"); + for elem in block_array.onesided_iter().into_iter() { + //we can convert from a oneside iterator into a rust iterator + print!("{:?} ", elem); + } + println!(""); + println!("Here2"); + for elem in cyclic_array.onesided_iter().into_iter() { + print!("{:?} ", elem); + } + println!(""); + } + println!("Here3"); + println!("--------------------------------------------------------"); + + // The lamellar array iterator used above is lazy, meaning that it only accesses and returns a value as its used, + // while this is generally efficent and results in low overhead, because an elem may actually exists on a remote node + // latencies to retrieve the next value in the iterator are dependent on the location of the data, as a result of + // the need to get the data. Further impacting performance is that typically the transfer of a single element will + // likely be small, thus inefficiently utilizing network resources. + // to address these issues, we have provided a buffered iterator, which will transfer "get" and store a block of data + // into a buffer, from with the iterated values are returned. More effectively using network resources. From the users + // standpoint the only thing that changes is the instatiation of the iterator. + + if my_pe == 0 { + for elem in block_array.buffered_onesided_iter(10).into_iter() { + print!("{:?} ", elem); + } + println!(""); + + for elem in cyclic_array.buffered_onesided_iter(10).into_iter() { + print!("{:?} ", elem); + } + println!(""); + } + + println!("--------------------------------------------------------"); + + // in addition to the buffered iters we also provide a method to iterate over chunks of a lamellar array, via + // the chunks() method. Called on a OneSidedIterator this creates a chunk sized OneSidedMemoryRegion, + // and then puts the appropriate date based on the iteration index into that region + + if my_pe == 0 { + for chunk in block_array.onesided_iter().chunks(10).skip(4).into_iter() { + println!("{:?}", unsafe { chunk.as_slice() }); + } + println!("-----"); + for chunk in cyclic_array.onesided_iter().chunks(10).into_iter() { + println!("{:?}", unsafe { chunk.as_slice() }); + } + + println!("-----"); + for (i, (a, b)) in cyclic_array + .onesided_iter() + .zip(block_array.onesided_iter()) + .into_iter() + .enumerate() + { + println!("{:?}: {:?} {:?}", i, a, b); + } + println!("-----"); + for (a, b) in cyclic_array + .onesided_iter() + .chunks(10) + .zip(block_array.onesided_iter().chunks(10)) + .into_iter() + { + unsafe { + println!("{:?} {:?}", a.as_slice(), b.as_slice()); + } + } + } + + println!("--------------------------------------------------------"); + + // let block_array = UnsafeArray::::new(world.team(), ARRAY_LEN, Distribution::Block); + // for elem in block_onesided_iter!($array,array).into_iter().step_by(4) {...} + // for elem in block_array.buffered_onesided_iter(10) {...} + + // //rust step_by pseudo code + // fn step_by(&mut self, n: usize) -> Result{ + // let val = self.next(); //grab val based on index + // self.index += n; + // val + // } + + // //-------------- + // for elem in block_array.onesided_iter().step_by(4).into_iter() {...} + // } + + // fn main() { + // let world = LamellarWorldBuilder::new().build(); + // let array = LocalLockArray::::new(&world, 8, Distribution::Block); + // let my_pe = world.my_pe(); + // let num_pes = world.num_pes(); + let block_array = block_array.into_local_lock(); + block_array + .dist_iter_mut() + .for_each(move |e| *e = my_pe) + .block(); 
//initialize array using a distributed iterator world.block_on(async move { if my_pe == 0 { - let result = array + let result = block_array .onesided_iter() .into_stream() .take(4) diff --git a/examples/darc_examples/darc.rs b/examples/darc_examples/darc.rs index 3a9215b8..dbfffdce 100644 --- a/examples/darc_examples/darc.rs +++ b/examples/darc_examples/darc.rs @@ -109,7 +109,7 @@ fn main() { // drop(darc2); // drop(wrapped); println!("changing darc type"); - let ro_darc = global_darc.blocking_into_localrw().blocking_into_darc(); // we can call into_darc directly on global_Darc, but string the operations for testing purposes + let ro_darc = global_darc.into_localrw().block().into_darc().block(); // we can call into_darc directly on global_Darc, but string the operations for testing purposes println!("read only darc"); ro_darc.print(); println!("done"); diff --git a/run_examples.sh b/run_examples.sh index 881b440f..e26e1a6a 100755 --- a/run_examples.sh +++ b/run_examples.sh @@ -44,11 +44,11 @@ for toolchain in stable; do #nightly; do fi cd .. sleep 2 - cur_tasks=`squeue -u frie869 | wc -l` - running_tasks=`squeue -u frie869 | grep " R " | wc -l` + cur_tasks=`squeue -u frie869 | grep frie869 |wc -l` + running_tasks=`squeue -u frie869 | grep frie869| grep " R " | wc -l` while [ $((cur_tasks+running_tasks)) -gt 6 ]; do - cur_tasks=`squeue -u frie869 | wc -l` - running_tasks=`squeue -u frie869 | grep " R " | wc -l` + cur_tasks=`squeue -u frie869 | grep frie869 | wc -l` + running_tasks=`squeue -u frie869 | grep frie869 | grep " R " | wc -l` sleep 5 done # fi diff --git a/src/array.rs b/src/array.rs index 63e3ab12..383fdbd8 100644 --- a/src/array.rs +++ b/src/array.rs @@ -191,29 +191,30 @@ pub struct ReduceKey { crate::inventory::collect!(ReduceKey); // impl Dist for bool {} -lamellar_impl::generate_reductions_for_type_rt!(true, u8, usize, isize); -lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, usize, isize); +// lamellar_impl::generate_reductions_for_type_rt!(true, u8, usize, isize); +// lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, usize, isize); // lamellar_impl::generate_reductions_for_type_rt!(false, f32); // lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32); // lamellar_impl::generate_reductions_for_type_rt!(false, u128); // lamellar_impl::generate_ops_for_type_rt!(true, false, true, u128); +// //------------------------------------ -// lamellar_impl::generate_reductions_for_type_rt!(true, u8, u16, u32, u64, usize); -// lamellar_impl::generate_reductions_for_type_rt!(false, u128); -// lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, u16, u32, u64, usize); -// lamellar_impl::generate_ops_for_type_rt!(true, false, true, u128); +lamellar_impl::generate_reductions_for_type_rt!(true, u8, u16, u32, u64, usize); +lamellar_impl::generate_reductions_for_type_rt!(false, u128); +lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, u16, u32, u64, usize); +lamellar_impl::generate_ops_for_type_rt!(true, false, true, u128); -// lamellar_impl::generate_reductions_for_type_rt!(true, i8, i16, i32, i64, isize); -// lamellar_impl::generate_reductions_for_type_rt!(false, i128); -// lamellar_impl::generate_ops_for_type_rt!(true, true, true, i8, i16, i32, i64, isize); -// lamellar_impl::generate_ops_for_type_rt!(true, false, true, i128); +lamellar_impl::generate_reductions_for_type_rt!(true, i8, i16, i32, i64, isize); +lamellar_impl::generate_reductions_for_type_rt!(false, i128); +lamellar_impl::generate_ops_for_type_rt!(true, true, true, 
i8, i16, i32, i64, isize); +lamellar_impl::generate_ops_for_type_rt!(true, false, true, i128); -// lamellar_impl::generate_reductions_for_type_rt!(false, f32, f64); -// lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32, f64); +lamellar_impl::generate_reductions_for_type_rt!(false, f32, f64); +lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32, f64); -// lamellar_impl::generate_ops_for_bool_rt!(); +lamellar_impl::generate_ops_for_bool_rt!(); impl Dist for Option {} impl ArrayOps for Option {} diff --git a/src/array/global_lock_atomic.rs b/src/array/global_lock_atomic.rs index d722afad..c7a126c0 100644 --- a/src/array/global_lock_atomic.rs +++ b/src/array/global_lock_atomic.rs @@ -996,15 +996,17 @@ impl GlobalLockArray { ///``` pub fn print(&self) { self.barrier(); - let _guard = self.read_local_data(); + // println!("printing array"); + let _guard = self.read_local_data().block(); self.array.print(); + // println!("done printing array"); } } impl ArrayPrint for GlobalLockArray { fn print(&self) { self.barrier(); - let _guard = self.read_local_data(); + let _guard = self.read_local_data().block(); self.array.print() } } diff --git a/src/array/global_lock_atomic/handle.rs b/src/array/global_lock_atomic/handle.rs index d15e7da1..f37b7622 100644 --- a/src/array/global_lock_atomic/handle.rs +++ b/src/array/global_lock_atomic/handle.rs @@ -65,13 +65,15 @@ impl GlobalLockReadHandle { /// let guard = handle.block(); ///``` pub fn block(self) -> GlobalLockReadGuard { - let msg = format!(" + if std::thread::current().id() != *crate::MAIN_THREAD { + let msg = format!(" [LAMELLAR WARNING] You are calling `GlobalLockReadHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } } self.array.lock.darc.team().scheduler.block_on(self) @@ -140,13 +142,15 @@ impl GlobalLockLocalDataHandle { /// println!("local data: {:?}",local_data); ///``` pub fn block(self) -> GlobalLockLocalData { - let msg = format!(" + if std::thread::current().id() != *crate::MAIN_THREAD { + let msg = format!(" [LAMELLAR WARNING] You are calling `GlobalLockLocalDataHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } } self.array.lock.darc.team().scheduler.block_on(self) @@ -219,13 +223,15 @@ impl GlobalLockWriteHandle { /// handle.block(); ///``` pub fn block(self) -> GlobalLockWriteGuard { - let msg = format!(" + if std::thread::current().id() != *crate::MAIN_THREAD { + let msg = format!(" [LAMELLAR WARNING] You are calling `GlobalLockWriteHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! 
Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } } self.array.lock.darc.team().scheduler.block_on(self) @@ -294,13 +300,15 @@ impl GlobalLockMutLocalDataHandle { /// local_data.iter_mut().for_each(|elem| *elem += my_pe); ///``` pub fn block(self) -> GlobalLockMutLocalData { - let msg = format!(" + if std::thread::current().id() != *crate::MAIN_THREAD { + let msg = format!(" [LAMELLAR WARNING] You are calling `GlobalLockLocalDataHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } } self.array.lock.darc.team().scheduler.block_on(self) @@ -373,13 +381,15 @@ impl GlobalLockCollectiveMutLocalDataHandle { /// local_data.iter_mut().for_each(|elem| *elem += my_pe); ///``` pub fn block(self) -> GlobalLockCollectiveMutLocalData { - let msg = format!(" + if std::thread::current().id() != *crate::MAIN_THREAD { + let msg = format!(" [LAMELLAR WARNING] You are calling `GlobalLockCollectiveMutLocalDataHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! 
Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } } self.array.lock.darc.team().scheduler.block_on(self) diff --git a/src/array/iterator/local_iterator.rs b/src/array/iterator/local_iterator.rs index 01d7536f..cd719d13 100644 --- a/src/array/iterator/local_iterator.rs +++ b/src/array/iterator/local_iterator.rs @@ -766,8 +766,8 @@ pub trait IndexedLocalIterator: LocalIterator + SyncSend + IterClone + 'static { /// If the number of elements is not evenly divisible by `size`, the last chunk may be shorter than `size` /// /// # Note - /// If calling this on a LocalLockArray it may be possible to call [blocking_read_local_chunks](crate::array::LocalLockArray::blocking_read_local_chunks), [read_local_chunks](crate::array::LocalLockArray::read_local_chunks) - /// [blocking_write_local_chunks](crate::array::LocalLockArray::blocking_write_local_chunks), or [write_local_chunks](crate::array::LocalLockArray::blocking_write_local_chunks) for better performance + /// If calling this on a LocalLockArray it may be possible to call [read_local_chunks](crate::array::LocalLockArray::read_local_chunks) + /// or [write_local_chunks](crate::array::LocalLockArray::write_local_chunks) for better performance /// /// If calling this on an UnsafeArray it may be possible to call [local_chunks](crate::array::UnsafeArray::local_chunks) or [local_chunks_mut](crate::array::UnsafeArray::local_chunks_mut) /// diff --git a/src/array/local_lock_atomic.rs b/src/array/local_lock_atomic.rs index f13d8515..43d0a4bf 100644 --- a/src/array/local_lock_atomic.rs +++ b/src/array/local_lock_atomic.rs @@ -3,8 +3,8 @@ pub(crate) mod local_chunks; pub use local_chunks::{LocalLockLocalChunks, LocalLockLocalChunksMut}; mod handle; use handle::{ - LocalLockLocalDataHandle, LocalLockMutLocalDataHandle, LocalLockReadHandle, - LocalLockWriteHandle, + LocalLockLocalChunksHandle, LocalLockLocalChunksMutHandle, LocalLockLocalDataHandle, + LocalLockMutLocalDataHandle, LocalLockReadHandle, LocalLockWriteHandle, }; pub(crate) mod operations; mod rdma; diff --git a/src/array/local_lock_atomic/handle.rs b/src/array/local_lock_atomic/handle.rs index 23111fae..2c42dc5e 100644 --- a/src/array/local_lock_atomic/handle.rs +++ b/src/array/local_lock_atomic/handle.rs @@ -10,7 +10,10 @@ use crate::LocalLockArray; use futures_util::Future; use pin_project::pin_project; -use super::{LocalLockLocalData, LocalLockMutLocalData, LocalLockReadGuard, LocalLockWriteGuard}; +use super::{ + LocalLockLocalChunks, LocalLockLocalChunksMut, LocalLockLocalData, LocalLockMutLocalData, + LocalLockReadGuard, LocalLockWriteGuard, +}; #[must_use] #[pin_project] @@ -49,7 +52,7 @@ impl LocalLockReadHandle { } } - /// Handle used to retrieve the aquired read lock of a LocalLockArray within a non async context + /// Blocks the calling thread to retrieve the aquired read lock of a LocalLockArray within a non async context /// /// Returns an RAII guard which will drop the read access of the wrlock when dropped /// # Examples @@ -62,15 +65,16 @@ impl LocalLockReadHandle { /// let guard = handle.block(); ///``` pub fn block(self) -> LocalLockReadGuard { - let msg = format!(" + if std::thread::current().id() != *crate::MAIN_THREAD { + let msg = format!(" 
[LAMELLAR WARNING] You are calling `LocalLockReadHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } } - self.array.lock.darc.team().scheduler.block_on(self) } } @@ -122,7 +126,7 @@ pub struct LocalLockLocalDataHandle { } impl LocalLockLocalDataHandle { - /// Handle used to retrieve the aquired local data [LocalLockLocalData] of a LocalLockArray within a non async context + /// Blocks the calling thread to retrieve the aquired local data [LocalLockLocalData] of a LocalLockArray within a non async context /// /// Returns an RAII guard which will drop the write access of the wrlock when dropped /// # Examples @@ -137,15 +141,16 @@ impl LocalLockLocalDataHandle { /// println!("local data: {:?}",local_data); ///``` pub fn block(self) -> LocalLockLocalData { - let msg = format!(" + if std::thread::current().id() != *crate::MAIN_THREAD { + let msg = format!(" [LAMELLAR WARNING] You are calling `LocalLockLocalDataHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } } - self.array.lock.darc.team().scheduler.block_on(self) } } @@ -203,7 +208,7 @@ impl LocalLockWriteHandle { lock_handle: array.lock.write(), } } - /// Handle used to retrieve the aquired write lock of a LocalLockArray within a non async context + /// Blocks the calling thread to retrieve the aquired write lock of a LocalLockArray within a non async context /// /// Returns an RAII guard which will drop the write access of the wrlock when dropped /// # Examples @@ -216,13 +221,15 @@ impl LocalLockWriteHandle { /// handle.block(); ///``` pub fn block(self) -> LocalLockWriteGuard { - let msg = format!(" + if std::thread::current().id() != *crate::MAIN_THREAD { + let msg = format!(" [LAMELLAR WARNING] You are calling `LocalLockWriteHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! 
Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } } self.array.lock.darc.team().scheduler.block_on(self) @@ -276,7 +283,7 @@ pub struct LocalLockMutLocalDataHandle { } impl LocalLockMutLocalDataHandle { - /// Handle used to retrieve the aquired mutable local data [LocalLockMutLocalData] of a LocalLockArray within a non async context + /// Blocks the calling thread to retrieve the aquired mutable local data [LocalLockMutLocalData] of a LocalLockArray within a non async context /// /// Returns an RAII guard which will drop the write access of the wrlock when dropped /// # Examples @@ -291,13 +298,15 @@ impl LocalLockMutLocalDataHandle { /// local_data.iter_mut().for_each(|elem| *elem += my_pe); ///``` pub fn block(self) -> LocalLockMutLocalData { - let msg = format!(" + if std::thread::current().id() != *crate::MAIN_THREAD { + let msg = format!(" [LAMELLAR WARNING] You are calling `LocalLockLocalDataHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } } self.array.lock.darc.team().scheduler.block_on(self) @@ -320,3 +329,175 @@ impl Future for LocalLockMutLocalDataHandle { } } } + +#[must_use] +#[pin_project] +/// Constructs a handle for immutably iterating over fixed sized chunks(slices) of the local data of this array. +/// This handle must be either await'd in an async context or block'd in an non-async context. +/// Awaiting or blocking will not return until the read lock has been acquired. +/// +/// the returned iterator is a lamellar [LocalIterator] and also captures a read lock on the local data. 
+/// +/// # Examples +///``` +/// use lamellar::array::prelude::*; +/// +/// let world = LamellarWorldBuilder::new().build(); +/// let array: LocalLockArray = LocalLockArray::new(&world,40,Distribution::Block); +/// let my_pe = world.my_pe(); +/// //block in a non-async context +/// let _ = array.read_local_chunks(5).block().enumerate().for_each(move|(i,chunk)| { +/// println!("PE: {my_pe} i: {i} chunk: {chunk:?}"); +/// }).block(); +/// +/// //await in an async context +/// world.block_on(async move { +/// let _ = array.read_local_chunks(5).await.enumerate().for_each(move|(i,chunk)| { +/// println!("PE: {my_pe} i: {i} chunk: {chunk:?}"); +/// }).await; +/// }); +/// +/// ``` +pub struct LocalLockLocalChunksHandle { + pub(crate) chunk_size: usize, + pub(crate) index: usize, //global index within the array local data + pub(crate) end_index: usize, //global index within the array local data + pub(crate) array: LocalLockArray, + #[pin] + pub(crate) lock_handle: LocalRwDarcReadHandle<()>, +} + +impl LocalLockLocalChunksHandle { + /// Blocks the calling thread to retrieve the aquired immutable local chunks iterator of a LocalLockArray within a non async context + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array: LocalLockArray = LocalLockArray::new(&world,40,Distribution::Block); + /// let my_pe = world.my_pe(); + /// //block in a non-async context + /// let _ = array.read_local_chunks(5).block().enumerate().for_each(move|(i,chunk)| { + /// println!("PE: {my_pe} i: {i} chunk: {chunk:?}"); + /// }).block(); + ///``` + pub fn block(self) -> LocalLockLocalChunks { + if std::thread::current().id() != *crate::MAIN_THREAD { + let msg = format!(" + [LAMELLAR WARNING] You are calling `LocalLockLocalChunksHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() + ); + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } + } + + self.array.lock.darc.team().scheduler.block_on(self) + } +} + +impl Future for LocalLockLocalChunksHandle { + type Output = LocalLockLocalChunks; + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let this = self.project(); + match this.lock_handle.poll(cx) { + Poll::Ready(val) => Poll::Ready(LocalLockLocalChunks { + chunk_size: *this.chunk_size, + index: *this.index, //global index within the array local data + end_index: *this.end_index, //global index within the array local data + array: this.array.clone(), + lock_guard: Arc::new(val), + }), + Poll::Pending => Poll::Pending, + } + } +} + +#[must_use] +#[pin_project] +/// A handle for mutably iterating over fixed sized chunks(slices) of the local data of this array. +/// This handle must be either await'd in an async context or block'd in an non-async context. +/// Awaiting or blocking will not return until the write lock has been acquired. +/// +/// the returned iterator is a lamellar [LocalIterator] and also captures a write lock on the local data. 
+/// +/// # Examples +///``` +/// use lamellar::array::prelude::*; +/// +/// let world = LamellarWorldBuilder::new().build(); +/// let array: LocalLockArray = LocalLockArray::new(&world,40,Distribution::Block); +/// let my_pe = world.my_pe(); +/// let _ = array.write_local_chunks(5).block().enumerate().for_each(move|(i, mut chunk)| { +/// for elem in chunk.iter_mut() { +/// *elem = i; +/// } +/// }).block(); +/// world.block_on(async move { +/// let _ = array.write_local_chunks(5).await.enumerate().for_each(move|(i, mut chunk)| { +/// for elem in chunk.iter_mut() { +/// *elem = i; +/// } +/// }).await; +/// }); +/// ``` +pub struct LocalLockLocalChunksMutHandle { + pub(crate) chunk_size: usize, + pub(crate) index: usize, //global index within the array local data + pub(crate) end_index: usize, //global index within the array local data + pub(crate) array: LocalLockArray, + #[pin] + pub(crate) lock_handle: LocalRwDarcWriteHandle<()>, +} + +impl LocalLockLocalChunksMutHandle { + /// Blocks the calling thread to retrieve the aquired mutable local chunks iterator of a LocalLockArray within a non async context + /// + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array: LocalLockArray = LocalLockArray::new(&world,40,Distribution::Block); + /// let my_pe = world.my_pe(); + /// //block in a non-async context + /// let _ = array.write_local_chunks(5).block().enumerate().for_each(move|(i, mut chunk)| { + /// for elem in chunk.iter_mut() { + /// *elem = i; + /// } + /// }).block(); + ///``` + pub fn block(self) -> LocalLockLocalChunksMut { + if std::thread::current().id() != *crate::MAIN_THREAD { + let msg = format!(" + [LAMELLAR WARNING] You are calling `LocalLockLocalChunksMutHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! 
+ Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() + ); + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } + } + + self.array.lock.darc.team().scheduler.block_on(self) + } +} + +impl Future for LocalLockLocalChunksMutHandle { + type Output = LocalLockLocalChunksMut; + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let this = self.project(); + match this.lock_handle.poll(cx) { + Poll::Ready(val) => Poll::Ready(LocalLockLocalChunksMut { + chunk_size: *this.chunk_size, + index: *this.index, //global index within the array local data + end_index: *this.end_index, //global index within the array local data + array: this.array.clone(), + lock_guard: Arc::new(val), + }), + Poll::Pending => Poll::Pending, + } + } +} diff --git a/src/array/local_lock_atomic/local_chunks.rs b/src/array/local_lock_atomic/local_chunks.rs index dfdeb759..fe7e627c 100644 --- a/src/array/local_lock_atomic/local_chunks.rs +++ b/src/array/local_lock_atomic/local_chunks.rs @@ -9,16 +9,14 @@ use crate::memregion::Dist; use std::sync::Arc; /// An iterator over immutable (nonoverlapping) local chunks (of size chunk_size) of a [LocalLockArray] -/// This struct is created by calling [LocalLockArray::read_local_chunks] or [LocalLockArray::blocking_read_local_chunks] +/// This struct is created by awaiting or blocking on the handle returned by [LocalLockArray::read_local_chunks] #[derive(Clone)] pub struct LocalLockLocalChunks { - chunk_size: usize, - index: usize, //global index within the array local data - end_index: usize, //global index within the array local data - array: LocalLockArray, - // lock: LocalRwDarc<()>, - // lock_guard: Arc>, - lock_guard: Arc>, + pub(crate) chunk_size: usize, + pub(crate) index: usize, //global index within the array local data + pub(crate) end_index: usize, //global index within the array local data + pub(crate) array: LocalLockArray, + pub(crate) lock_guard: Arc>, } impl IterClone for LocalLockLocalChunks { @@ -35,15 +33,15 @@ impl IterClone for LocalLockLocalChunks { } /// An iterator over mutable (nonoverlapping) local chunks (of size chunk_size) of a [LocalLockArray] -/// This struct is created by calling [LocalLockArray""write_local_chunks] or [LocalLockArray::blocking_write_local_chunks] +/// This struct is created by awaiting or blocking on the handle returned by [LocalLockArray::write_local_chunks] pub struct LocalLockLocalChunksMut { // data: &'a mut [T], - chunk_size: usize, - index: usize, //global index within the array local data - end_index: usize, //global index within the array local data - array: LocalLockArray, + pub(crate) chunk_size: usize, + pub(crate) index: usize, //global index within the array local data + pub(crate) end_index: usize, //global index within the array local data + pub(crate) array: LocalLockArray, // lock: LocalRwDarc<()>, - lock_guard: Arc>, + pub(crate) lock_guard: Arc>, } impl IterClone for LocalLockLocalChunksMut { @@ -59,6 +57,12 @@ impl IterClone for LocalLockLocalChunksMut { } } +/// Provides mutable access to a chunk of a PEs local data to provide "local" indexing while maintaining safety guarantees of the array type. +/// +/// This derefences down to a `&mut [T]`. +/// +/// This struct is the item type returned when iterating over a [LocalLockLocalChunksMut] iterator created using [LocalLockArray::write_local_chunks]. 
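A minimal sketch of what that deref provides (illustrative only, not part of this patch; the chunk size and the `reverse` call are arbitrary, any `&mut [T]` slice method could be used in their place):

    use lamellar::array::prelude::*;

    let world = LamellarWorldBuilder::new().build();
    let array: LocalLockArray<usize> = LocalLockArray::new(&world, 40, Distribution::Block);
    // each `chunk` below derefs to `&mut [usize]`, so slice methods apply directly
    let _ = array
        .write_local_chunks(8)
        .block()
        .for_each(|mut chunk| {
            chunk.reverse(); // reverse, fill, sort_unstable, etc. all work through DerefMut
        })
        .block();
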
+/// While the Local Chunk iterator is valid, each chunk is guaranteed to have exclusive access to the data it points to (allowing for the safe deref into `&mut [T]`), preventing any other local or remote access. #[derive(Debug)] pub struct LocalLockMutChunkLocalData<'a, T: Dist> { data: &'a mut [T], @@ -218,40 +222,11 @@ impl IndexedLocalIterator for LocalLockLocalChunksMut { } impl LocalLockArray { - /// mutably iterate over fixed sized chunks(slices) of the local data of this array. - /// the returned iterator is a lamellar [LocalIterator] and also captures a read lock on the local data. - /// This call will block the calling task until a read lock is acquired. + /// Constructs a handle for immutably iterating over fixed sized chunks(slices) of the local data of this array. + /// This handle must be either await'd in an async context or block'd in an non-async context. + /// Awaiting or blocking will not return until the read lock has been acquired. /// - /// # Examples - ///``` - /// use lamellar::array::prelude::*; - /// - /// let world = LamellarWorldBuilder::new().build(); - /// let array: LocalLockArray = LocalLockArray::new(&world,40,Distribution::Block); - /// let my_pe = world.my_pe(); - /// world.block_on(async move { - /// let _ = array.read_local_chunks(5).await.enumerate().for_each(move|(i,chunk)| { - /// println!("PE: {my_pe} i: {i} chunk: {chunk:?}"); - /// }).spawn(); - /// array.await_all().await; - /// }); - /// ``` - pub async fn read_local_chunks(&self, chunk_size: usize) -> LocalLockLocalChunks { - let lock = Arc::new(self.lock.read().await); - LocalLockLocalChunks { - chunk_size, - index: 0, - end_index: 0, - array: self.clone(), - // lock: self.lock.clone(), - lock_guard: lock, - } - } - - /// immutably iterate over fixed sized chunks(slices) of the local data of this array. /// the returned iterator is a lamellar [LocalIterator] and also captures a read lock on the local data. - /// This call will block the calling thread until a read lock is acquired. - /// Calling within an asynchronous block may lead to deadlock, use [read_lock](self::LocalLockArray::read_local_chunks) instead. /// /// # Examples ///``` @@ -260,68 +235,34 @@ impl LocalLockArray { /// let world = LamellarWorldBuilder::new().build(); /// let array: LocalLockArray = LocalLockArray::new(&world,40,Distribution::Block); /// let my_pe = world.my_pe(); - /// - /// let _ = array.blocking_read_local_chunks(5).enumerate().for_each(move|(i,chunk)| { + /// //block in a non-async context + /// let _ = array.read_local_chunks(5).block().enumerate().for_each(move|(i,chunk)| { /// println!("PE: {my_pe} i: {i} chunk: {chunk:?}"); /// }).block(); /// - /// ``` - pub fn blocking_read_local_chunks(&self, chunk_size: usize) -> LocalLockLocalChunks { - if std::thread::current().id() != *crate::MAIN_THREAD { - let msg = format!(" - [LAMELLAR WARNING] You are calling `LocalLockArray::blocking_read_local_chunks` from within an async context which may lead to deadlock, it is recommended that you use `read_local_chunks().await;` instead! 
- Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() - ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), - } - } - let lock = Arc::new(self.array.block_on(self.lock.read())); - LocalLockLocalChunks { - chunk_size, - index: 0, - end_index: 0, - array: self.clone(), - // lock: self.lock.clone(), - lock_guard: lock, - } - } - - /// mutably iterate over fixed sized chunks(slices) of the local data of this array. - /// the returned iterator is a lamellar [LocalIterator] and also captures the write lock on the local data. - /// This call will block the calling task until the write lock is acquired. - /// - /// # Examples - ///``` - /// use lamellar::array::prelude::*; - /// - /// let world = LamellarWorldBuilder::new().build(); - /// let array: LocalLockArray = LocalLockArray::new(&world,40,Distribution::Block); - /// let my_pe = world.my_pe(); + /// //await in an async context /// world.block_on(async move { - /// let _ = array.write_local_chunks(5).await.enumerate().for_each(move|(i,chunk)| { + /// let _ = array.read_local_chunks(5).await.enumerate().for_each(move|(i,chunk)| { /// println!("PE: {my_pe} i: {i} chunk: {chunk:?}"); - /// }).spawn(); - /// array.await_all().await; + /// }).await; /// }); + /// /// ``` - pub async fn write_local_chunks(&self, chunk_size: usize) -> LocalLockLocalChunksMut { - let lock = Arc::new(self.lock.write().await); - LocalLockLocalChunksMut { + pub fn read_local_chunks(&self, chunk_size: usize) -> LocalLockLocalChunksHandle { + let lock = self.lock.read(); + LocalLockLocalChunksHandle { chunk_size, index: 0, end_index: 0, array: self.clone(), - // lock: self.lock.clone(), - lock_guard: lock, + lock_handle: lock, } } - - /// mutably iterate over fixed sized chunks(slices) of the local data of this array. - /// the returned iterator is a lamellar [LocalIterator] and also captures the write lock on the local data. - /// This call will block the calling thread until the write lock is acquired. - /// Calling within an asynchronous block may lead to deadlock, use [write_lock](self::LocalLockArray::write_local_chunks) instead. + /// Constructs a handle for mutably iterating over fixed sized chunks(slices) of the local data of this array. + /// This handle must be either await'd in an async context or block'd in an non-async context. + /// Awaiting or blocking will not return until the write lock has been acquired. + /// + /// the returned iterator is a lamellar [LocalIterator] and also captures a write lock on the local data. 
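Besides blocking or awaiting the `for_each` handle, the `spawn()`/`wait_all()` pattern used elsewhere in this patch series also applies here. A minimal sketch of that variant (illustrative only, not part of this patch):

    use lamellar::array::prelude::*;

    let world = LamellarWorldBuilder::new().build();
    let array: LocalLockArray<usize> = LocalLockArray::new(&world, 40, Distribution::Block);
    // spawn the chunked iteration as a background task and wait for it later
    let _ = array
        .write_local_chunks(5)
        .block()
        .for_each(|mut chunk| {
            for elem in chunk.iter_mut() {
                *elem += 1;
            }
        })
        .spawn();
    array.wait_all();
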
/// /// # Examples ///``` @@ -330,32 +271,28 @@ impl LocalLockArray { /// let world = LamellarWorldBuilder::new().build(); /// let array: LocalLockArray = LocalLockArray::new(&world,40,Distribution::Block); /// let my_pe = world.my_pe(); - /// - /// let _ = array.blocking_write_local_chunks(5).enumerate().for_each(move|(i,chunk)| { - /// println!("PE: {my_pe} i: {i} chunk: {chunk:?}"); - /// }).spawn(); - /// array.wait_all(); - /// + /// let _ = array.write_local_chunks(5).block().enumerate().for_each(move|(i, mut chunk)| { + /// for elem in chunk.iter_mut() { + /// *elem = i; + /// } + /// }).block(); + /// world.block_on(async move { + /// let _ = array.write_local_chunks(5).await.enumerate().for_each(move|(i, mut chunk)| { + /// for elem in chunk.iter_mut() { + /// *elem = i; + /// } + /// }).await; + /// }); /// ``` - pub fn blocking_write_local_chunks(&self, chunk_size: usize) -> LocalLockLocalChunksMut { - if std::thread::current().id() != *crate::MAIN_THREAD { - let msg = format!(" - [LAMELLAR WARNING] You are calling `LocalLockArray::blocking_write_local_chunks` from within an async context which may lead to deadlock, it is recommended that you use `write_local_chunks().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() - ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), - } - } - let lock = Arc::new(self.array.block_on(self.lock.write())); - LocalLockLocalChunksMut { + pub fn write_local_chunks(&self, chunk_size: usize) -> LocalLockLocalChunksMutHandle { + let lock = self.lock.write(); + LocalLockLocalChunksMutHandle { chunk_size, index: 0, end_index: 0, array: self.clone(), // lock: self.lock.clone(), - lock_guard: lock, + lock_handle: lock, } } } diff --git a/src/darc/global_rw_darc.rs b/src/darc/global_rw_darc.rs index 895cfc40..fb4d3d38 100644 --- a/src/darc/global_rw_darc.rs +++ b/src/darc/global_rw_darc.rs @@ -73,13 +73,23 @@ impl DistRwLock { while self.writer.load(Ordering::SeqCst) != self.team.num_pes { async_std::task::yield_now().await; } - // println!("\t{:?} inc read count {:?} {:?}",pe,self.readers.load(Ordering::SeqCst),self.writer.load(Ordering::SeqCst)); + // println!( + // "\t{:?} inc read count {:?} {:?}", + // _pe, + // self.readers.load(Ordering::SeqCst), + // self.writer.load(Ordering::SeqCst) + // ); self.readers.fetch_add(1, Ordering::SeqCst); if self.writer.load(Ordering::SeqCst) == self.team.num_pes { break; } self.readers.fetch_sub(1, Ordering::SeqCst); - // println!("\t{:?} writers exist dec read count {:?} {:?}",pe,self.readers.load(Ordering::SeqCst),self.writer.load(Ordering::SeqCst)); + // println!( + // "\t{:?} writers exist dec read count {:?} {:?}", + // _pe, + // self.readers.load(Ordering::SeqCst), + // self.writer.load(Ordering::SeqCst) + // ); } // println!( // "\t{:?} read locked {:?} {:?}", @@ -95,7 +105,12 @@ impl DistRwLock { { async_std::task::yield_now().await; } - // println!("\t{:?} write lock checking for readers {:?} {:?}",pe,self.readers.load(Ordering::SeqCst),self.writer.load(Ordering::SeqCst)); + // println!( + // "\t{:?} write lock checking for readers {:?} {:?}", + // pe, + // self.readers.load(Ordering::SeqCst), + // self.writer.load(Ordering::SeqCst) + // ); while self.readers.load(Ordering::SeqCst) != 0 { async_std::task::yield_now().await; } @@ -108,10 +123,17 @@ impl DistRwLock { } async fn async_collective_writer_lock(&self, pe: 
usize, collective_cnt: usize) { + println!("{:?} collective writer lock {:?}", pe, collective_cnt); // first lets set the normal writer lock, but will set it to a unique id all the PEs should have (it is initialized to num_pes+1 and is incremented by one after each lock) if pe == 0 { self.async_writer_lock(collective_cnt).await; } else { + // println!( + // "\t{:?} write lock checking for readers {:?} {:?}", + // pe, + // self.readers.load(Ordering::SeqCst), + // self.writer.load(Ordering::SeqCst) + // ); while self.writer.load(Ordering::SeqCst) != collective_cnt { async_std::task::yield_now().await; } @@ -187,7 +209,6 @@ impl DistRwLock { let _temp = self.collective_writer.fetch_add(1, Ordering::SeqCst); // println!("collective unlock PE{:?} {:?} {:?} {:?}",pe,temp,self.collective_writer.load(Ordering::SeqCst),self.team.num_pes); while self.collective_writer.load(Ordering::SeqCst) != self.team.num_pes { - // async_std::task::yield_now().await; } //we have all entered the unlock @@ -217,7 +238,7 @@ struct LockAm { #[lamellar_impl::rt_am] impl LamellarAM for LockAm { async fn exec() { - // println!("In lock am {:?}",self); + // println!("In lock am {:?}", self); // let lock = { let rwlock = unsafe { &*(self.rwlock_addr as *mut DarcInner>) }.item(); //we dont actually care about the "type" we wrap here, we just need access to the meta data for the darc match self.lock_type { @@ -232,7 +253,7 @@ impl LamellarAM for LockAm { } } // }; - // println!("finished lock am"); + // println!("finished lock am {:?}", self); } } @@ -256,6 +277,7 @@ impl LamellarAM for UnlockAm { } } } + // println!("Finished in unlock am {:?}", self); } } diff --git a/src/darc/handle.rs b/src/darc/handle.rs index 46acb340..983c2e15 100644 --- a/src/darc/handle.rs +++ b/src/darc/handle.rs @@ -98,13 +98,15 @@ impl LocalRwDarcReadHandle { /// ///``` pub fn block(self) -> LocalRwDarcReadGuard { - let msg = format!(" + if std::thread::current().id() != *crate::MAIN_THREAD { + let msg = format!(" [LAMELLAR WARNING] You are calling `LocalRwDarcReadHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } } let inner_darc = self.darc.darc.clone(); @@ -216,13 +218,15 @@ impl LocalRwDarcWriteHandle { /// *guard += my_pe; ///``` pub fn block(self) -> LocalRwDarcWriteGuard { - let msg = format!(" + if std::thread::current().id() != *crate::MAIN_THREAD { + let msg = format!(" [LAMELLAR WARNING] You are calling `LocalRwDarcWriteHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! 
Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } } let inner_darc = self.darc.darc.clone(); @@ -329,13 +333,15 @@ impl GlobalRwDarcReadHandle { /// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); ///``` pub fn block(self) -> GlobalRwDarcReadGuard { - let msg = format!(" + if std::thread::current().id() != *crate::MAIN_THREAD { + let msg = format!(" [LAMELLAR WARNING] You are calling `GlobalRwDarcReadHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } } let _ = self.lock_am.blocking_wait(); @@ -425,13 +431,15 @@ impl GlobalRwDarcWriteHandle { /// *guard += my_pe; ///``` pub fn block(self) -> GlobalRwDarcWriteGuard { - let msg = format!(" + if std::thread::current().id() != *crate::MAIN_THREAD { + let msg = format!(" [LAMELLAR WARNING] You are calling `GlobalRwDarcWriteHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } } let _ = self.lock_am.blocking_wait(); @@ -500,13 +508,15 @@ impl GlobalRwDarcCollectiveWriteHandle { /// let mut guard = handle.block(); //block until we get the write lock /// *guard += my_pe; pub fn block(self) -> GlobalRwDarcCollectiveWriteGuard { - let msg = format!(" + if std::thread::current().id() != *crate::MAIN_THREAD { + let msg = format!(" [LAMELLAR WARNING] You are calling `GlobalRwDarcCollectiveWriteHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! 
Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } } let _ = self.lock_am.blocking_wait(); @@ -699,7 +709,7 @@ pub struct IntoLocalRwDarcHandle { } impl IntoLocalRwDarcHandle { - /// Used to drive to conversion of a [Darc] or [GlobalRwDarc] into a [LocalRwDarc] + /// Used to drive the conversion of a [Darc] or [GlobalRwDarc] into a [LocalRwDarc] /// # Examples /// ///``` @@ -709,6 +719,16 @@ impl IntoLocalRwDarcHandle { /// let five = GlobalRwDarc::new(&world,5).expect("PE in world team"); /// let five_as_localrw = five.into_localrw().block(); pub fn block(self) -> LocalRwDarc { + if std::thread::current().id() != *crate::MAIN_THREAD { + let msg = format!(" + [LAMELLAR WARNING] You are calling `IntoLocalRwDarcHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() + ); + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } + } self.team.clone().block_on(self) } } @@ -775,6 +795,16 @@ impl IntoGlobalRwDarcHandle { /// let five = LocalRwDarc::new(&world,5).expect("PE in world team"); /// let five_as_globalrw = five.into_globalrw().block(); pub fn block(self) -> GlobalRwDarc { + if std::thread::current().id() != *crate::MAIN_THREAD { + let msg = format!(" + [LAMELLAR WARNING] You are calling `IntoGlobalRwDarcHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! + Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() + ); + match config().blocking_call_warning { + Some(val) if val => println!("{msg}"), + _ => println!("{msg}"), + } + } self.team.clone().block_on(self) } } From 57a4f31d9902d2131a30a0e935b64dbc3c4172a8 Mon Sep 17 00:00:00 2001 From: "Ryan D.
Friese" Date: Tue, 22 Oct 2024 13:28:54 -0700 Subject: [PATCH 100/116] refactor warnings, add features to panic, and disable completely --- Cargo.toml | 2 + build.rs | 1 + examples/kernels/am_flops.rs | 11 +- run_examples.sh | 2 +- src/active_messaging.rs | 16 +- src/active_messaging/handle.rs | 136 ++++++-- src/array.rs | 40 +-- src/array/atomic.rs | 4 +- src/array/global_lock_atomic.rs | 7 +- src/array/global_lock_atomic/handle.rs | 195 ++++++++---- src/array/handle.rs | 16 +- .../distributed_iterator/consumer/collect.rs | 6 + .../distributed_iterator/consumer/count.rs | 6 + .../distributed_iterator/consumer/for_each.rs | 14 +- .../distributed_iterator/consumer/reduce.rs | 6 + .../distributed_iterator/consumer/sum.rs | 6 + .../local_iterator/consumer/collect.rs | 6 + .../iterator/local_iterator/consumer/count.rs | 6 + .../local_iterator/consumer/for_each.rs | 6 + .../local_iterator/consumer/reduce.rs | 6 + .../iterator/local_iterator/consumer/sum.rs | 6 + src/array/iterator/one_sided_iterator.rs | 10 +- src/array/local_lock_atomic.rs | 2 +- src/array/local_lock_atomic/handle.rs | 237 ++++++++++---- src/array/local_lock_atomic/iteration.rs | 5 - src/array/local_lock_atomic/local_chunks.rs | 1 - src/array/operations/handle.rs | 35 +- src/array/read_only.rs | 4 +- src/array/unsafe.rs | 52 ++- src/array/unsafe/iteration/distributed.rs | 18 +- src/array/unsafe/operations.rs | 39 +-- src/barrier.rs | 63 ++-- src/darc.rs | 3 - src/darc/global_rw_darc.rs | 4 - src/darc/handle.rs | 300 +++++++++++++----- src/darc/local_rw_darc.rs | 16 - src/env_var.rs | 10 +- src/lamellar_task_group.rs | 211 ++++++++---- src/lamellar_team.rs | 250 +++++++-------- src/lib.rs | 2 + src/scheduler.rs | 21 +- src/warnings.rs | 99 ++++++ 42 files changed, 1263 insertions(+), 617 deletions(-) create mode 100644 src/warnings.rs diff --git a/Cargo.toml b/Cargo.toml index b316d1dc..0a2740e7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -75,6 +75,8 @@ enable-rofi-shared=["rofi","rofisys/shared","libc"] rofi=[] tokio-executor=["tokio"] slurm-test=[] +disable-runtime-warnings=[] +runtime-warnings-panic=[] default=[] diff --git a/build.rs b/build.rs index d04f36d0..ae35a6d0 100644 --- a/build.rs +++ b/build.rs @@ -4,6 +4,7 @@ use std::env; use std::path::PathBuf; fn main() { + println!("cargo:rerun-if-env-changed=DEP_ROFI_ROOT"); #[cfg(feature = "enable-rofi-shared")] { if let Ok(rofi_lib_dir) = env::var("DEP_ROFI_ROOT") { diff --git a/examples/kernels/am_flops.rs b/examples/kernels/am_flops.rs index 2e975cc8..d1c5bb4b 100644 --- a/examples/kernels/am_flops.rs +++ b/examples/kernels/am_flops.rs @@ -98,15 +98,20 @@ fn main() { if my_pe == 0 { for _j in 0..num_tasks { let sub_timer = Instant::now(); - reqs.push(world.exec_am_all(FlopAM { - iterations: num_iterations, - })); + reqs.push( + world + .exec_am_all(FlopAM { + iterations: num_iterations, + }) + .spawn(), + ); sub_time += sub_timer.elapsed().as_secs_f64(); } println!("issue time: {:?}", timer.elapsed().as_secs_f64()); world.wait_all(); } + world.barrier(); let cur_t = timer.elapsed().as_secs_f64(); let tot_flop: usize = reqs diff --git a/run_examples.sh b/run_examples.sh index e26e1a6a..2cefa297 100755 --- a/run_examples.sh +++ b/run_examples.sh @@ -6,7 +6,7 @@ target_dir=$PWD/target output_dir=/home/scratch/$USER root=$PWD -. $root/../junction-prep.rc +# . 
$root/../junction-prep.rc local_results_dir=async_backends results_dir=${output_dir}/rofiverbs_lamellae/${local_results_dir} diff --git a/src/active_messaging.rs b/src/active_messaging.rs index 81a751d1..23b13690 100644 --- a/src/active_messaging.rs +++ b/src/active_messaging.rs @@ -923,6 +923,7 @@ pub(crate) enum RetType { #[derive(Debug)] pub(crate) struct AMCounters { pub(crate) outstanding_reqs: Arc, + pub(crate) launched_req_cnt: AtomicUsize, pub(crate) send_req_cnt: AtomicUsize, } @@ -930,12 +931,21 @@ impl AMCounters { pub(crate) fn new() -> AMCounters { AMCounters { outstanding_reqs: Arc::new(AtomicUsize::new(0)), + launched_req_cnt: AtomicUsize::new(0), send_req_cnt: AtomicUsize::new(0), } } - pub(crate) fn add_send_req(&self, num: usize) { - let _num_reqs = self.outstanding_reqs.fetch_add(num, Ordering::SeqCst); - // println!("add_send_req {}",_num_reqs+1); + + pub(crate) fn inc_launched(&self, num: usize) { + self.launched_req_cnt.fetch_add(num, Ordering::SeqCst); + } + pub(crate) fn inc_outstanding(&self, num: usize) { + self.outstanding_reqs.fetch_add(num, Ordering::SeqCst); + } + pub(crate) fn dec_outstanding(&self, num: usize) { + self.outstanding_reqs.fetch_sub(num, Ordering::SeqCst); + } + pub(crate) fn inc_send_req(&self, num: usize) { self.send_req_cnt.fetch_add(num, Ordering::SeqCst); } } diff --git a/src/active_messaging/handle.rs b/src/active_messaging/handle.rs index c4797ccc..1108751b 100644 --- a/src/active_messaging/handle.rs +++ b/src/active_messaging/handle.rs @@ -18,25 +18,27 @@ use crate::{ lamellar_request::{InternalResult, LamellarRequest, LamellarRequestAddResult}, memregion::one_sided::MemRegionHandleInner, scheduler::{LamellarTask, Scheduler}, + warnings::RuntimeWarning, Darc, LamellarArchRT, }; -use super::{AmDist, DarcSerde, RemotePtr}; +use super::{AMCounters, Am, AmDist, DarcSerde, RemotePtr}; pub(crate) struct AmHandleInner { pub(crate) ready: AtomicBool, pub(crate) waker: Mutex>, pub(crate) data: Cell>, //we only issue a single request, which the runtime will update, but the user also has a handle so we need a way to mutate - pub(crate) team_outstanding_reqs: Arc, - pub(crate) world_outstanding_reqs: Arc, - pub(crate) tg_outstanding_reqs: Option>, + pub(crate) team_counters: Arc, + pub(crate) world_counters: Arc, + pub(crate) tg_counters: Option>, pub(crate) scheduler: Arc, pub(crate) user_handle: AtomicU8, } impl std::fmt::Debug for AmHandleInner { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "AmHandleInner {{ ready: {:?}, team_outstanding_reqs: {:?} world_outstanding_reqs {:?} tg_outstanding_reqs {:?} user_handle{:?} }}", self.ready.load(Ordering::Relaxed), self.team_outstanding_reqs.load(Ordering::Relaxed), self.world_outstanding_reqs.load(Ordering::Relaxed), self.tg_outstanding_reqs.as_ref().map(|x| x.load(Ordering::Relaxed)), self.user_handle.load(Ordering::Relaxed)) + write!(f, "AmHandleInner {{ ready: {:?}, team_outstanding_reqs: {:?} world_outstanding_reqs {:?} tg_outstanding_reqs {:?} user_handle{:?} }}", self.ready.load(Ordering::Relaxed), + self.team_counters.outstanding_reqs.load(Ordering::Relaxed), self.world_counters.outstanding_reqs.load(Ordering::Relaxed), self.tg_counters.as_ref().map(|x| x.outstanding_reqs.load(Ordering::Relaxed)), self.user_handle.load(Ordering::Relaxed)) } } @@ -56,26 +58,29 @@ impl LamellarRequestAddResult for AmHandleInner { } } fn update_counters(&self, _sub_id: usize) { - let _team_reqs = self.team_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); - let _world_req = 
self.world_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); - if let Some(tg_outstanding_reqs) = self.tg_outstanding_reqs.clone() { - tg_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); + self.team_counters.dec_outstanding(1); + self.world_counters.dec_outstanding(1); + if let Some(tg_counters) = self.tg_counters.clone() { + tg_counters.dec_outstanding(1); } } } - /// A handle to an active messaging request that executes on a singe PE #[derive(Debug)] #[pin_project(PinnedDrop)] -#[must_use = "active messaging handles do nothing unless polled or awaited or 'spawn()' or 'block()' are called"] +#[must_use = "active messaging handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] pub struct AmHandle { pub(crate) inner: Arc, + pub(crate) am: Option<(Am, usize)>, pub(crate) _phantom: std::marker::PhantomData, } #[pinned_drop] impl PinnedDrop for AmHandle { fn drop(self: Pin<&mut Self>) { + if self.am.is_some() { + RuntimeWarning::DroppedHandle("an AmHandle").print(); + } self.inner.user_handle.fetch_sub(1, Ordering::SeqCst); } } @@ -124,22 +129,40 @@ impl AmHandle { } } + fn launch_am_if_needed(&mut self) { + if let Some((am, num_pes)) = self.am.take() { + self.inner.team_counters.inc_outstanding(num_pes); + self.inner.team_counters.inc_launched(num_pes); + self.inner.world_counters.inc_outstanding(num_pes); + self.inner.world_counters.inc_launched(num_pes); + if let Some(tg_counters) = self.inner.tg_counters.clone() { + tg_counters.inc_outstanding(num_pes); + tg_counters.inc_launched(num_pes); + } + self.inner.scheduler.submit_am(am); + } + } /// This method will spawn the associated Active Message on the work queue, /// initiating the remote operation. /// /// This function returns a handle that can be used to wait for the operation to complete #[must_use = "this function returns a future used to poll for completion. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. 
Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] - pub fn spawn(self) -> LamellarTask { + pub fn spawn(mut self) -> LamellarTask { + self.launch_am_if_needed(); self.inner.scheduler.clone().spawn_task(self) } /// This method will block the calling thread until the associated Array Operation completes - pub fn block(self) -> T { + pub fn block(mut self) -> T { + RuntimeWarning::BlockingCall("AmHandle::block", ".spawn() or .await") + .print(); + self.launch_am_if_needed(); self.inner.scheduler.clone().block_on(self) } } impl LamellarRequest for AmHandle { - fn blocking_wait(self) -> T { + fn blocking_wait(mut self) -> T { + self.launch_am_if_needed(); while !self.inner.ready.load(Ordering::SeqCst) { self.inner.scheduler.exec_task(); } @@ -147,6 +170,7 @@ impl LamellarRequest for AmHandle { } fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { + self.launch_am_if_needed(); let mut cur_waker = self.inner.waker.lock(); if self.inner.ready.load(Ordering::SeqCst) { true @@ -175,6 +199,7 @@ impl LamellarRequest for AmHandle { impl Future for AmHandle { type Output = T; fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + self.launch_am_if_needed(); let mut this = self.as_mut(); if this.ready_or_set_waker(cx.waker()) { Poll::Ready( @@ -189,15 +214,19 @@ impl Future for AmHandle { /// A handle to an active messaging request that executes on the local (originating) PE #[derive(Debug)] #[pin_project(PinnedDrop)] -#[must_use = "active messaging handles do nothing unless polled or awaited or 'spawn()' or 'block()' are called"] +#[must_use = "active messaging handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] pub struct LocalAmHandle { pub(crate) inner: Arc, + pub(crate) am: Option<(Am, usize)>, pub(crate) _phantom: std::marker::PhantomData, } #[pinned_drop] impl PinnedDrop for LocalAmHandle { fn drop(self: Pin<&mut Self>) { + if self.am.is_some() { + RuntimeWarning::DroppedHandle("a LocalAmHandle").print(); + } self.inner.user_handle.fetch_sub(1, Ordering::SeqCst); } } @@ -224,6 +253,19 @@ impl LocalAmHandle { } } } + fn launch_am_if_needed(&mut self) { + if let Some((am, num_pes)) = self.am.take() { + self.inner.team_counters.inc_outstanding(num_pes); + self.inner.team_counters.inc_launched(num_pes); + self.inner.world_counters.inc_outstanding(num_pes); + self.inner.world_counters.inc_launched(num_pes); + if let Some(tg_counters) = self.inner.tg_counters.clone() { + tg_counters.inc_outstanding(num_pes); + tg_counters.inc_launched(num_pes); + } + self.inner.scheduler.submit_am(am); + } + } } impl LocalAmHandle { @@ -232,27 +274,33 @@ impl LocalAmHandle { /// /// This function returns a handle that can be used to wait for the operation to complete #[must_use = "this function returns a future used to poll for completion. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. 
Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] - pub fn spawn(self) -> LamellarTask { + pub fn spawn(mut self) -> LamellarTask { + self.launch_am_if_needed(); self.inner.scheduler.clone().spawn_task(self) } /// This method will block the calling thread until the associated Array Operation completes - pub fn block(self) -> T { + pub fn block(mut self) -> T { + RuntimeWarning::BlockingCall("LocalAmHandle::block", ".spawn() or .await") + .print(); + self.launch_am_if_needed(); self.inner.scheduler.clone().block_on(self) } } impl From> for AmHandle { - fn from(x: LocalAmHandle) -> Self { + fn from(mut x: LocalAmHandle) -> Self { x.inner.user_handle.fetch_add(1, Ordering::SeqCst); Self { inner: x.inner.clone(), + am: x.am.take(), _phantom: std::marker::PhantomData, } } } impl LamellarRequest for LocalAmHandle { - fn blocking_wait(self) -> T { + fn blocking_wait(mut self) -> T { + self.launch_am_if_needed(); while !self.inner.ready.load(Ordering::SeqCst) { self.inner.scheduler.exec_task(); } @@ -261,6 +309,7 @@ impl LamellarRequest for LocalAmHandle { } fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { + self.launch_am_if_needed(); let mut cur_waker = self.inner.waker.lock(); if self.inner.ready.load(Ordering::SeqCst) { true @@ -290,6 +339,7 @@ impl LamellarRequest for LocalAmHandle { impl Future for LocalAmHandle { type Output = T; fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + self.launch_am_if_needed(); let mut this = self.as_mut(); if this.ready_or_set_waker(cx.waker()) { Poll::Ready( @@ -307,9 +357,9 @@ pub(crate) struct MultiAmHandleInner { pub(crate) arch: Arc, pub(crate) data: Mutex>, pub(crate) waker: Mutex>, - pub(crate) team_outstanding_reqs: Arc, - pub(crate) world_outstanding_reqs: Arc, - pub(crate) tg_outstanding_reqs: Option>, + pub(crate) team_counters: Arc, + pub(crate) world_counters: Arc, + pub(crate) tg_counters: Option>, pub(crate) scheduler: Arc, pub(crate) user_handle: AtomicU8, //we can use this flag to optimize what happens when the request returns } @@ -317,15 +367,19 @@ pub(crate) struct MultiAmHandleInner { /// A handle to an active messaging request that executes on multiple PEs, returned from a call to [exec_am_all][crate::ActiveMessaging::exec_am_all] #[derive(Debug)] #[pin_project(PinnedDrop)] -#[must_use = "active messaging handles do nothing unless polled or awaited or 'spawn()' or 'block()' are called"] +#[must_use = "active messaging handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] pub struct MultiAmHandle { pub(crate) inner: Arc, + pub(crate) am: Option<(Am, usize)>, pub(crate) _phantom: std::marker::PhantomData, } #[pinned_drop] impl PinnedDrop for MultiAmHandle { fn drop(self: Pin<&mut Self>) { + if self.am.is_some() { + RuntimeWarning::DroppedHandle("a MultiAmHandle").print(); + } self.inner.user_handle.fetch_sub(1, Ordering::SeqCst); } } @@ -345,10 +399,10 @@ impl LamellarRequestAddResult for MultiAmHandleInner { } } fn update_counters(&self, _sub_id: usize) { - let _team_reqs = self.team_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); - let _world_req = self.world_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); - if let Some(tg_outstanding_reqs) = self.tg_outstanding_reqs.clone() { - tg_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); + self.team_counters.dec_outstanding(1); + self.world_counters.dec_outstanding(1); + if let Some(tg_counters) = self.tg_counters.clone() { + tg_counters.dec_outstanding(1); } } } @@ -395,22 +449,42 @@ impl MultiAmHandle { } } } + 
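To make the effect of the new am field and launch_am_if_needed concrete, an illustrative sketch of the deferred-launch behavior follows; MyAm stands in for any user-defined active message built with the crate's AM macros, and only handle methods shown in this patch are used:

use lamellar::active_messaging::prelude::*;

fn deferred_launch_sketch(world: &lamellar::LamellarWorld, am: MyAm) {
    // Constructing the handle no longer submits the active message; the AM is
    // stashed inside the handle until it is first spawned, blocked on, or polled.
    let handle = world.exec_am_all(am);

    // Work performed here is not ordered with respect to the AM, because nothing
    // has been launched yet.

    // spawn() (like .await or block()) calls launch_am_if_needed(), which bumps the
    // launched/outstanding counters and submits the AM to the scheduler.
    let task = handle.spawn();

    // Dropping `handle` without spawning, blocking, or awaiting it would instead
    // print the RuntimeWarning::DroppedHandle("a MultiAmHandle") message and the
    // AM would never run.
    let _results = task.block();
}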
+ fn launch_am_if_needed(&mut self) { + if let Some((am, num_pes)) = self.am.take() { + self.inner.team_counters.inc_outstanding(num_pes); + self.inner.team_counters.inc_launched(num_pes); + self.inner.world_counters.inc_outstanding(num_pes); + self.inner.world_counters.inc_launched(num_pes); + if let Some(tg_counters) = self.inner.tg_counters.clone() { + tg_counters.inc_outstanding(num_pes); + tg_counters.inc_launched(num_pes); + } + self.inner.scheduler.submit_am(am); + } + } + /// This method will spawn the associated Active Message on the work queue, /// initiating the remote operation. /// /// This function returns a handle that can be used to wait for the operation to complete #[must_use = "this function returns a future used to poll for completion. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] - pub fn spawn(self) -> LamellarTask> { + pub fn spawn(mut self) -> LamellarTask> { + self.launch_am_if_needed(); self.inner.scheduler.clone().spawn_task(self) } /// This method will block the calling thread until the associated Array Operation completes - pub fn block(self) -> Vec { + pub fn block(mut self) -> Vec { + RuntimeWarning::BlockingCall("MultiAmHandle::block", ".spawn() or .await") + .print(); + self.launch_am_if_needed(); self.inner.scheduler.clone().block_on(self) } } impl LamellarRequest for MultiAmHandle { - fn blocking_wait(self) -> Self::Output { + fn blocking_wait(mut self) -> Self::Output { + self.launch_am_if_needed(); while self.inner.cnt.load(Ordering::SeqCst) > 0 { self.inner.scheduler.exec_task(); } @@ -424,6 +498,7 @@ impl LamellarRequest for MultiAmHandle { } fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { + self.launch_am_if_needed(); let mut cur_waker = self.inner.waker.lock(); if self.inner.cnt.load(Ordering::SeqCst) == 0 { true @@ -457,6 +532,7 @@ impl LamellarRequest for MultiAmHandle { impl Future for MultiAmHandle { type Output = Vec; fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + self.launch_am_if_needed(); let mut this = self.as_mut(); if this.ready_or_set_waker(cx.waker()) { let mut res = vec![]; diff --git a/src/array.rs b/src/array.rs index 383fdbd8..68cb8f00 100644 --- a/src/array.rs +++ b/src/array.rs @@ -190,9 +190,9 @@ pub struct ReduceKey { } crate::inventory::collect!(ReduceKey); -// impl Dist for bool {} -// lamellar_impl::generate_reductions_for_type_rt!(true, u8, usize, isize); -// lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, usize, isize); +impl Dist for bool {} +lamellar_impl::generate_reductions_for_type_rt!(true, u8, usize, isize); +lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, usize, isize); // lamellar_impl::generate_reductions_for_type_rt!(false, f32); // lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32); @@ -201,20 +201,20 @@ crate::inventory::collect!(ReduceKey); // lamellar_impl::generate_ops_for_type_rt!(true, false, true, u128); // //------------------------------------ -lamellar_impl::generate_reductions_for_type_rt!(true, u8, u16, u32, u64, usize); -lamellar_impl::generate_reductions_for_type_rt!(false, u128); -lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, u16, u32, u64, usize); -lamellar_impl::generate_ops_for_type_rt!(true, false, true, u128); +// lamellar_impl::generate_reductions_for_type_rt!(true, u8, u16, u32, u64, usize); +// 
lamellar_impl::generate_reductions_for_type_rt!(false, u128); +// lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, u16, u32, u64, usize); +// lamellar_impl::generate_ops_for_type_rt!(true, false, true, u128); -lamellar_impl::generate_reductions_for_type_rt!(true, i8, i16, i32, i64, isize); -lamellar_impl::generate_reductions_for_type_rt!(false, i128); -lamellar_impl::generate_ops_for_type_rt!(true, true, true, i8, i16, i32, i64, isize); -lamellar_impl::generate_ops_for_type_rt!(true, false, true, i128); +// lamellar_impl::generate_reductions_for_type_rt!(true, i8, i16, i32, i64, isize); +// lamellar_impl::generate_reductions_for_type_rt!(false, i128); +// lamellar_impl::generate_ops_for_type_rt!(true, true, true, i8, i16, i32, i64, isize); +// lamellar_impl::generate_ops_for_type_rt!(true, false, true, i128); -lamellar_impl::generate_reductions_for_type_rt!(false, f32, f64); -lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32, f64); +// lamellar_impl::generate_reductions_for_type_rt!(false, f32, f64); +// lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32, f64); -lamellar_impl::generate_ops_for_bool_rt!(); +// lamellar_impl::generate_ops_for_bool_rt!(); impl Dist for Option {} impl ArrayOps for Option {} @@ -1665,7 +1665,7 @@ pub trait ArrayPrint: LamellarArray { /// let array_clone = array.clone(); /// let _ = array.local_iter().for_each(move |_| { /// let index = rand::thread_rng().gen_range(0..array_clone.len()); -/// array_clone.add(index,1); //randomly at one to an element in the array. +/// let _ = array_clone.add(index,1).spawn(); //randomly at one to an element in the array. /// }).block(); /// let sum = array.block_on(array.sum()).expect("array len > 0"); // atomic updates still possibly happening, output non deterministic /// println!("sum {sum}"); @@ -1679,7 +1679,7 @@ pub trait ArrayPrint: LamellarArray { /// let array_clone = array.clone(); /// let req = array.local_iter().for_each(move |_| { /// let index = rand::thread_rng().gen_range(0..array_clone.len()); -/// array_clone.add(index,1); //randomly at one to an element in the array. +/// let _ = array_clone.add(index,1).spawn(); //randomly at one to an element in the array. /// }); /// array.block_on(req);// this is not sufficient, we also need to "wait_all" as each "add" call is another request /// array.wait_all(); @@ -1696,7 +1696,7 @@ pub trait ArrayPrint: LamellarArray { /// let array_clone = array.clone(); /// let req = array.local_iter().for_each(move |_| { /// let index = rand::thread_rng().gen_range(0..array_clone.len()); -/// array_clone.add(index,1); //randomly at one to an element in the array. +/// let _ = array_clone.add(index,1).spawn(); //randomly at one to an element in the array. /// }); /// array.block_on(req);// this is not sufficient, we also need to "wait_all" as each "add" call is another request /// array.wait_all(); @@ -1714,7 +1714,7 @@ pub trait ArrayPrint: LamellarArray { /// let array_clone = array.clone(); /// let _ = array.local_iter().for_each(move |_| { /// let index = rand::thread_rng().gen_range(0..array_clone.len()); -/// array_clone.add(index,1); //randomly at one to an element in the array. +/// let _ = array_clone.add(index,1).spawn(); //randomly at one to an element in the array. 
/// }).block(); /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE /// let sum = array.block_on(array.sum()).expect("array len > 0"); // No updates occuring anywhere anymore so we have a deterministic result @@ -1778,7 +1778,7 @@ where /// let array_clone = array.clone(); /// let _ = array.local_iter().for_each(move |_| { /// let index = rand::thread_rng().gen_range(0..array_clone.len()); - /// array_clone.add(index,1); //randomly at one to an element in the array. + /// let _ = array_clone.add(index,1).spawn(); //randomly at one to an element in the array. /// }).block(); /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE /// let sum = array.block_on(array.reduce("sum")).expect("array len > 0"); // equivalent to calling array.sum() @@ -1929,7 +1929,7 @@ where /// let array_clone = array.clone(); /// let _ = array.local_iter().for_each(move |_| { /// let index = rand::thread_rng().gen_range(0..array_clone.len()); -/// array_clone.add(index,1); //randomly at one to an element in the array. +/// let _ = array_clone.add(index,1).spawn(); //randomly at one to an element in the array. /// }).block(); /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE /// let sum = array.block_on(array.sum()); diff --git a/src/array/atomic.rs b/src/array/atomic.rs index f404d116..dd75b7d5 100644 --- a/src/array/atomic.rs +++ b/src/array/atomic.rs @@ -1303,7 +1303,7 @@ impl AtomicArray { /// let array_clone = array.clone(); /// let _ = array.local_iter().for_each(move |_| { /// let index = rand::thread_rng().gen_range(0..array_clone.len()); - /// array_clone.add(index,1); //randomly at one to an element in the array. + /// let _ = array_clone.add(index,1).spawn(); //randomly at one to an element in the array. /// }).block(); /// world.wait_all(); /// world.barrier(); @@ -1352,7 +1352,7 @@ impl AtomicArray { /// let array_clone = array.clone(); /// let _ = array.local_iter().for_each(move |_| { /// let index = rand::thread_rng().gen_range(0..array_clone.len()); - /// array_clone.add(index,1); //randomly add one to an element in the array. + /// let _ = array_clone.add(index,1).spawn(); //randomly add one to an element in the array. 
/// }).block(); /// world.wait_all(); /// world.barrier(); diff --git a/src/array/global_lock_atomic.rs b/src/array/global_lock_atomic.rs index c7a126c0..1a2f5cb1 100644 --- a/src/array/global_lock_atomic.rs +++ b/src/array/global_lock_atomic.rs @@ -11,7 +11,6 @@ use crate::array::private::LamellarArrayPrivate; use crate::array::r#unsafe::{UnsafeByteArray, UnsafeByteArrayWeak}; use crate::array::*; use crate::barrier::BarrierHandle; -use crate::config; use crate::darc::global_rw_darc::{ GlobalRwDarc, GlobalRwDarcCollectiveWriteGuard, GlobalRwDarcReadGuard, GlobalRwDarcWriteGuard, }; @@ -20,6 +19,7 @@ use crate::lamellar_request::LamellarRequest; use crate::lamellar_team::{IntoLamellarTeam, LamellarTeamRT}; use crate::memregion::Dist; use crate::scheduler::LamellarTask; +use crate::warnings::RuntimeWarning; use pin_project::pin_project; @@ -1030,6 +1030,11 @@ impl GlobalLockArrayReduceHandle { /// This method will block the caller until the associated Array Reduce Operation completes pub fn block(self) -> Option { + RuntimeWarning::BlockingCall( + "GlobalLockArrayReduceHandle::block", + ".spawn() or .await", + ) + .print(); self.lock_guard.array.clone().block_on(self) } } diff --git a/src/array/global_lock_atomic/handle.rs b/src/array/global_lock_atomic/handle.rs index f37b7622..823ff0bd 100644 --- a/src/array/global_lock_atomic/handle.rs +++ b/src/array/global_lock_atomic/handle.rs @@ -1,10 +1,11 @@ use std::pin::Pin; use std::task::{Context, Poll}; -use crate::config; use crate::darc::handle::{ GlobalRwDarcCollectiveWriteHandle, GlobalRwDarcReadHandle, GlobalRwDarcWriteHandle, }; +use crate::scheduler::LamellarTask; +use crate::warnings::RuntimeWarning; use crate::Dist; use crate::GlobalLockArray; @@ -16,7 +17,7 @@ use super::{ GlobalLockReadGuard, GlobalLockWriteGuard, }; -#[must_use] +#[must_use = "GlobalLockArray lock handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] #[pin_project] /// Handle used to retrieve the aquired read lock of a GlobalLockArray /// @@ -65,19 +66,34 @@ impl GlobalLockReadHandle { /// let guard = handle.block(); ///``` pub fn block(self) -> GlobalLockReadGuard { - if std::thread::current().id() != *crate::MAIN_THREAD { - let msg = format!(" - [LAMELLAR WARNING] You are calling `GlobalLockReadHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() - ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), - } - } + RuntimeWarning::BlockingCall( + "GlobalLockReadHandle::block", + ".spawn() or.await", + ) + .print(); self.array.lock.darc.team().scheduler.block_on(self) } + + /// This method will spawn the associated active message to capture the lock on the work queue, + /// initiating the remote operation. 
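The RuntimeWarning type used above lives in the new src/warnings.rs added by this patch, whose body does not appear in this excerpt. As rough orientation, a minimal shape consistent with the call sites (RuntimeWarning::BlockingCall(..).print(), RuntimeWarning::DroppedHandle(..).print()) and with the new disable-runtime-warnings/runtime-warnings-panic features might look as follows; beyond the variant and method names, everything here is an assumption rather than the actual implementation:

pub(crate) enum RuntimeWarning<'a> {
    // a blocking call (e.g. `block()`) was made where `.spawn()`/`.await` would be safer
    BlockingCall(&'a str, &'a str),
    // a must_use handle was dropped without ever being spawned, blocked on, or awaited
    DroppedHandle(&'a str),
}

impl<'a> RuntimeWarning<'a> {
    fn message(&self) -> String {
        match self {
            RuntimeWarning::BlockingCall(call, alt) => format!(
                "[LAMELLAR WARNING] `{call}` called from within an async context, which may lead to deadlock; consider using {alt} instead"
            ),
            RuntimeWarning::DroppedHandle(handle) => format!(
                "[LAMELLAR WARNING] dropping {handle} that was never spawned, blocked on, or awaited; the operation will never execute"
            ),
        }
    }

    pub(crate) fn print(self) {
        #[cfg(not(feature = "disable-runtime-warnings"))]
        {
            // Blocking-call warnings are only relevant off the main thread.
            if let RuntimeWarning::BlockingCall(..) = &self {
                if std::thread::current().id() == *crate::MAIN_THREAD {
                    return;
                }
            }
            // Optionally escalate warnings to panics (useful in testing).
            #[cfg(feature = "runtime-warnings-panic")]
            panic!("{}", self.message());
            #[cfg(not(feature = "runtime-warnings-panic"))]
            println!("{}", self.message());
        }
    }
}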
+ /// + /// This function returns a handle that can be used to wait for the operation to complete + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// let handle = array.read_lock(); + /// let task = handle.spawn(); // initiate getting the read lock + /// // do other work + /// let guard = task.block(); + ///``` + #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] + pub fn spawn(self) -> LamellarTask> { + self.array.lock.darc.team().spawn(self) + } } impl Future for GlobalLockReadHandle { @@ -94,7 +110,7 @@ impl Future for GlobalLockReadHandle { } } -#[must_use] +#[must_use = "GlobalLockArray lock handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] #[pin_project] /// Handle used to retrieve the aquired local data [GlobalLockLocalData] of a GlobalLockArray /// @@ -142,19 +158,35 @@ impl GlobalLockLocalDataHandle { /// println!("local data: {:?}",local_data); ///``` pub fn block(self) -> GlobalLockLocalData { - if std::thread::current().id() != *crate::MAIN_THREAD { - let msg = format!(" - [LAMELLAR WARNING] You are calling `GlobalLockLocalDataHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() - ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), - } - } + RuntimeWarning::BlockingCall( + "GlobalLockLocalDataHandle::block", + ".spawn() or.await", + ) + .print(); self.array.lock.darc.team().scheduler.block_on(self) } + /// This method will spawn the associated active message to capture the lock and data on the work queue, + /// initiating the remote operation. + /// + /// This function returns a handle that can be used to wait for the operation to complete + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// let handle = array.read_local_data(); + /// let task = handle.spawn(); // initiate getting the read lock + /// // do other work + /// let local_data = task.block(); + /// println!("local data: {:?}",local_data); + ///``` + #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. 
Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] + pub fn spawn(mut self) -> LamellarTask> { + self.array.lock.darc.team().spawn(self) + } } impl Future for GlobalLockLocalDataHandle { @@ -174,7 +206,7 @@ impl Future for GlobalLockLocalDataHandle { } } -#[must_use] +#[must_use = "GlobalLockArray lock handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] #[pin_project] /// Handle used to retrieve the aquired write lock of a GlobalLockArray /// @@ -220,22 +252,37 @@ impl GlobalLockWriteHandle { /// let my_pe = world.my_pe(); /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); /// let handle = array.write_lock(); - /// handle.block(); + /// let guard = handle.block(); ///``` pub fn block(self) -> GlobalLockWriteGuard { - if std::thread::current().id() != *crate::MAIN_THREAD { - let msg = format!(" - [LAMELLAR WARNING] You are calling `GlobalLockWriteHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() - ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), - } - } + RuntimeWarning::BlockingCall( + "GlobalLockWriteHandle::block", + ".spawn() or.await", + ) + .print(); self.array.lock.darc.team().scheduler.block_on(self) } + + /// This method will spawn the associated active message to capture the lock on the work queue, + /// initiating the remote operation. + /// + /// This function returns a handle that can be used to wait for the operation to complete + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// let handle = array.write_lock(); + /// let task = handle.spawn(); // initiate getting the read lock + /// // do other work + /// let guard = task.block(); + ///``` + #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] + pub fn spawn(mut self) -> LamellarTask> { + self.array.lock.darc.team().spawn(self) + } } impl Future for GlobalLockWriteHandle { @@ -252,7 +299,7 @@ impl Future for GlobalLockWriteHandle { } } -#[must_use] +#[must_use = "GlobalLockArray lock handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] #[pin_project] /// Handle used to retrieve the aquired mutable local data [GlobalLockMutLocalData] of a GlobalLockArray /// @@ -300,19 +347,36 @@ impl GlobalLockMutLocalDataHandle { /// local_data.iter_mut().for_each(|elem| *elem += my_pe); ///``` pub fn block(self) -> GlobalLockMutLocalData { - if std::thread::current().id() != *crate::MAIN_THREAD { - let msg = format!(" - [LAMELLAR WARNING] You are calling `GlobalLockLocalDataHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! 
- Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() - ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), - } - } + RuntimeWarning::BlockingCall( + "GlobalLockMutLocalDataHandle::block", + ".spawn() or.await", + ) + .print(); self.array.lock.darc.team().scheduler.block_on(self) } + + /// This method will spawn the associated active message to capture the lock and data on the work queue, + /// initiating the remote operation. + /// + /// This function returns a handle that can be used to wait for the operation to complete + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// let handle = array.write_local_data(); + /// let task = handle.spawn(); // initiate getting the read lock + /// // do other work + /// let mut local_data = task.block(); + /// local_data.iter_mut().for_each(|elem| *elem += my_pe); + ///``` + #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] + pub fn spawn(mut self) -> LamellarTask> { + self.array.lock.darc.team().spawn(self) + } } impl Future for GlobalLockMutLocalDataHandle { @@ -332,7 +396,7 @@ impl Future for GlobalLockMutLocalDataHandle { } } -#[must_use] +#[must_use = "GlobalLockArray lock handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] #[pin_project] /// Handle used to retrieve the aquired mutable local data [GlobalLockMutLocalData] of a GlobalLockArray with all PEs collectively accessing their local data /// @@ -381,19 +445,36 @@ impl GlobalLockCollectiveMutLocalDataHandle { /// local_data.iter_mut().for_each(|elem| *elem += my_pe); ///``` pub fn block(self) -> GlobalLockCollectiveMutLocalData { - if std::thread::current().id() != *crate::MAIN_THREAD { - let msg = format!(" - [LAMELLAR WARNING] You are calling `GlobalLockCollectiveMutLocalDataHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() - ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), - } - } + RuntimeWarning::BlockingCall( + "GlobalLockCollectiveMutLocalData::block", + ".spawn() or.await", + ) + .print(); self.array.lock.darc.team().scheduler.block_on(self) } + + /// This method will spawn the associated active message to capture the lock and data on the work queue, + /// initiating the remote operation. 
+ /// + /// This function returns a handle that can be used to wait for the operation to complete + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// let handle = array.collective_write_local_data(); + /// let task = handle.spawn(); // initiate getting the read lock + /// // do other work + /// let mut local_data = task.block(); + /// local_data.iter_mut().for_each(|elem| *elem += my_pe); + ///``` + #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] + pub fn spawn(mut self) -> LamellarTask> { + self.array.lock.darc.team().spawn(self) + } } impl Future for GlobalLockCollectiveMutLocalDataHandle { diff --git a/src/array/handle.rs b/src/array/handle.rs index 9dc48795..e0578df3 100644 --- a/src/array/handle.rs +++ b/src/array/handle.rs @@ -8,11 +8,7 @@ use std::{ use pin_project::pin_project; use crate::{ - active_messaging::{AmHandle, LocalAmHandle}, - array::LamellarByteArray, - lamellar_request::LamellarRequest, - scheduler::LamellarTask, - Dist, OneSidedMemoryRegion, RegisteredMemoryRegion, + active_messaging::{AmHandle, LocalAmHandle}, array::LamellarByteArray, lamellar_request::LamellarRequest, scheduler::LamellarTask, warnings::RuntimeWarning, Dist, OneSidedMemoryRegion, RegisteredMemoryRegion }; /// a task handle for an array rdma (put/get) operation @@ -33,6 +29,11 @@ impl ArrayRdmaHandle { /// This method will block the calling thread until the associated Array RDMA Operation completes pub fn block(self) -> () { + RuntimeWarning::BlockingCall( + "ArrayRdmaHandle::block", + ".spawn() or .await", + ) + .print(); self.array.team().block_on(self) } } @@ -90,6 +91,11 @@ impl ArrayRdmaAtHandle { /// This method will block the calling thread until the associated Array RDMA at Operation completes pub fn block(self) -> T { + RuntimeWarning::BlockingCall( + "ArrayRdmaAtHandle::block", + ".spawn() or .await", + ) + .print(); self.array.team().block_on(self) } } diff --git a/src/array/iterator/distributed_iterator/consumer/collect.rs b/src/array/iterator/distributed_iterator/consumer/collect.rs index 01a32f2d..64ce3f02 100644 --- a/src/array/iterator/distributed_iterator/consumer/collect.rs +++ b/src/array/iterator/distributed_iterator/consumer/collect.rs @@ -11,6 +11,7 @@ use crate::lamellar_task_group::TaskGroupLocalAmHandle; use crate::lamellar_team::LamellarTeamRT; use crate::memregion::Dist; use crate::scheduler::LamellarTask; +use crate::warnings::RuntimeWarning; use core::marker::PhantomData; use futures_util::{ready, Future}; @@ -294,6 +295,11 @@ where /// This method will block until the associated Collect operation completes and returns the result pub fn block(self) -> A { + RuntimeWarning::BlockingCall( + "DistIterCollectHandle::block", + ".spawn() or .await", + ) + .print(); self.team.clone().block_on(self) } diff --git a/src/array/iterator/distributed_iterator/consumer/count.rs b/src/array/iterator/distributed_iterator/consumer/count.rs index c8ee207b..6588000c 100644 --- a/src/array/iterator/distributed_iterator/consumer/count.rs +++ b/src/array/iterator/distributed_iterator/consumer/count.rs @@ -10,6 +10,7 @@ use crate::lamellar_request::LamellarRequest; use 
crate::lamellar_task_group::TaskGroupLocalAmHandle; use crate::lamellar_team::LamellarTeamRT; use crate::scheduler::LamellarTask; +use crate::warnings::RuntimeWarning; use crate::Darc; use async_trait::async_trait; @@ -218,6 +219,11 @@ impl DistIterCountHandle { /// This method will block until the associated Count operation completes and returns the result pub fn block(self) -> usize { + RuntimeWarning::BlockingCall( + "DistIterCountHandle::block", + ".spawn() or .await", + ) + .print(); self.team.clone().block_on(self) } diff --git a/src/array/iterator/distributed_iterator/consumer/for_each.rs b/src/array/iterator/distributed_iterator/consumer/for_each.rs index 24c3d15e..08df2314 100644 --- a/src/array/iterator/distributed_iterator/consumer/for_each.rs +++ b/src/array/iterator/distributed_iterator/consumer/for_each.rs @@ -8,6 +8,7 @@ use crate::lamellar_request::LamellarRequest; use crate::lamellar_task_group::TaskGroupLocalAmHandle; use crate::lamellar_team::LamellarTeamRT; use crate::scheduler::LamellarTask; +use crate::warnings::RuntimeWarning; use futures_util::{ready, Future}; use pin_project::pin_project; @@ -218,6 +219,11 @@ impl DistIterForEachHandle { /// This method will block until the associated For Each operation completes and returns the result pub fn block(self) { + RuntimeWarning::BlockingCall( + "DistIterForEachHandle::block", + ".spawn() or .await", + ) + .print(); self.team.clone().block_on(self); } /// This method will spawn the associated For Each Operation on the work queue, @@ -226,14 +232,6 @@ impl DistIterForEachHandle { /// This function returns a handle that can be used to wait for the operation to complete #[must_use = "this function returns a future used to poll for completion and retrieve the result. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. 
Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] pub fn spawn(self) -> LamellarTask<()> { - // match self.state { - // State::Barrier(ref barrier, _) => { - // println!("spawning task barrier id {:?}", barrier.barrier_id); - // } - // State::Reqs(_, barrier_id) => { - // println!("spawning task not sure I can be here {:?}", barrier_id); - // } - // } self.team.clone().scheduler.spawn_task(self) } } diff --git a/src/array/iterator/distributed_iterator/consumer/reduce.rs b/src/array/iterator/distributed_iterator/consumer/reduce.rs index c0359b8f..fb06f9e6 100644 --- a/src/array/iterator/distributed_iterator/consumer/reduce.rs +++ b/src/array/iterator/distributed_iterator/consumer/reduce.rs @@ -10,6 +10,7 @@ use crate::lamellar_request::LamellarRequest; use crate::lamellar_task_group::TaskGroupLocalAmHandle; use crate::lamellar_team::LamellarTeamRT; use crate::scheduler::LamellarTask; +use crate::warnings::RuntimeWarning; use crate::Dist; use futures_util::{ready, Future, StreamExt}; @@ -339,6 +340,11 @@ where /// This method will block until the associated Reduce operation completes and returns the result pub fn block(self) -> Option { + RuntimeWarning::BlockingCall( + "DistIterReduceHandle::block", + ".spawn() or .await", + ) + .print(); self.team.clone().block_on(self) } diff --git a/src/array/iterator/distributed_iterator/consumer/sum.rs b/src/array/iterator/distributed_iterator/consumer/sum.rs index 75670599..15df895c 100644 --- a/src/array/iterator/distributed_iterator/consumer/sum.rs +++ b/src/array/iterator/distributed_iterator/consumer/sum.rs @@ -9,6 +9,7 @@ use crate::lamellar_request::LamellarRequest; use crate::lamellar_task_group::TaskGroupLocalAmHandle; use crate::lamellar_team::LamellarTeamRT; use crate::scheduler::LamellarTask; +use crate::warnings::RuntimeWarning; use crate::Dist; use futures_util::{ready, Future}; use pin_project::pin_project; @@ -218,6 +219,11 @@ where /// This method will block until the associated Sum operation completes and returns the result pub fn block(self) -> T { + RuntimeWarning::BlockingCall( + "DistIterSumHandle::block", + ".spawn() or .await", + ) + .print(); self.team.clone().block_on(self) } diff --git a/src/array/iterator/local_iterator/consumer/collect.rs b/src/array/iterator/local_iterator/consumer/collect.rs index 3772b1c5..f5111d8f 100644 --- a/src/array/iterator/local_iterator/consumer/collect.rs +++ b/src/array/iterator/local_iterator/consumer/collect.rs @@ -10,6 +10,7 @@ use crate::lamellar_task_group::TaskGroupLocalAmHandle; use crate::lamellar_team::LamellarTeamRT; use crate::memregion::Dist; use crate::scheduler::LamellarTask; +use crate::warnings::RuntimeWarning; use core::marker::PhantomData; use futures_util::{ready, Future}; @@ -288,6 +289,11 @@ where /// This method will block until the associated Collect operation completes and returns the result pub fn block(self) -> A { + RuntimeWarning::BlockingCall( + "LocalIterCollectHandle::block", + ".spawn() or .await", + ) + .print(); self.team.clone().block_on(self) } diff --git a/src/array/iterator/local_iterator/consumer/count.rs b/src/array/iterator/local_iterator/consumer/count.rs index 1b3c9092..c8be3627 100644 --- a/src/array/iterator/local_iterator/consumer/count.rs +++ b/src/array/iterator/local_iterator/consumer/count.rs @@ -6,6 +6,7 @@ use crate::lamellar_request::LamellarRequest; use crate::lamellar_task_group::TaskGroupLocalAmHandle; use crate::lamellar_team::LamellarTeamRT; use crate::scheduler::LamellarTask; +use 
crate::warnings::RuntimeWarning; use futures_util::{ready, Future}; use pin_project::pin_project; @@ -141,6 +142,11 @@ impl LocalIterCountHandle { /// This method will block until the associated Count operation completes and returns the result pub fn block(self) -> usize { + RuntimeWarning::BlockingCall( + "LocalIterCountHandle::block", + ".spawn() or .await", + ) + .print(); self.team.clone().block_on(self) } diff --git a/src/array/iterator/local_iterator/consumer/for_each.rs b/src/array/iterator/local_iterator/consumer/for_each.rs index f18c2aa5..88e860c1 100644 --- a/src/array/iterator/local_iterator/consumer/for_each.rs +++ b/src/array/iterator/local_iterator/consumer/for_each.rs @@ -7,6 +7,7 @@ use crate::lamellar_request::LamellarRequest; use crate::lamellar_task_group::TaskGroupLocalAmHandle; use crate::lamellar_team::LamellarTeamRT; use crate::scheduler::LamellarTask; +use crate::warnings::RuntimeWarning; use futures_util::{ready, Future}; use pin_project::pin_project; @@ -220,6 +221,11 @@ impl LocalIterForEachHandle { /// This method will block until the associated For Each operation completes and returns the result pub fn block(self) { + RuntimeWarning::BlockingCall( + "LocalIterForEachHandle::block", + ".spawn() or .await", + ) + .print(); self.team.clone().block_on(self); } diff --git a/src/array/iterator/local_iterator/consumer/reduce.rs b/src/array/iterator/local_iterator/consumer/reduce.rs index ff635fa4..34af94f9 100644 --- a/src/array/iterator/local_iterator/consumer/reduce.rs +++ b/src/array/iterator/local_iterator/consumer/reduce.rs @@ -7,6 +7,7 @@ use crate::lamellar_request::LamellarRequest; use crate::lamellar_task_group::TaskGroupLocalAmHandle; use crate::lamellar_team::LamellarTeamRT; use crate::scheduler::LamellarTask; +use crate::warnings::RuntimeWarning; use futures_util::{ready, Future}; use pin_project::pin_project; @@ -168,6 +169,11 @@ where /// This method will block until the associated Reduce operation completes and returns the result pub fn block(self) -> Option { + RuntimeWarning::BlockingCall( + "LocalIterReduceHandle::block", + ".spawn() or .await", + ) + .print(); self.team.clone().block_on(self) } diff --git a/src/array/iterator/local_iterator/consumer/sum.rs b/src/array/iterator/local_iterator/consumer/sum.rs index 366d76f9..40f2906a 100644 --- a/src/array/iterator/local_iterator/consumer/sum.rs +++ b/src/array/iterator/local_iterator/consumer/sum.rs @@ -7,6 +7,7 @@ use crate::lamellar_request::LamellarRequest; use crate::lamellar_task_group::TaskGroupLocalAmHandle; use crate::lamellar_team::LamellarTeamRT; use crate::scheduler::LamellarTask; +use crate::warnings::RuntimeWarning; use futures_util::{ready, Future}; use pin_project::pin_project; @@ -157,6 +158,11 @@ where /// This method will block until the associated Sumoperation completes and returns the result pub fn block(self) -> T { + RuntimeWarning::BlockingCall( + "LocalIterSumHandle::block", + ".spawn() or .await", + ) + .print(); self.team.clone().block_on(self) } /// This method will spawn the associated Sum Operation on the work queue, diff --git a/src/array/iterator/one_sided_iterator.rs b/src/array/iterator/one_sided_iterator.rs index 5b67edd3..daacc4bc 100644 --- a/src/array/iterator/one_sided_iterator.rs +++ b/src/array/iterator/one_sided_iterator.rs @@ -29,6 +29,7 @@ use crate::array::{ArrayRdmaHandle, LamellarArray, LamellarArrayInternalGet}; use crate::lamellar_request::LamellarRequest; use crate::memregion::{Dist, OneSidedMemoryRegion, RegisteredMemoryRegion, SubRegion}; +use 
crate::warnings::RuntimeWarning; use crate::LamellarTeamRT; // use async_trait::async_trait; @@ -263,13 +264,8 @@ pub trait OneSidedIterator: private::OneSidedIteratorInner { where Self: Sized + Send, { - if std::thread::current().id() != *crate::MAIN_THREAD { - println!( - "[LAMELLAR WARNING] Trying to convert a lamellar one sided iterator into a standard iterator within a worker thread {:?} self may result in deadlock. - Please use into_stream() instead", - std::backtrace::Backtrace::capture() - ) - } + RuntimeWarning::BlockingCall("into_iter", "into_stream()").print(); + // println!("Into Iter"); self.init(); OneSidedIteratorIter { iter: self } diff --git a/src/array/local_lock_atomic.rs b/src/array/local_lock_atomic.rs index 43d0a4bf..3efdb517 100644 --- a/src/array/local_lock_atomic.rs +++ b/src/array/local_lock_atomic.rs @@ -982,7 +982,7 @@ impl LocalLockArrayReduceHandle { self.lock_guard.array.clone().spawn(self) } /// This method will block the caller until the associated Array Reduce Operation completes pub fn block(self) -> Option { + RuntimeWarning::BlockingCall("LocalLockArrayReduceHandle::block", ".spawn() or .await").print(); self.lock_guard.array.clone().block_on(self) } diff --git a/src/array/local_lock_atomic/handle.rs b/src/array/local_lock_atomic/handle.rs index 2c42dc5e..e593b228 100644 --- a/src/array/local_lock_atomic/handle.rs +++ b/src/array/local_lock_atomic/handle.rs @@ -2,8 +2,9 @@ use std::pin::Pin; use std::sync::Arc; use std::task::{Context, Poll}; -use crate::config; use crate::darc::handle::{LocalRwDarcReadHandle, LocalRwDarcWriteHandle}; +use crate::scheduler::LamellarTask; +use crate::warnings::RuntimeWarning; use crate::Dist; use crate::LocalLockArray; @@ -15,7 +16,7 @@ use super::{ LocalLockReadGuard, LocalLockWriteGuard, }; -#[must_use] +#[must_use = "LocalLockArray lock handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] #[pin_project] /// Handle used to retrieve the aquired read lock of a LocalLockArray /// @@ -65,18 +66,33 @@ impl LocalLockReadHandle { /// let guard = handle.block(); ///``` pub fn block(self) -> LocalLockReadGuard { - if std::thread::current().id() != *crate::MAIN_THREAD { - let msg = format!(" - [LAMELLAR WARNING] You are calling `LocalLockReadHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() - ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), - } - } + RuntimeWarning::BlockingCall( + "LocalLockReadHandle::block", + ".spawn() or .await", + ) + .print(); self.array.lock.darc.team().scheduler.block_on(self) } + + /// This method will spawn the associated active message to capture the lock on the work queue, + /// initiating the operation. 
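
The hunks above and below all follow the same pattern: the hand-rolled `std::thread::current().id() != *crate::MAIN_THREAD` checks and `config().blocking_call_warning` matches are collapsed into a single `RuntimeWarning::...(...).print()` call. The new `src/warnings.rs` module is not included in this part of the patch, so the following is only a rough sketch of what such a helper could look like; the variant names are taken from the visible call sites (`BlockingCall`, `BarrierTimeout`, `UnspanedTask`), while the field types and the environment-variable handling are assumptions.

```rust
// Hypothetical sketch of a centralized runtime-warning helper, modeled only on
// the call sites visible in this patch. The real implementation presumably also
// checks whether it is running on a worker thread (for BlockingCall) and
// compares against LAMELLAR_DEADLOCK_WARNING_TIMEOUT (for BarrierTimeout).
use std::env;

pub enum RuntimeWarning {
    /// (name of the blocking method, suggested alternative)
    BlockingCall(&'static str, &'static str),
    /// seconds elapsed while waiting in a barrier
    BarrierTimeout(f64),
    /// description of the call that found not-yet-spawned tasks
    /// (variant name spelled as it appears at the call sites)
    UnspanedTask(&'static str),
}

impl RuntimeWarning {
    // Warnings default to on; setting the variable to 0/false/no/off disables them.
    fn enabled(var: &str) -> bool {
        !matches!(
            env::var(var).unwrap_or_default().to_lowercase().as_str(),
            "0" | "false" | "no" | "off"
        )
    }

    pub fn print(self) {
        match self {
            RuntimeWarning::BlockingCall(call, alt) => {
                if Self::enabled("LAMELLAR_BLOCKING_CALL_WARNING") {
                    println!(
                        "[LAMELLAR WARNING] `{call}` called from within an async context, \
                         which may lead to deadlock; consider using `{alt}` instead."
                    );
                }
            }
            RuntimeWarning::BarrierTimeout(secs) => {
                println!(
                    "[LAMELLAR WARNING] potential deadlock: a barrier has been waiting for {secs} seconds"
                );
            }
            RuntimeWarning::UnspanedTask(msg) => {
                if Self::enabled("LAMELLAR_UNSPAWNED_TASK_WARNING") {
                    println!("[LAMELLAR WARNING] calling {msg}");
                }
            }
        }
    }
}
```
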
+ /// + /// This function returns a handle that can be used to wait for the operation to complete + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// let handle = array.read_lock(); + /// let task = handle.spawn(); // initiate getting the read lock + /// // do other work + /// let guard = task.block(); + ///``` + #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] + pub fn spawn(mut self) -> LamellarTask> { + self.array.lock.darc.team().spawn(self) + } } impl Future for LocalLockReadHandle { @@ -93,7 +109,7 @@ impl Future for LocalLockReadHandle { } } -#[must_use] +#[must_use = "LocalLockArray lock handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] #[pin_project] /// Handle used to retrieve the aquired local data [LocalLockLocalData] of a LocalLockArray /// @@ -141,18 +157,35 @@ impl LocalLockLocalDataHandle { /// println!("local data: {:?}",local_data); ///``` pub fn block(self) -> LocalLockLocalData { - if std::thread::current().id() != *crate::MAIN_THREAD { - let msg = format!(" - [LAMELLAR WARNING] You are calling `LocalLockLocalDataHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() - ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), - } - } + RuntimeWarning::BlockingCall( + "LocalLockLocalDataHandle::block", + ".spawn() or.await", + ) + .print(); self.array.lock.darc.team().scheduler.block_on(self) } + + /// This method will spawn the associated active message to capture the lock and data on the work queue, + /// initiating the operation. + /// + /// This function returns a handle that can be used to wait for the operation to complete + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// let handle = array.read_local_data(); + /// let task = handle.spawn(); // initiate getting the read lock + /// // do other work + /// let local_data = task.block(); + /// println!("local data: {:?}",local_data); + ///``` + #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. 
Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] + pub fn spawn(mut self) -> LamellarTask> { + self.array.lock.darc.team().spawn(self) + } } impl Future for LocalLockLocalDataHandle { @@ -172,7 +205,7 @@ impl Future for LocalLockLocalDataHandle { } } -#[must_use] +#[must_use = "LocalLockArray lock handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] #[pin_project] /// Handle used to retrieve the aquired write lock of a LocalLockArray /// @@ -221,19 +254,34 @@ impl LocalLockWriteHandle { /// handle.block(); ///``` pub fn block(self) -> LocalLockWriteGuard { - if std::thread::current().id() != *crate::MAIN_THREAD { - let msg = format!(" - [LAMELLAR WARNING] You are calling `LocalLockWriteHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() - ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), - } - } + RuntimeWarning::BlockingCall( + "LocalLockWriteHandle::block", + ".spawn() or.await", + ) + .print(); self.array.lock.darc.team().scheduler.block_on(self) } + + /// This method will spawn the associated active message to capture the lock on the work queue, + /// initiating the operation. + /// + /// This function returns a handle that can be used to wait for the operation to complete + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// let handle = array.write_lock(); + /// let task = handle.spawn(); // initiate getting the write lock + /// //do other work + /// task.block(); + ///``` + #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] + pub fn spawn(mut self) -> LamellarTask> { + self.array.lock.darc.team().spawn(self) + } } impl Future for LocalLockWriteHandle { @@ -250,7 +298,7 @@ impl Future for LocalLockWriteHandle { } } -#[must_use] +#[must_use = "LocalLockArray lock handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] #[pin_project] /// Handle used to retrieve the aquired mutable local data [LocalLockMutLocalData] of a LocalLockArray /// @@ -298,19 +346,36 @@ impl LocalLockMutLocalDataHandle { /// local_data.iter_mut().for_each(|elem| *elem += my_pe); ///``` pub fn block(self) -> LocalLockMutLocalData { - if std::thread::current().id() != *crate::MAIN_THREAD { - let msg = format!(" - [LAMELLAR WARNING] You are calling `LocalLockLocalDataHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! 
- Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() - ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), - } - } + RuntimeWarning::BlockingCall( + "LocalLockMutLocalDataHandle::block", + ".spawn() or.await", + ) + .print(); self.array.lock.darc.team().scheduler.block_on(self) } + + /// This method will spawn the associated active message to capture the lock and data on the work queue, + /// initiating the operation. + /// + /// This function returns a handle that can be used to wait for the operation to complete + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// let handle = array.write_local_data(); + /// let task = handle.spawn(); // initiate getting the write lock + /// //do other work + /// let mut local_data = task.block(); + /// local_data.iter_mut().for_each(|elem| *elem += my_pe); + ///``` + #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] + pub fn spawn(mut self) -> LamellarTask> { + self.array.lock.darc.team().spawn(self) + } } impl Future for LocalLockMutLocalDataHandle { @@ -330,7 +395,7 @@ impl Future for LocalLockMutLocalDataHandle { } } -#[must_use] +#[must_use = "LocalLockArray lock handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] #[pin_project] /// Constructs a handle for immutably iterating over fixed sized chunks(slices) of the local data of this array. /// This handle must be either await'd in an async context or block'd in an non-async context. @@ -383,19 +448,37 @@ impl LocalLockLocalChunksHandle { /// }).block(); ///``` pub fn block(self) -> LocalLockLocalChunks { - if std::thread::current().id() != *crate::MAIN_THREAD { - let msg = format!(" - [LAMELLAR WARNING] You are calling `LocalLockLocalChunksHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() - ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), - } - } + RuntimeWarning::BlockingCall( + "LocalLockLocalChunksHandle::block", + ".spawn() or.await", + ) + .print(); self.array.lock.darc.team().scheduler.block_on(self) } + + /// This method will spawn the associated active message to capture the lock and data on the work queue, + /// initiating the operation. 
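
Taken together, `block()`, the new `spawn()`, and the `Future` implementation give three ways to drive these LocalLockArray lock handles. The following usage sketch mirrors the doc examples in this file; the array size, distribution, and the "other work" are placeholders.

```rust
use lamellar::array::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();
    let my_pe = world.my_pe();
    let array: LocalLockArray<usize> = LocalLockArray::new(&world, 100, Distribution::Block);

    // 1. spawn(): enqueue the lock acquisition and overlap it with other work.
    let write_task = array.write_local_data().spawn();
    let increment = my_pe + 1; // stand-in for unrelated local work
    let mut local_data = write_task.block(); // wait for the write lock + local data
    local_data.iter_mut().for_each(|elem| *elem += increment);
    drop(local_data); // release the lock

    // 2. block(): acquire immediately from a non-async (main thread) context.
    let local_data = array.read_local_data().block();
    println!("PE{my_pe} local data: {:?}", local_data);
    drop(local_data);

    // 3. .await: acquire inside an async context (no blocking-call warning).
    world.clone().block_on(async move {
        let local_data = array.read_local_data().await;
        println!("PE{my_pe} async local data: {:?}", local_data);
    });
}
```
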
+ /// + /// This function returns a handle that can be used to wait for the operation to complete + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array: LocalLockArray = LocalLockArray::new(&world,40,Distribution::Block); + /// let my_pe = world.my_pe(); + /// //block in a non-async context + /// let iter_task = array.read_local_chunks(5).block().enumerate().for_each(move|(i,chunk)| { + /// println!("PE: {my_pe} i: {i} chunk: {chunk:?}"); + /// }).spawn();//initiate the iteration + /// // do other work + /// iter_task.block(); + ///``` + #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] + pub fn spawn(mut self) -> LamellarTask> { + self.array.lock.darc.team().spawn(self) + } } impl Future for LocalLockLocalChunksHandle { @@ -415,7 +498,7 @@ impl Future for LocalLockLocalChunksHandle { } } -#[must_use] +#[must_use = "LocalLockArray lock handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] #[pin_project] /// A handle for mutably iterating over fixed sized chunks(slices) of the local data of this array. /// This handle must be either await'd in an async context or block'd in an non-async context. @@ -470,19 +553,39 @@ impl LocalLockLocalChunksMutHandle { /// }).block(); ///``` pub fn block(self) -> LocalLockLocalChunksMut { - if std::thread::current().id() != *crate::MAIN_THREAD { - let msg = format!(" - [LAMELLAR WARNING] You are calling `LocalLockLocalChunksMutHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() - ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), - } - } + RuntimeWarning::BlockingCall( + "LocalLockLocalChunksMutHandle::block", + ".spawn() or.await", + ) + .print(); self.array.lock.darc.team().scheduler.block_on(self) } + + /// This method will spawn the associated active message to capture the lock and data on the work queue, + /// initiating the operation. + /// + /// This function returns a handle that can be used to wait for the operation to complete + /// # Examples + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array: LocalLockArray = LocalLockArray::new(&world,40,Distribution::Block); + /// let my_pe = world.my_pe(); + /// //block in a non-async context + /// let iter_task = array.write_local_chunks(5).block().enumerate().for_each(move|(i, mut chunk)| { + /// for elem in chunk.iter_mut() { + /// *elem = i; + /// } + /// }).spawn(); // initiate the iteration + /// // do other work + /// iter_task.block(); + ///``` + #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. 
Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] + pub fn spawn(mut self) -> LamellarTask> { + self.array.lock.darc.team().spawn(self) + } } impl Future for LocalLockLocalChunksMutHandle { diff --git a/src/array/local_lock_atomic/iteration.rs b/src/array/local_lock_atomic/iteration.rs index 918e2323..f57826b1 100644 --- a/src/array/local_lock_atomic/iteration.rs +++ b/src/array/local_lock_atomic/iteration.rs @@ -8,11 +8,6 @@ use crate::array::r#unsafe::private::UnsafeArrayInner; use crate::array::*; use crate::darc::local_rw_darc::LocalRwDarcWriteGuard; use crate::memregion::Dist; -// use parking_lot::{ -// lock_api::{ArcRwLockReadGuard, ArcRwLockWriteGuard}, -// RawRwLock, -// }; -use async_lock::{RwLockReadGuardArc, RwLockWriteGuardArc}; impl InnerArray for LocalLockArray { fn as_inner(&self) -> &UnsafeArrayInner { diff --git a/src/array/local_lock_atomic/local_chunks.rs b/src/array/local_lock_atomic/local_chunks.rs index fe7e627c..15830a95 100644 --- a/src/array/local_lock_atomic/local_chunks.rs +++ b/src/array/local_lock_atomic/local_chunks.rs @@ -2,7 +2,6 @@ use crate::array::iterator::local_iterator::{IndexedLocalIterator, LocalIterator use crate::array::iterator::private::*; use crate::array::local_lock_atomic::*; use crate::array::LamellarArray; -use crate::config; use crate::darc::local_rw_darc::{LocalRwDarcReadGuard, LocalRwDarcWriteGuard}; use crate::memregion::Dist; diff --git a/src/array/operations/handle.rs b/src/array/operations/handle.rs index 97926f8b..db4a49e0 100644 --- a/src/array/operations/handle.rs +++ b/src/array/operations/handle.rs @@ -1,8 +1,5 @@ use crate::{ - array::{AmDist, LamellarByteArray}, - lamellar_request::LamellarRequest, - scheduler::LamellarTask, - AmHandle, + array::{AmDist, LamellarByteArray}, lamellar_request::LamellarRequest, scheduler::LamellarTask, warnings::RuntimeWarning, AmHandle }; use std::{ @@ -15,6 +12,7 @@ use std::{ use pin_project::pin_project; /// a task handle for a batched array operation that doesnt return any values +#[must_use = "Array operation handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called. Ignoring the resulting value with 'let _ = ...' will cause the operation to NOT BE executed."] pub struct ArrayBatchOpHandle { pub(crate) array: LamellarByteArray, //prevents prematurely performing a local drop pub(crate) reqs: VecDeque<(AmHandle<()>, Vec)>, @@ -33,6 +31,11 @@ impl ArrayBatchOpHandle { } /// This method will block the calling thread until the associated Array Operation completes pub fn block(self) -> () { + RuntimeWarning::BlockingCall( + "ArrayBatchOpHandle::block", + ".spawn() or .await", + ) + .print(); self.array.team().block_on(self) } } @@ -71,6 +74,7 @@ impl Future for ArrayBatchOpHandle { } /// a task handle for a single array operation that returns a value +#[must_use = "Array operation handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called. Ignoring the resulting value with 'let _ = ...' 
will cause the operation to NOT BE executed."] pub struct ArrayFetchOpHandle { pub(crate) array: LamellarByteArray, //prevents prematurely performing a local drop pub(crate) req: AmHandle>, @@ -88,6 +92,11 @@ impl ArrayFetchOpHandle { /// This method will block the calling thread until the associated Array Operation completes pub fn block(self) -> R { + RuntimeWarning::BlockingCall( + "ArrayFetchOpHandle::block", + ".spawn() or .await", + ) + .print(); self.array.team().block_on(self) } } @@ -119,6 +128,7 @@ impl Future for ArrayFetchOpHandle { /// a task handle for a batched array operation that return values #[pin_project] +#[must_use = "Array operation handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called. Ignoring the resulting value with 'let _ = ...' will cause the operation to NOT BE executed."] pub struct ArrayFetchBatchOpHandle { pub(crate) array: LamellarByteArray, //prevents prematurely performing a local drop pub(crate) reqs: VecDeque<(AmHandle>, Vec)>, @@ -137,6 +147,11 @@ impl ArrayFetchBatchOpHandle { /// This method will block the calling thread until the associated Array Operation completes pub fn block(self) -> Vec { + RuntimeWarning::BlockingCall( + "ArrayFetchBatchOpHandle::block", + ".spawn() or .await", + ) + .print(); self.array.team().block_on(self) } } @@ -220,6 +235,7 @@ impl Future for ArrayFetchBatchOpHandle { } /// a task handle for a single array operation that returns a result +#[must_use = "Array operation handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called. Ignoring the resulting value with 'let _ = ...' will cause the operation to NOT BE executed."] pub struct ArrayResultOpHandle { pub(crate) array: LamellarByteArray, //prevents prematurely performing a local drop pub(crate) req: AmHandle>>, @@ -237,6 +253,11 @@ impl ArrayResultOpHandle { /// This method will block the calling thread until the associated Array Operation completes pub fn block(self) -> Result { + RuntimeWarning::BlockingCall( + "ArrayResultOpHandle::block", + ".spawn() or .await", + ) + .print(); self.array.team().block_on(self) } } @@ -268,6 +289,7 @@ impl Future for ArrayResultOpHandle { /// a task handle for a batched array operation that returns results #[pin_project] +#[must_use = "Array operation handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called. Ignoring the resulting value with 'let _ = ...' will cause the operation to NOT BE executed."] pub struct ArrayResultBatchOpHandle { pub(crate) array: LamellarByteArray, //prevents prematurely performing a local drop pub(crate) reqs: VecDeque<(AmHandle>>, Vec)>, @@ -286,6 +308,11 @@ impl ArrayResultBatchOpHandle { /// This method will block the calling thread until the associated Array Operation completes pub fn block(self) -> Vec> { + RuntimeWarning::BlockingCall( + "ArrayResultBatchOpHandle::block", + ".spawn() or .await", + ) + .print(); self.array.team().block_on(self) } } diff --git a/src/array/read_only.rs b/src/array/read_only.rs index 61af4425..56dc8935 100644 --- a/src/array/read_only.rs +++ b/src/array/read_only.rs @@ -458,7 +458,7 @@ impl ReadOnlyArray { /// let array_clone = array.clone(); /// let _ = array.local_iter().for_each(move |_| { /// let index = rand::thread_rng().gen_range(0..array_clone.len()); - /// array_clone.add(index,1); //randomly at one to an element in the array. + /// let _ = array_clone.add(index,1).spawn(); //randomly at one to an element in the array. 
/// }).block(); /// array.wait_all(); /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE @@ -491,7 +491,7 @@ impl ReadOnlyArray { /// let array_clone = array.clone(); /// let _ = array.local_iter().for_each(move |_| { /// let index = rand::thread_rng().gen_range(0..array_clone.len()); - /// array_clone.add(index,1); //randomly at one to an element in the array. + /// let _ = array_clone.add(index,1).spawn(); //randomly at one to an element in the array. /// }).block(); /// array.wait_all(); /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE diff --git a/src/array/unsafe.rs b/src/array/unsafe.rs index 26e2217c..581aacd6 100644 --- a/src/array/unsafe.rs +++ b/src/array/unsafe.rs @@ -18,6 +18,7 @@ use crate::lamellae::AllocationType; use crate::lamellar_team::{IntoLamellarTeam, LamellarTeamRT}; use crate::memregion::{Dist, MemoryRegion}; use crate::scheduler::LamellarTask; +use crate::warnings::RuntimeWarning; use crate::LamellarTaskGroup; use core::marker::PhantomData; @@ -521,6 +522,24 @@ impl UnsafeArray { } pub(crate) async fn await_all(&self) { + if self + .inner + .data + .array_counters + .send_req_cnt + .load(Ordering::SeqCst) + != self + .inner + .data + .array_counters + .launched_req_cnt + .load(Ordering::SeqCst) + { + RuntimeWarning::UnspanedTask( + "`await_all` on an array before all operations, iterators, etc, created by the array have been spawned", + ) + .print(); + } let mut temp_now = Instant::now(); // let mut first = true; while self @@ -832,17 +851,6 @@ impl AsyncTeamFrom<(Vec, Distribution)> for UnsafeArray TeamFrom<(&Vec, Distribution)> for UnsafeArray { fn team_from(input: (&Vec, Distribution), team: &Pin>) -> Self { - if std::thread::current().id() != *crate::MAIN_THREAD { - let msg = format!(" - [LAMELLAR WARNING] You are calling `Array::team_from` from within an async context which may lead to deadlock, this is unintended and likely a Runtime bug. - Please open a github issue at https://github.com/pnnl/lamellar-runtime/issues including a backtrace if possible. - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() - ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), - } - } let (local_vals, distribution) = input; // println!("local_vals len: {:?}", local_vals.len()); team.tasking_barrier(); @@ -1037,6 +1045,24 @@ impl ActiveMessaging for UnsafeArray { .exec_am_local_tg(am, Some(self.team_counters())) } fn wait_all(&self) { + if self + .inner + .data + .array_counters + .send_req_cnt + .load(Ordering::SeqCst) + != self + .inner + .data + .array_counters + .launched_req_cnt + .load(Ordering::SeqCst) + { + RuntimeWarning::UnspanedTask( + "`wait_all` on an array before all operations, iterators, etc, created by the array have been spawned", + ) + .print(); + } let mut temp_now = Instant::now(); // let mut first = true; while self @@ -1371,7 +1397,7 @@ impl UnsafeArray { /// unsafe { // THIS IS NOT SAFE -- we are randomly updating elements, no protections, updates may be lost... DONT DO THIS /// let req = array.local_iter().for_each(move |_| { /// let index = rand::thread_rng().gen_range(0..array_clone.len()); - /// array_clone.add(index,1); //randomly at one to an element in the array. + /// let _ = array_clone.add(index,1).spawn(); //randomly at one to an element in the array. 
/// }); /// } /// array.wait_all(); @@ -1409,7 +1435,7 @@ impl UnsafeArray { /// unsafe { // THIS IS NOT SAFE -- we are randomly updating elements, no protections, updates may be lost... DONT DO THIS /// let req = array.local_iter().for_each(move |_| { /// let index = rand::thread_rng().gen_range(0..array_clone.len()); - /// array_clone.add(index,1); //randomly at one to an element in the array. + /// let _ = array_clone.add(index,1).spawn(); //randomly at one to an element in the array. /// }); /// } /// array.wait_all(); diff --git a/src/array/unsafe/iteration/distributed.rs b/src/array/unsafe/iteration/distributed.rs index 69d4df74..0f8c054b 100644 --- a/src/array/unsafe/iteration/distributed.rs +++ b/src/array/unsafe/iteration/distributed.rs @@ -11,7 +11,6 @@ use core::marker::PhantomData; use futures_util::Future; use paste::paste; use std::pin::Pin; -use std::sync::atomic::Ordering; use std::sync::Arc; impl InnerArray for UnsafeArray { @@ -50,9 +49,9 @@ macro_rules! consumer_impl { { let am = $($am)*; // set req counters so that wait all works - self.data.team.team_counters.add_send_req(1); - self.data.team.world_counters.add_send_req(1); - self.data.task_group.counters.add_send_req(1); + self.data.team.team_counters.inc_send_req(1); + self.data.team.world_counters.inc_send_req(1); + self.data.task_group.counters.inc_send_req(1); // self.data.team.scheduler.print_status(); let barrier = self.barrier_handle(); @@ -70,9 +69,14 @@ macro_rules! consumer_impl { Schedule::WorkStealing => inner.sched_work_stealing(am), }; // remove req counters after individual ams have been launched. - inner.data.team.team_counters.outstanding_reqs.fetch_sub(1,Ordering::SeqCst); - inner.data.team.world_counters.outstanding_reqs.fetch_sub(1,Ordering::SeqCst); - inner.data.task_group.counters.outstanding_reqs.fetch_sub(1,Ordering::SeqCst); + // inner.data.team.team_counters.outstanding_reqs.fetch_sub(1,Ordering::SeqCst); + // inner.data.team.world_counters.outstanding_reqs.fetch_sub(1,Ordering::SeqCst); + // inner.data.task_group.counters.outstanding_reqs.fetch_sub(1,Ordering::SeqCst); + + // increment launch counters to match req countersk + inner.data.team.team_counters.inc_launched(1); + inner.data.team.world_counters.inc_launched(1); + inner.data.task_group.counters.inc_launched(1); // println!("barrier id {:?} done with dist iter sched {:?} {:?} {:?}",barrier_id,inner.data.team.team_counters.outstanding_reqs.load(Ordering::SeqCst), inner.data.team.world_counters.outstanding_reqs.load(Ordering::SeqCst), inner.data.task_group.counters.outstanding_reqs.load(Ordering::SeqCst)); reqs }); diff --git a/src/array/unsafe/operations.rs b/src/array/unsafe/operations.rs index 7a67766f..4e41cf6e 100644 --- a/src/array/unsafe/operations.rs +++ b/src/array/unsafe/operations.rs @@ -502,8 +502,8 @@ impl UnsafeArray { let futures2 = futures.clone(); let byte_array2 = byte_array.clone(); let len = index.len(); - self.inner.data.array_counters.add_send_req(1); - self.inner.data.team.inc_counters(1); + self.inner.data.array_counters.inc_outstanding(1); + self.inner.data.team.inc_outstanding(1); let index_vec = index.to_vec(); let the_array: UnsafeArray = self.clone(); self.inner @@ -573,13 +573,8 @@ impl UnsafeArray { } futures2.lock().extend(reqs); cnt2.fetch_add(1, Ordering::SeqCst); - the_array - .inner - .data - .array_counters - .outstanding_reqs - .fetch_sub(1, Ordering::SeqCst); - the_array.inner.data.team.dec_counters(1); + the_array.inner.data.array_counters.dec_outstanding(1); + 
the_array.inner.data.team.dec_outstanding(1); }); start_i += len; } @@ -623,8 +618,8 @@ impl UnsafeArray { let futures2 = futures.clone(); let byte_array2 = byte_array.clone(); let len = val.len(); - self.inner.data.array_counters.add_send_req(1); - self.inner.data.team.inc_counters(1); + self.inner.data.array_counters.inc_outstanding(1); + self.inner.data.team.inc_outstanding(1); let the_array: UnsafeArray = self.clone(); let val_chunks = val.into_vec_chunks(num_per_batch); scheduler.submit_immediate_task(async move { @@ -651,13 +646,8 @@ impl UnsafeArray { }); futures2.lock().extend(reqs); cnt2.fetch_add(1, Ordering::SeqCst); - the_array - .inner - .data - .array_counters - .outstanding_reqs - .fetch_sub(1, Ordering::SeqCst); - the_array.inner.data.team.dec_counters(1); + the_array.inner.data.array_counters.dec_outstanding(1); + the_array.inner.data.team.dec_outstanding(1); }); start_i += len; } @@ -703,8 +693,8 @@ impl UnsafeArray { let futures2 = futures.clone(); let byte_array2 = byte_array.clone(); let len = index.len(); - self.inner.data.array_counters.add_send_req(1); - self.inner.data.team.inc_counters(1); + self.inner.data.array_counters.inc_outstanding(1); + self.inner.data.team.inc_outstanding(1); let index_vec = index.to_vec(); let vals_vec = val.to_vec(); let the_array: UnsafeArray = self.clone(); @@ -807,13 +797,8 @@ impl UnsafeArray { } futures2.lock().extend(reqs); cnt2.fetch_add(1, Ordering::SeqCst); - the_array - .inner - .data - .array_counters - .outstanding_reqs - .fetch_sub(1, Ordering::SeqCst); - the_array.inner.data.team.dec_counters(1); + the_array.inner.data.array_counters.dec_outstanding(1); + the_array.inner.data.team.dec_outstanding(1); }); start_i += len; } diff --git a/src/barrier.rs b/src/barrier.rs index 1b88b2f6..6aa157fe 100644 --- a/src/barrier.rs +++ b/src/barrier.rs @@ -4,6 +4,7 @@ use crate::lamellar_arch::LamellarArchRT; use crate::lamellar_request::LamellarRequest; use crate::memregion::MemoryRegion; use crate::scheduler::Scheduler; +use crate::warnings::RuntimeWarning; use futures_util::Future; use pin_project::pin_project; @@ -132,38 +133,28 @@ impl Barrier { recv_pe: usize, send_buf_slice: &[usize], ) { - if s.elapsed().as_secs_f64() > config().deadlock_timeout { - println!("[LAMELLAR WARNING][{:?}] Potential deadlock detected.\n\ - Barrier is a collective operation requiring all PEs associated with the distributed object to enter the barrier call.\n\ - Please refer to https://docs.rs/lamellar/latest/lamellar/index.html?search=barrier for more information\n\ - Note that barriers are often called internally for many collective operations, including constructing new LamellarTeams, LamellarArrays, and Darcs, as well as distributed iteration\n\ - You may be seeing this message if you have called barrier within an async context (meaning it was executed on a worker thread).\n\ - A full list of collective operations is found at https://docs.rs/lamellar/latest/lamellar/index.html?search=collective\n\ - The deadlock timeout can be set via the LAMELLAR_DEADLOCK_WARNING_TIMEOUT environment variable, the current timeout is {} seconds\n\ - To view backtrace set RUST_LIB_BACKTRACE=1\n\ - {}", - std::thread::current().id() - ,config().deadlock_timeout,std::backtrace::Backtrace::capture()); + RuntimeWarning::BarrierTimeout(s.elapsed().as_secs_f64()).print(); - println!( - "[{:?}][{:?}, {:?}] round: {:?} i: {:?} teamsend_pe: {:?} team_recv_pe: {:?} recv_pe: {:?} id: {:?} buf {:?}", - std::thread::current().id(), - self.my_pe, - my_index, - round, - i, - 
(my_index + i * (self.n + 1).pow(round as u32)) - % self.num_pes, - team_recv_pe, - recv_pe, - send_buf_slice, - unsafe { - self.barrier_buf[i - 1] - .as_mut_slice() - .expect("Data should exist on PE") - } - ); - self.print_bar(); + if s.elapsed().as_secs_f64() > config().deadlock_timeout { + // println!( + // "[{:?}][{:?}, {:?}] round: {:?} i: {:?} teamsend_pe: {:?} team_recv_pe: {:?} recv_pe: {:?} id: {:?} buf {:?}", + // std::thread::current().id(), + // self.my_pe, + // my_index, + // round, + // i, + // (my_index + i * (self.n + 1).pow(round as u32)) + // % self.num_pes, + // team_recv_pe, + // recv_pe, + // send_buf_slice, + // unsafe { + // self.barrier_buf[i - 1] + // .as_mut_slice() + // .expect("Data should exist on PE") + // } + // ); + // self.print_bar(); *s = Instant::now(); } } @@ -290,15 +281,7 @@ impl Barrier { self.scheduler.exec_task(); }); } else { - if let Some(val) = config().blocking_call_warning { - // std::env::var("LAMELLAR_BLOCKING_CALL_WARNING") { - // if val != "0" && val != "false" && val != "no" && val != "off" { - if val { - println!("[LAMELLAR WARNING] You are calling barrier from within an async context, this is experimental and may result in deadlock! Using 'async_barrier().await;' is likely a better choice. Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning"); - } - } else { - println!("[LAMELLAR WARNING] You are calling barrier from within an async context), this is experimental and may result in deadlock! Using 'async_barrier().await;' is likely a better choice. Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning"); - } + RuntimeWarning::BlockingCall("barrier", "async_barrier().await").print(); self.tasking_barrier() } } diff --git a/src/darc.rs b/src/darc.rs index 0bcd1d60..4d58026d 100644 --- a/src/darc.rs +++ b/src/darc.rs @@ -44,7 +44,6 @@ //! assert_eq!(darc_counter.load(Ordering::SeqCst),num_pes+my_pe); //NOTE: the value of darc_counter will be different on each PE //! 
} ///``` -use async_lock::RwLock; use core::marker::PhantomData; use futures_util::future::join_all; use serde::{Deserialize, Deserializer}; @@ -75,7 +74,6 @@ pub(crate) mod local_rw_darc; pub use local_rw_darc::LocalRwDarc; pub(crate) mod global_rw_darc; -use global_rw_darc::DistRwLock; pub use global_rw_darc::GlobalRwDarc; use self::handle::{IntoGlobalRwDarcHandle, IntoLocalRwDarcHandle}; @@ -1399,7 +1397,6 @@ impl Darc { /// let five_as_localdarc = world.block_on(async move {five.into_localrw().await}); /// ``` pub fn into_localrw(self) -> IntoLocalRwDarcHandle { - let wrapped_inner = WrappedInner { inner: NonNull::new(self.inner as *mut DarcInner).expect("invalid darc pointer"), }; diff --git a/src/darc/global_rw_darc.rs b/src/darc/global_rw_darc.rs index fb4d3d38..01346a73 100644 --- a/src/darc/global_rw_darc.rs +++ b/src/darc/global_rw_darc.rs @@ -1,4 +1,3 @@ -use async_lock::RwLock; use core::marker::PhantomData; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use std::fmt; @@ -8,11 +7,8 @@ use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; use crate::active_messaging::RemotePtr; -use crate::config; -use crate::darc::local_rw_darc::LocalRwDarc; use crate::darc::{Darc, DarcInner, DarcMode, WrappedInner, __NetworkDarc}; use crate::lamellae::LamellaeRDMA; -use crate::lamellar_request::LamellarRequest; use crate::lamellar_team::{IntoLamellarTeam, LamellarTeamRT}; use crate::{IdError, LamellarEnv, LamellarTeam}; diff --git a/src/darc/handle.rs b/src/darc/handle.rs index 983c2e15..129ee169 100644 --- a/src/darc/handle.rs +++ b/src/darc/handle.rs @@ -6,8 +6,10 @@ use std::task::{Context, Poll}; use crate::darc::local_rw_darc::{LocalRwDarc, LocalRwDarcReadGuard}; use crate::lamellar_request::LamellarRequest; -use crate::{config, darc, GlobalRwDarc, LamellarTeamRT}; +use crate::scheduler::LamellarTask; +use crate::warnings::RuntimeWarning; use crate::{AmHandle, Darc}; +use crate::{GlobalRwDarc, LamellarTeamRT}; use async_lock::{RwLock, RwLockReadGuardArc, RwLockWriteGuardArc}; use futures_util::{ready, Future}; @@ -26,7 +28,7 @@ enum State { TryingWrite(#[pin] Pin> + Send + 'static>>), } -#[must_use] +#[must_use = "LocalRwDarc lock handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] #[pin_project] /// Handle used to retrieve the aquired read lock from a LocalRwDarc /// @@ -98,16 +100,11 @@ impl LocalRwDarcReadHandle { /// ///``` pub fn block(self) -> LocalRwDarcReadGuard { - if std::thread::current().id() != *crate::MAIN_THREAD { - let msg = format!(" - [LAMELLAR WARNING] You are calling `LocalRwDarcReadHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() - ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), - } - } + RuntimeWarning::BlockingCall( + "LocalRwDarcReadHandle::block", + ".spawn() or.await", + ) + .print(); let inner_darc = self.darc.darc.clone(); @@ -122,6 +119,30 @@ impl LocalRwDarcReadHandle { lock: guard, } } + + /// This method will spawn the associated active message to capture the lock on the work queue, + /// initiating the operation. 
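
The warning these `block()` implementations now emit targets one specific situation: calling `block()` from a task already running on a Lamellar worker thread, where parking that thread can starve the executor and deadlock. A short sketch of the safe patterns, using the same public API as the doc examples (the counter values are illustrative):

```rust
use lamellar::darc::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();
    let my_pe = world.my_pe();
    let counter = LocalRwDarc::new(&world, 0usize).expect("PE in world team");

    // Fine: block() from the main thread, outside of any async task.
    let guard = counter.read().block();
    println!("PE{my_pe} counter = {}", *guard);
    drop(guard);

    // Inside an async context, prefer `.await`: no RuntimeWarning, and no risk of
    // parking a worker thread that progress on the lock may depend on.
    let counter2 = counter.clone();
    world.clone().block_on(async move {
        let mut guard = counter2.write().await;
        *guard += my_pe;
        // Calling counter2.read().block() here instead would trigger
        // RuntimeWarning::BlockingCall("LocalRwDarcReadHandle::block", ...).
    });
}
```
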
+ /// + /// This function returns a handle that can be used to wait for the operation to complete + /// # Examples + /// + ///``` + /// use lamellar::darc::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// let counter = LocalRwDarc::new(&world, 0).unwrap(); + /// let handle = counter.read(); + /// let task = handle.spawn(); //initiate the operation + /// // do other work + /// let guard = task.block(); //block until we get the read lock + /// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); + /// + ///``` + #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] + pub fn spawn(self) -> LamellarTask> { + self.darc.darc.team().spawn(self) + } } impl Future for LocalRwDarcReadHandle { @@ -148,7 +169,7 @@ impl Future for LocalRwDarcReadHandle { } } -#[must_use] +#[must_use = "LocalRwDarc lock handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] #[pin_project] /// Handle used to retrieve the aquired write lock from a LocalRwDarc /// @@ -218,16 +239,11 @@ impl LocalRwDarcWriteHandle { /// *guard += my_pe; ///``` pub fn block(self) -> LocalRwDarcWriteGuard { - if std::thread::current().id() != *crate::MAIN_THREAD { - let msg = format!(" - [LAMELLAR WARNING] You are calling `LocalRwDarcWriteHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() - ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), - } - } + RuntimeWarning::BlockingCall( + "LocalRwDarcWriteHandle::block", + ".spawn() or.await", + ) + .print(); let inner_darc = self.darc.darc.clone(); @@ -242,6 +258,29 @@ impl LocalRwDarcWriteHandle { lock: guard, } } + + /// This method will spawn the associated active message to capture the lock on the work queue, + /// initiating the operation. + /// + /// This function returns a handle that can be used to wait for the operation to complete + /// # Examples + /// + ///``` + /// use lamellar::darc::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// let counter = LocalRwDarc::new(&world, 0).unwrap(); + /// let handle = counter.write(); + /// let task = handle.spawn(); //initiate the operation + /// // do other work + /// let mut guard = task.block(); //block until we get the write lock + /// *guard += my_pe; + ///``` + #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. 
Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] + pub fn spawn(self) -> LamellarTask> { + self.darc.darc.team().spawn(self) + } } impl Future for LocalRwDarcWriteHandle { @@ -268,7 +307,7 @@ impl Future for LocalRwDarcWriteHandle { } } -#[must_use] +#[must_use = "GlobalRwDarc lock handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] #[pin_project] /// Handle used to retrieve the aquired read lock from a GlobalRwDarc /// @@ -333,16 +372,11 @@ impl GlobalRwDarcReadHandle { /// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); ///``` pub fn block(self) -> GlobalRwDarcReadGuard { - if std::thread::current().id() != *crate::MAIN_THREAD { - let msg = format!(" - [LAMELLAR WARNING] You are calling `GlobalRwDarcReadHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() - ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), - } - } + RuntimeWarning::BlockingCall( + "GlobalRwDarcReadHandle::block", + ".spawn() or.await", + ) + .print(); let _ = self.lock_am.blocking_wait(); GlobalRwDarcReadGuard { @@ -351,6 +385,29 @@ impl GlobalRwDarcReadHandle { local_cnt: Arc::new(AtomicUsize::new(1)), } } + + /// This method will spawn the associated active message to capture the lock on the work queue, + /// initiating the operation. + /// + /// This function returns a handle that can be used to wait for the operation to complete + /// # Examples + /// + ///``` + /// use lamellar::darc::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); + /// let handle = counter.read(); + /// let task = handle.spawn(); //initiate the operation + /// // do other work + /// let guard = task.block(); //block until we get the write lock + /// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); + ///``` + #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] + pub fn spawn(self) -> LamellarTask> { + self.darc.darc.team().spawn(self) + } } impl Future for GlobalRwDarcReadHandle { @@ -366,7 +423,7 @@ impl Future for GlobalRwDarcReadHandle { } } -#[must_use] +#[must_use = "GlobalRwDarc lock handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] #[pin_project] /// Handle used to retrieve the aquired write lock from a GlobalRwDarc /// @@ -431,16 +488,11 @@ impl GlobalRwDarcWriteHandle { /// *guard += my_pe; ///``` pub fn block(self) -> GlobalRwDarcWriteGuard { - if std::thread::current().id() != *crate::MAIN_THREAD { - let msg = format!(" - [LAMELLAR WARNING] You are calling `GlobalRwDarcWriteHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! 
- Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() - ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), - } - } + RuntimeWarning::BlockingCall( + "GlobalRwDarcWriteHandle::block", + ".spawn() or.await", + ) + .print(); let _ = self.lock_am.blocking_wait(); GlobalRwDarcWriteGuard { @@ -448,6 +500,29 @@ impl GlobalRwDarcWriteHandle { marker: PhantomData, } } + + /// This method will spawn the associated active message to capture the lock on the work queue, + /// initiating the operation. + /// + /// This function returns a handle that can be used to wait for the operation to complete + /// # Examples + /// + ///``` + /// use lamellar::darc::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); + /// let handle = counter.write(); + /// let task = handle.spawn(); //initiate the operation + /// // do other work + /// let mut guard = task.block(); //block until we get the write lock + /// *guard += my_pe; + ///``` + #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] + pub fn spawn(self) -> LamellarTask> { + self.darc.darc.team().spawn(self) + } } impl Future for GlobalRwDarcWriteHandle { @@ -462,7 +537,7 @@ impl Future for GlobalRwDarcWriteHandle { } } -#[must_use] +#[must_use = "GlobalRwDarc lock handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] #[pin_project] /// Handle used to retrieve the aquired collective write lock from a GlobalRwDarc /// @@ -508,16 +583,11 @@ impl GlobalRwDarcCollectiveWriteHandle { /// let mut guard = handle.block(); //block until we get the write lock /// *guard += my_pe; pub fn block(self) -> GlobalRwDarcCollectiveWriteGuard { - if std::thread::current().id() != *crate::MAIN_THREAD { - let msg = format!(" - [LAMELLAR WARNING] You are calling `GlobalRwDarcCollectiveWriteHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() - ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), - } - } + RuntimeWarning::BlockingCall( + "GlobalRwDarcCollectiveWriteHandle::block", + ".spawn() or.await", + ) + .print(); let _ = self.lock_am.blocking_wait(); GlobalRwDarcCollectiveWriteGuard { @@ -526,6 +596,28 @@ impl GlobalRwDarcCollectiveWriteHandle { marker: PhantomData, } } + + /// This method will spawn the associated active message to capture the lock on the work queue, + /// initiating the operation. 
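
Because a `GlobalRwDarc` lock is acquired through an active message, the new `spawn()` also gives a natural way to overlap the lock-acquisition round trip with local work before blocking for the guard. A small sketch along the lines of the doc examples above (the local work is a stand-in):

```rust
use lamellar::darc::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();
    let my_pe = world.my_pe();
    let counter = GlobalRwDarc::new(&world, 0usize).unwrap();

    // Launch the active message that requests the global write lock...
    let write_task = counter.write().spawn();

    // ...and overlap the wait with unrelated local work.
    let local_contribution = my_pe + 1; // stand-in for useful computation

    // Block only once the result of that work is actually needed under the lock.
    let mut guard = write_task.block();
    *guard += local_contribution;
    drop(guard); // release the global write lock promptly

    world.barrier();
    let guard = counter.read().block();
    println!("PE{my_pe} sees counter = {}", *guard);
}
```
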
+ /// + /// This function returns a handle that can be used to wait for the operation to complete + /// # Examples + /// + ///``` + /// use lamellar::darc::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let my_pe = world.my_pe(); + /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); + /// let handle = counter.collective_write(); + /// let task = handle.spawn();//initiate the operation + /// // do other work + /// let mut guard = task.block(); //block until we get the write lock + /// *guard += my_pe; + #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] + pub fn spawn(self) -> LamellarTask> { + self.darc.darc.team().spawn(self) + } } impl Future for GlobalRwDarcCollectiveWriteHandle { @@ -609,7 +701,7 @@ impl OrigDarc { } } -#[must_use] +#[must_use = " Darc 'into' handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] #[pin_project] #[doc(alias = "Collective")] /// This is a handle representing the operation of changing from a [LocalRwDarc] or [GlobalRwDarc] into a regular [Darc]. @@ -654,8 +746,31 @@ impl IntoDarcHandle { /// let five = LocalRwDarc::new(&world,5).expect("PE in world team"); /// let five_as_darc = five.into_darc().block(); pub fn block(self) -> Darc { + RuntimeWarning::BlockingCall( + "IntoDarcHandle::block", + ".spawn() or .await", + ) + .print(); self.team.clone().block_on(self) } + + /// This method will spawn the associated active message to capture the lock on the work queue, + /// initiating the operation. + /// + /// This function returns a handle that can be used to wait for the operation to complete + /// # Examples + /// ``` + /// use lamellar::darc::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let five = LocalRwDarc::new(&world,5).expect("PE in world team"); + /// let five_as_darc_task = five.into_darc().spawn(); + /// let five_as_darc = five_as_darc_task.block(); + /// ``` + #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] + pub fn spawn(self) -> LamellarTask> { + self.team.clone().spawn(self) + } } impl Future for IntoDarcHandle { @@ -674,7 +789,7 @@ impl Future for IntoDarcHandle { } } -#[must_use] +#[must_use = " Darc 'into' handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] #[pin_project] #[doc(alias = "Collective")] /// This is a handle representing the operation of changing from a [Darc] or [GlobalRwDarc] into a [LocalRwDarc]. @@ -719,18 +834,32 @@ impl IntoLocalRwDarcHandle { /// let five = GlobalRwDarc::new(&world,5).expect("PE in world team"); /// let five_as_localrw = five.into_localrw().block(); pub fn block(self) -> LocalRwDarc { - if std::thread::current().id() != *crate::MAIN_THREAD { - let msg = format!(" - [LAMELLAR WARNING] You are calling `GlobalRwDarcCollectiveWriteHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! 
- Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() - ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), - } - } + RuntimeWarning::BlockingCall( + "IntoLocalRwDarcHandle::block", + ".spawn() or.await", + ) + .print(); + self.team.clone().block_on(self) } + + /// This method will spawn the associated active message to capture the lock on the work queue, + /// initiating the operation. + /// + /// This function returns a handle that can be used to wait for the operation to complete + /// # Examples + /// ``` + /// use lamellar::darc::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let five = GlobalRwDarc::new(&world,5).expect("PE in world team"); + /// let five_as_localrw_task = five.into_localrw().spawn(); + /// let five_as_localrw = five_as_localrw_task.block(); + /// ``` + #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] + pub fn spawn(self) -> LamellarTask> { + self.team.clone().spawn(self) + } } impl Future for IntoLocalRwDarcHandle { @@ -750,7 +879,7 @@ impl Future for IntoLocalRwDarcHandle { } } -#[must_use] +#[must_use = " Darc 'into' handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] #[pin_project] #[doc(alias = "Collective")] /// This is a handle representing the operation of changing from a [Darc] or [LocalRwDarc] into a [GlobalRwDarc]. @@ -795,18 +924,31 @@ impl IntoGlobalRwDarcHandle { /// let five = LocalRwDarc::new(&world,5).expect("PE in world team"); /// let five_as_globalrw = five.into_globalrw().block(); pub fn block(self) -> GlobalRwDarc { - if std::thread::current().id() != *crate::MAIN_THREAD { - let msg = format!(" - [LAMELLAR WARNING] You are calling `GlobalRwDarcCollectiveWriteHandle::block` from within an async context which may lead to deadlock, it is recommended that you use `.await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture() - ); - match config().blocking_call_warning { - Some(val) if val => println!("{msg}"), - _ => println!("{msg}"), - } - } + RuntimeWarning::BlockingCall( + "IntoGlobalRwDarcHandle::block", + ".spawn() or.await", + ) + .print(); self.team.clone().block_on(self) } + + /// This method will spawn the associated active message to capture the lock on the work queue, + /// initiating the operation. + /// + /// This function returns a handle that can be used to wait for the operation to complete + /// /// # Examples + /// + ///``` + /// use lamellar::darc::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let five = LocalRwDarc::new(&world,5).expect("PE in world team"); + /// let five_as_globalrw_task = five.into_globalrw().spawn(); + /// let five_as_globalrw = five_as_globalrw_task.block(); + #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. 
Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] + pub fn spawn(self) -> LamellarTask> { + self.team.clone().spawn(self) + } } impl Future for IntoGlobalRwDarcHandle { diff --git a/src/darc/local_rw_darc.rs b/src/darc/local_rw_darc.rs index a315d9e4..16326798 100644 --- a/src/darc/local_rw_darc.rs +++ b/src/darc/local_rw_darc.rs @@ -1,30 +1,14 @@ -use async_lock::futures::ReadArc; -// use parking_lot::{ -// lock_api::{ArcRwLockReadGuard, RwLockWriteGuardArc}, -// RawRwLock, RwLock, -// }; use async_lock::{RwLock, RwLockReadGuardArc, RwLockWriteGuardArc}; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use std::fmt; -use std::marker::PhantomData; use std::ptr::NonNull; use std::sync::atomic::Ordering; use std::sync::Arc; -use std::{ - future::Future, - pin::Pin, - task::{Context, Poll, Waker}, -}; - -use pin_project::pin_project; use crate::active_messaging::RemotePtr; -use crate::config; -use crate::darc::global_rw_darc::{DistRwLock, GlobalRwDarc}; use crate::darc::{Darc, DarcInner, DarcMode, WrappedInner, __NetworkDarc}; use crate::lamellae::LamellaeRDMA; use crate::lamellar_team::IntoLamellarTeam; -use crate::scheduler::LamellarTask; use crate::{IdError, LamellarEnv, LamellarTeam}; use super::handle::{ diff --git a/src/env_var.rs b/src/env_var.rs index 5e785336..c5ce2939 100644 --- a/src/env_var.rs +++ b/src/env_var.rs @@ -27,9 +27,11 @@ //! This can be a fairly expensive operation (as the operation is synchronous across all PEs) so the runtime //! will print a message at the end of execution with how many additional pools were allocated. //! - if you find you are dynamically allocating new memory pools, try setting `LAMELLAR_HEAP_SIZE` to a larger value -//! - `LAMELLAR_DEADLOCK_WARNING_TIMEOUT` - the timeout in seconds before a deadlock warning is printed. Defaults to 600. Note this does not cause your application to terminate +//! - `LAMELLAR_DEADLOCK_WARNING_TIMEOUT` - the timeout in seconds before a deadlock warning is printed. Defaults to 600, set to 0 to disable. Note this does not cause your application to terminate //! - `LAMELLAR_AM_GROUP_BATCH_SIZE` - The maximum number of sub messages that will be sent in a single AMGroup Active Message, default: 10000 //! - `LAMELLAR_BLOCKING_CALL_WARNING` - flag used to print warnings when users call barriers on worker threads. Default: true +//! - `LAMELLAR_DROPPED_UNUSED_HANDLE_WARNING` - flag used to print warnings when users drop active message handles without awaiting, spawning, or blocking on them. Default: +//! - `LAMELLAR_UNSPAWNED_TASK_WARNING` - flag used to print warnings when users attempt to call wait_all while there are tasks that have not been spawned. Default: true //! - `LAMELLAR_BARRIER_DISSEMINATION_FACTOR` - (Experimental) The dissemination factor for the n-way barrier, default: 2 //! - `LAMELLAR_BATCH_OP_THREADS` - the number of threads used to initiate batched operations, defaults to 1/4 LAMELLAR_THREADS //! - `LAMELLAR_ARRAY_INDEX_SIZE` - specify static or dynamic array index size @@ -155,6 +157,12 @@ pub struct Config { /// flag used to print warnings when users call barriers on worker threads. Default: true pub blocking_call_warning: Option, + /// flag used to print warnings when users drop active message handles without awaiting, spawning, or blocking on them. Default: true + pub dropped_unused_handle_warning: Option, + + /// flag used to print warnings when users attempt to call wait_all while there are tasks that have not been spawned. 
Default: true + pub unpspawned_task_warning: Option, + /// The lamellae backend to use /// rofi -- multi pe distributed execution, default if rofi feature is turned on /// local -- single pe execution, default if rofi feature is turned off diff --git a/src/lamellar_task_group.rs b/src/lamellar_task_group.rs index 1d6a8e1e..4091bd76 100644 --- a/src/lamellar_task_group.rs +++ b/src/lamellar_task_group.rs @@ -9,6 +9,7 @@ use crate::lamellar_request::*; use crate::lamellar_team::{IntoLamellarTeam, LamellarTeam, LamellarTeamRT}; use crate::memregion::one_sided::MemRegionHandleInner; use crate::scheduler::{LamellarTask, ReqId, Scheduler}; +use crate::warnings::RuntimeWarning; use crate::Darc; // use crossbeam::utils::CachePadded; @@ -32,9 +33,9 @@ pub(crate) struct TaskGroupAmHandleInner { cnt: Arc, data: Mutex>, // wakers: Mutex>, - team_outstanding_reqs: Arc, - world_outstanding_reqs: Arc, - tg_outstanding_reqs: Option>, + team_counters: Arc, + world_counters: Arc, + tg_counters: Option>, pub(crate) scheduler: Arc, // pending_reqs: Arc>>, } @@ -44,6 +45,7 @@ pub(crate) struct TaskGroupAmHandleInner { #[pin_project(PinnedDrop)] pub struct TaskGroupAmHandle { inner: Arc, + am: Option<(Am, usize)>, sub_id: usize, _phantom: std::marker::PhantomData, } @@ -67,13 +69,11 @@ impl LamellarRequestAddResult for TaskGroupAmHandleInner { } } fn update_counters(&self, _sub_id: usize) { - let _team_reqs = self.team_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); - let _world_req = self.world_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); - // println!("tg update counter team {} world {}",_team_reqs-1,_world_req-1); - if let Some(tg_outstanding_reqs) = self.tg_outstanding_reqs.clone() { - tg_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); + self.team_counters.dec_outstanding(1); + self.world_counters.dec_outstanding(1); + if let Some(tg_counters) = self.tg_counters.clone() { + tg_counters.dec_outstanding(1); } - // self.pending_reqs.lock().remove(&sub_id); } } @@ -120,22 +120,45 @@ impl TaskGroupAmHandle { } } + fn launch_am_if_needed(&mut self) { + if let Some((am, num_pes)) = self.am.take() { + self.inner.team_counters.inc_outstanding(num_pes); + self.inner.team_counters.inc_launched(num_pes); + self.inner.world_counters.inc_outstanding(num_pes); + self.inner.world_counters.inc_launched(num_pes); + if let Some(tg_counters) = self.inner.tg_counters.clone() { + tg_counters.inc_outstanding(num_pes); + tg_counters.inc_launched(num_pes); + } + + self.inner.scheduler.submit_am(am); + } + } + /// This method will spawn the associated Active Message on the work queue, /// initiating the remote operation. /// /// This function returns a handle that can be used to wait for the operation to complete #[must_use = "this function returns a future used to poll for completion. 
If ignored/dropped the only way to ensure completion is calling 'wait_all()' on the world or array"] - pub fn spawn(self) -> LamellarTask { + pub fn spawn(mut self) -> LamellarTask { + self.launch_am_if_needed(); self.inner.scheduler.clone().spawn_task(self) } /// This method will block the calling thread until the associated Array Operation completes - pub fn block(self) -> T { + pub fn block(mut self) -> T { + RuntimeWarning::BlockingCall( + "TaskGroupAmHandle::block", + ".spawn() or .await", + ) + .print(); + self.launch_am_if_needed(); self.inner.scheduler.clone().block_on(self) } } impl LamellarRequest for TaskGroupAmHandle { - fn blocking_wait(self) -> Self::Output { + fn blocking_wait(mut self) -> Self::Output { + self.launch_am_if_needed(); let mut res = self.inner.data.lock().remove(&self.sub_id); while res.is_none() { self.inner.scheduler.exec_task(); @@ -145,6 +168,7 @@ impl LamellarRequest for TaskGroupAmHandle { } fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { + self.launch_am_if_needed(); let data = self.inner.data.lock(); if data.contains_key(&self.sub_id) { true @@ -180,6 +204,7 @@ impl LamellarRequest for TaskGroupAmHandle { impl Future for TaskGroupAmHandle { type Output = T; fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + self.launch_am_if_needed(); let mut this = self.as_mut(); if this.ready_or_set_waker(cx.waker()) { Poll::Ready( @@ -203,9 +228,9 @@ pub(crate) struct TaskGroupMultiAmHandleInner { arch: Arc, data: Mutex>>, //> wakers: Mutex>, - team_outstanding_reqs: Arc, - world_outstanding_reqs: Arc, - tg_outstanding_reqs: Option>, + team_counters: Arc, + world_counters: Arc, + tg_counters: Option>, pub(crate) scheduler: Arc, // pending_reqs: Arc>>, } @@ -215,6 +240,7 @@ pub(crate) struct TaskGroupMultiAmHandleInner { #[pin_project(PinnedDrop)] pub struct TaskGroupMultiAmHandle { inner: Arc, + am: Option<(Am, usize)>, sub_id: usize, _phantom: std::marker::PhantomData, } @@ -242,13 +268,11 @@ impl LamellarRequestAddResult for TaskGroupMultiAmHandleInner { } } fn update_counters(&self, _sub_id: usize) { - let _team_reqs = self.team_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); - let _world_req = self.world_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); - // println!("tg update counter team {} world {}",_team_reqs-1,_world_req-1); - if let Some(tg_outstanding_reqs) = self.tg_outstanding_reqs.clone() { - tg_outstanding_reqs.fetch_sub(1, Ordering::SeqCst); + self.team_counters.dec_outstanding(1); + self.world_counters.dec_outstanding(1); + if let Some(tg_counters) = self.tg_counters.clone() { + tg_counters.dec_outstanding(1); } - // self.pending_reqs.lock().remove(&sub_id); } } @@ -295,22 +319,44 @@ impl TaskGroupMultiAmHandle { } } + fn launch_am_if_needed(&mut self) { + if let Some((am, num_pes)) = self.am.take() { + self.inner.team_counters.inc_outstanding(num_pes); + self.inner.team_counters.inc_launched(num_pes); + self.inner.world_counters.inc_outstanding(num_pes); + self.inner.world_counters.inc_launched(num_pes); + if let Some(tg_counters) = self.inner.tg_counters.clone() { + tg_counters.inc_outstanding(num_pes); + tg_counters.inc_launched(num_pes); + } + self.inner.scheduler.submit_am(am); + } + } + /// This method will spawn the associated Active Message on the work queue, /// initiating the remote operation. /// /// This function returns a handle that can be used to wait for the operation to complete #[must_use = "this function returns a future used to poll for completion. 
If ignored/dropped the only way to ensure completion is calling 'wait_all()' on the world or array"] - pub fn spawn(self) -> LamellarTask> { + pub fn spawn(mut self) -> LamellarTask> { + self.launch_am_if_needed(); self.inner.scheduler.clone().spawn_task(self) } /// This method will block the calling thread until the associated Array Operation completes - pub fn block(self) -> Vec { + pub fn block(mut self) -> Vec { + RuntimeWarning::BlockingCall( + "TaskGroupMultiAmHandle::block", + ".spawn() or .await", + ) + .print(); + self.launch_am_if_needed(); self.inner.scheduler.clone().block_on(self) } } impl LamellarRequest for TaskGroupMultiAmHandle { - fn blocking_wait(self) -> Self::Output { + fn blocking_wait(mut self) -> Self::Output { + self.launch_am_if_needed(); while !self.inner.data.lock().contains_key(&self.sub_id) { self.inner.scheduler.exec_task(); } @@ -341,6 +387,7 @@ impl LamellarRequest for TaskGroupMultiAmHandle { } fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { + self.launch_am_if_needed(); let data = self.inner.data.lock(); if let Some(req) = data.get(&self.sub_id) { req.len() == self.inner.arch.num_pes() @@ -382,6 +429,7 @@ impl LamellarRequest for TaskGroupMultiAmHandle { impl Future for TaskGroupMultiAmHandle { type Output = Vec; fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + self.launch_am_if_needed(); let mut this = self.as_mut(); if this.ready_or_set_waker(cx.waker()) { let mut sub_id_map = this @@ -408,6 +456,7 @@ impl Future for TaskGroupMultiAmHandle { #[pin_project(PinnedDrop)] pub struct TaskGroupLocalAmHandle { inner: Arc, + am: Option<(Am, usize)>, sub_id: usize, _phantom: std::marker::PhantomData, } @@ -441,6 +490,20 @@ impl TaskGroupLocalAmHandle { } } } + + fn launch_am_if_needed(&mut self) { + if let Some((am, num_pes)) = self.am.take() { + self.inner.team_counters.inc_outstanding(num_pes); + self.inner.team_counters.inc_launched(num_pes); + self.inner.world_counters.inc_outstanding(num_pes); + self.inner.world_counters.inc_launched(num_pes); + if let Some(tg_counters) = self.inner.tg_counters.clone() { + tg_counters.inc_outstanding(num_pes); + tg_counters.inc_launched(num_pes); + } + self.inner.scheduler.submit_am(am); + } + } } impl TaskGroupLocalAmHandle { @@ -449,17 +512,25 @@ impl TaskGroupLocalAmHandle { /// /// This function returns a handle that can be used to wait for the operation to complete #[must_use = "this function returns a future used to poll for completion. 
If ignored/dropped the only way to ensure completion is calling 'wait_all()' on the world or array"] - pub fn spawn(self) -> LamellarTask { + pub fn spawn(mut self) -> LamellarTask { + self.launch_am_if_needed(); self.inner.scheduler.clone().spawn_task(self) } /// This method will block the calling thread until the associated Array Operation completes - pub fn block(self) -> T { + pub fn block(mut self) -> T { + RuntimeWarning::BlockingCall( + "TaskGroupLocalAmHandle::block", + ".spawn() or .await", + ) + .print(); + self.launch_am_if_needed(); self.inner.scheduler.clone().block_on(self) } } impl LamellarRequest for TaskGroupLocalAmHandle { - fn blocking_wait(self) -> Self::Output { + fn blocking_wait(mut self) -> Self::Output { + self.launch_am_if_needed(); let mut res = self.inner.data.lock().remove(&self.sub_id); while res.is_none() { self.inner.scheduler.exec_task(); @@ -469,6 +540,7 @@ impl LamellarRequest for TaskGroupLocalAmHandle { } fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { + self.launch_am_if_needed(); let data = self.inner.data.lock(); if data.contains_key(&self.sub_id) { // println!("request ready {:?}", self.sub_id); @@ -496,6 +568,7 @@ impl LamellarRequest for TaskGroupLocalAmHandle { impl Future for TaskGroupLocalAmHandle { type Output = T; fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + self.launch_am_if_needed(); let mut this = self.as_mut(); if this.ready_or_set_waker(cx.waker()) { Poll::Ready( @@ -569,7 +642,7 @@ pub struct LamellarTaskGroup { local_id: usize, //for exec_local requests -- is actually the pointer to the rt_local_req (but *const are not sync so we use usize) sub_id_counter: AtomicUsize, cnt: Arc, // handle reference count, so that we don't need to worry about storing results if all handles are dropped - pub(crate) counters: AMCounters, + pub(crate) counters: Arc, //these are cloned and returned to user for each request req: Arc, multi_req: Arc, @@ -671,16 +744,16 @@ impl LamellarTaskGroup { ///``` pub fn new>(team: U) -> LamellarTaskGroup { let team = team.into().team.clone(); - let counters = AMCounters::new(); + let counters = Arc::new(AMCounters::new()); let cnt = Arc::new(AtomicUsize::new(1)); //this lamellarTaskGroup instance represents 1 handle (even though we maintain a single and multi req handle) // let pending_reqs = Arc::new(Mutex::new(HashSet::new())); let req = Arc::new(TaskGroupAmHandleInner { cnt: cnt.clone(), data: Mutex::new(HashMap::new()), wakers: Mutex::new(HashMap::new()), - team_outstanding_reqs: team.team_counters.outstanding_reqs.clone(), - world_outstanding_reqs: team.world_counters.outstanding_reqs.clone(), - tg_outstanding_reqs: Some(counters.outstanding_reqs.clone()), + team_counters: team.team_counters.clone(), + world_counters: team.world_counters.clone(), + tg_counters: Some(counters.clone()), scheduler: team.scheduler.clone(), // pending_reqs: pending_reqs.clone(), }); @@ -690,9 +763,9 @@ impl LamellarTaskGroup { arch: team.arch.clone(), data: Mutex::new(HashMap::new()), wakers: Mutex::new(HashMap::new()), - team_outstanding_reqs: team.team_counters.outstanding_reqs.clone(), - world_outstanding_reqs: team.world_counters.outstanding_reqs.clone(), - tg_outstanding_reqs: Some(counters.outstanding_reqs.clone()), + team_counters: team.team_counters.clone(), + world_counters: team.world_counters.clone(), + tg_counters: Some(counters.clone()), scheduler: team.scheduler.clone(), // pending_reqs: pending_reqs.clone(), }); @@ -701,9 +774,9 @@ impl LamellarTaskGroup { cnt: cnt.clone(), data: 
Mutex::new(HashMap::new()), wakers: Mutex::new(HashMap::new()), - team_outstanding_reqs: team.team_counters.outstanding_reqs.clone(), - world_outstanding_reqs: team.world_counters.outstanding_reqs.clone(), - tg_outstanding_reqs: Some(counters.outstanding_reqs.clone()), + team_counters: team.team_counters.clone(), + world_counters: team.world_counters.clone(), + tg_counters: Some(counters.clone()), scheduler: team.scheduler.clone(), // pending_reqs: pending_reqs.clone(), }); @@ -727,23 +800,22 @@ impl LamellarTaskGroup { } fn wait_all(&self) { - let mut exec_task = true; - if std::thread::current().id() != *crate::MAIN_THREAD { - if let Some(val) = config().blocking_call_warning { - if val { - println!("[LAMELLAR WARNING] You are calling wait_all from within an async context, it is recommended that you use `await_all().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); - } - } else { - println!("[LAMELLAR WARNING] You are calling wait_all from within an async context, it is recommended that you use `await_all().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); - } - exec_task = false; + RuntimeWarning::BlockingCall("wait_all", "await_all().await").print(); + + if self.counters.send_req_cnt.load(Ordering::SeqCst) + != self.counters.launched_req_cnt.load(Ordering::SeqCst) + || self.counters.send_req_cnt.load(Ordering::SeqCst) + != self.counters.launched_req_cnt.load(Ordering::SeqCst) + { + RuntimeWarning::UnspanedTask( + "`wait_all` on an active message group before all tasks/active messages create by the group have been spawned", + ) + .print(); } let mut temp_now = Instant::now(); while self.counters.outstanding_reqs.load(Ordering::SeqCst) > 0 { // self.team.flush(); - if exec_task { + if std::thread::current().id() != *crate::MAIN_THREAD { self.team.scheduler.exec_task(); } if temp_now.elapsed().as_secs_f64() > config().deadlock_timeout { @@ -766,6 +838,16 @@ impl LamellarTaskGroup { } async fn await_all(&self) { + if self.counters.send_req_cnt.load(Ordering::SeqCst) + != self.counters.launched_req_cnt.load(Ordering::SeqCst) + || self.counters.send_req_cnt.load(Ordering::SeqCst) + != self.counters.launched_req_cnt.load(Ordering::SeqCst) + { + RuntimeWarning::UnspanedTask( + "`await_all` on an active message group before all tasks/active messages created by the group have been spawned", + ) + .print(); + } let mut temp_now = Instant::now(); while self.counters.outstanding_reqs.load(Ordering::SeqCst) > 0 { // self.team.flush(); @@ -791,9 +873,9 @@ impl LamellarTaskGroup { F: RemoteActiveMessage + LamellarAM + Serde + AmDist, { // println!("task group exec am all"); - self.team.team_counters.add_send_req(self.team.num_pes); - self.team.world_counters.add_send_req(self.team.num_pes); - self.counters.add_send_req(self.team.num_pes); + self.team.team_counters.inc_send_req(self.team.num_pes); + self.team.world_counters.inc_send_req(self.team.num_pes); + self.counters.inc_send_req(self.team.num_pes); // println!("cnts: t: {} w: {} self: {:?}",self.team.team_counters.outstanding_reqs.load(Ordering::Relaxed),self.team.world_counters.outstanding_reqs.load(Ordering::Relaxed), self.counters.outstanding_reqs.load(Ordering::Relaxed)); self.cnt.fetch_add(1, Ordering::SeqCst); @@ -823,9 +905,10 @@ impl LamellarTaskGroup { team_addr: 
self.team.remote_ptr_addr, }; // println!("[{:?}] task group am all", std::thread::current().id()); - self.team.scheduler.submit_am(Am::All(req_data, func)); + // self.team.scheduler.submit_am(); TaskGroupMultiAmHandle { inner: self.multi_req.clone(), + am: Some((Am::All(req_data, func), self.team.num_pes)), sub_id: req_id.sub_id, _phantom: PhantomData, } @@ -836,9 +919,9 @@ impl LamellarTaskGroup { F: RemoteActiveMessage + LamellarAM + Serde + AmDist, { // println!("task group exec am pe"); - self.team.team_counters.add_send_req(1); - self.team.world_counters.add_send_req(1); - self.counters.add_send_req(1); + self.team.team_counters.inc_send_req(1); + self.team.world_counters.inc_send_req(1); + self.counters.inc_send_req(1); // println!("cnts: t: {} w: {} self: {:?}",self.team.team_counters.outstanding_reqs.load(Ordering::Relaxed),self.team.world_counters.outstanding_reqs.load(Ordering::Relaxed), self.counters.outstanding_reqs.load(Ordering::Relaxed)); self.cnt.fetch_add(1, Ordering::SeqCst); @@ -864,9 +947,10 @@ impl LamellarTaskGroup { team_addr: self.team.remote_ptr_addr, }; // println!("[{:?}] task group am pe", std::thread::current().id()); - self.team.scheduler.submit_am(Am::Remote(req_data, func)); + // self.team.scheduler.submit_am(Am::Remote(req_data, func)); TaskGroupAmHandle { inner: self.req.clone(), + am: Some((Am::Remote(req_data, func), 1)), sub_id: req_id.sub_id, _phantom: PhantomData, } @@ -884,9 +968,9 @@ impl LamellarTaskGroup { func: LamellarArcLocalAm, ) -> TaskGroupLocalAmHandle { // println!("task group exec am local"); - self.team.team_counters.add_send_req(1); - self.team.world_counters.add_send_req(1); - self.counters.add_send_req(1); + self.team.team_counters.inc_send_req(1); + self.team.world_counters.inc_send_req(1); + self.counters.inc_send_req(1); // println!("cnts: t: {} w: {} self: {:?}",self.team.team_counters.outstanding_reqs.load(Ordering::Relaxed),self.team.world_counters.outstanding_reqs.load(Ordering::Relaxed), self.counters.outstanding_reqs.load(Ordering::Relaxed)); self.cnt.fetch_add(1, Ordering::SeqCst); @@ -911,7 +995,7 @@ impl LamellarTaskGroup { team_addr: self.team.remote_ptr_addr, }; // println!("[{:?}] task group am local", std::thread::current().id()); - self.team.scheduler.submit_am(Am::Local(req_data, func)); + // self.team.scheduler.submit_am(Am::Local(req_data, func)); // Box::new(TaskGroupLocalAmHandle { // inner: self.local_req.clone(), // sub_id: req_id.sub_id, @@ -919,6 +1003,7 @@ impl LamellarTaskGroup { // }) TaskGroupLocalAmHandle { inner: self.local_req.clone(), + am: Some((Am::Local(req_data, func), 1)), sub_id: req_id.sub_id, _phantom: PhantomData, } diff --git a/src/lamellar_team.rs b/src/lamellar_team.rs index e783e06f..12d80c46 100644 --- a/src/lamellar_team.rs +++ b/src/lamellar_team.rs @@ -12,6 +12,8 @@ use crate::memregion::{ MemoryRegion, RemoteMemoryRegion, }; use crate::scheduler::{LamellarTask, ReqId, Scheduler}; +use crate::warnings::RuntimeWarning; + #[cfg(feature = "nightly")] use crate::utils::ser_closure; @@ -738,7 +740,7 @@ pub struct LamellarTeamRT { pub(crate) num_world_pes: usize, pub(crate) team_pe: Result, pub(crate) num_pes: usize, - pub(crate) team_counters: AMCounters, + pub(crate) team_counters: Arc, pub(crate) world_counters: Arc, // can probably remove this? 
pub(crate) id: usize, sub_team_id_cnt: AtomicUsize, @@ -861,7 +863,7 @@ impl LamellarTeamRT { team_pe: Ok(world_pe), num_world_pes: num_pes, num_pes: num_pes, - team_counters: AMCounters::new(), + team_counters: Arc::new(AMCounters::new()), world_counters: world_counters, id: 0, team_hash: 0, //easy id to look up for global @@ -1119,7 +1121,7 @@ impl LamellarTeamRT { num_world_pes: parent.num_world_pes, team_pe: archrt.team_pe(parent.world_pe), num_pes: num_pes, - team_counters: AMCounters::new(), + team_counters: Arc::new(AMCounters::new()), world_counters: parent.world_counters.clone(), id: id, sub_team_id_cnt: AtomicUsize::new(0), @@ -1348,18 +1350,14 @@ impl LamellarTeamRT { } // } - pub(crate) fn inc_counters(&self, cnt: usize) { - self.team_counters.add_send_req(cnt); - self.world_counters.add_send_req(cnt); + pub(crate) fn inc_outstanding(&self, cnt: usize) { + self.team_counters.inc_outstanding(cnt); + self.world_counters.inc_outstanding(cnt); } - pub(crate) fn dec_counters(&self, cnt: usize) { - self.team_counters - .outstanding_reqs - .fetch_sub(cnt, Ordering::SeqCst); - self.world_counters - .outstanding_reqs - .fetch_sub(cnt, Ordering::SeqCst); + pub(crate) fn dec_outstanding(&self, cnt: usize) { + self.team_counters.dec_outstanding(cnt); + self.world_counters.dec_outstanding(cnt); } pub(crate) fn spawn(&self, task: F) -> LamellarTask @@ -1374,20 +1372,27 @@ impl LamellarTeamRT { //#[tracing::instrument(skip_all)] pub(crate) fn wait_all(&self) { // println!("wait_all called on pe: {}", self.world_pe); - let mut exec_task = true; - if std::thread::current().id() != *crate::MAIN_THREAD { - if let Some(val) = config().blocking_call_warning { - if val { - println!("[LAMELLAR WARNING] You are calling wait_all from within an async context, it is recommended that you use `await_all().await;` instead! - Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); - } - } else { - println!("[LAMELLAR WARNING] You are calling wait_all from within an async context, it is recommended that you use `await_all().await;` instead! 
- Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning, Set RUST_LIB_BACKTRACE=1 to see where the call is occcuring: {}", std::backtrace::Backtrace::capture()); - } - exec_task = false; + + RuntimeWarning::BlockingCall("wait_all", "await_all().await").print(); + if self.team_counters.send_req_cnt.load(Ordering::SeqCst) + != self.team_counters.launched_req_cnt.load(Ordering::SeqCst) + || self.world_counters.send_req_cnt.load(Ordering::SeqCst) + != self.world_counters.launched_req_cnt.load(Ordering::SeqCst) + { + RuntimeWarning::UnspanedTask( + "`wait_all` before all tasks/active messages have been spawned", + ) + .print(); + println!( + "in team wait_all mype: {:?} cnt: {:?} {:?} {:?}", + self.world_pe, + self.team_counters.send_req_cnt.load(Ordering::SeqCst), + self.team_counters.outstanding_reqs.load(Ordering::SeqCst), + self.team_counters.launched_req_cnt.load(Ordering::SeqCst) + ); } let mut temp_now = Instant::now(); + // println!( // "in team wait_all mype: {:?} cnt: {:?} {:?}", // self.world_pe, @@ -1401,7 +1406,7 @@ impl LamellarTeamRT { { // std::thread::yield_now(); // self.flush(); - if exec_task { + if std::thread::current().id() != *crate::MAIN_THREAD { self.scheduler.exec_task() }; //mmight as well do useful work while we wait } if temp_now.elapsed().as_secs_f64() > config().deadlock_timeout { @@ -1422,6 +1427,16 @@ impl LamellarTeamRT { // ); } pub(crate) async fn await_all(&self) { + if self.team_counters.send_req_cnt.load(Ordering::SeqCst) + != self.team_counters.launched_req_cnt.load(Ordering::SeqCst) + || self.world_counters.send_req_cnt.load(Ordering::SeqCst) + != self.world_counters.launched_req_cnt.load(Ordering::SeqCst) + { + RuntimeWarning::UnspanedTask( + "`await_all` before all tasks/active messages have been spawned", + ) + .print(); + } let mut temp_now = Instant::now(); while self.panic.load(Ordering::SeqCst) == 0 && (self.team_counters.outstanding_reqs.load(Ordering::SeqCst) > 0 @@ -1505,21 +1520,17 @@ impl LamellarTeamRT { // println!("team exec am all num_pes {:?}", self.num_pes); // trace!("[{:?}] team exec am all request", self.world_pe); // event!(Level::TRACE, "team exec am all request"); - let tg_outstanding_reqs = match task_group_cnts { - Some(task_group_cnts) => { - task_group_cnts.add_send_req(self.num_pes); - Some(task_group_cnts.outstanding_reqs.clone()) - } - None => None, - }; + if let Some(task_group_cnts) = task_group_cnts.as_ref() { + task_group_cnts.inc_send_req(1); + } let req = Arc::new(MultiAmHandleInner { cnt: AtomicUsize::new(self.num_pes), arch: self.arch.clone(), data: Mutex::new(HashMap::new()), waker: Mutex::new(None), - team_outstanding_reqs: self.team_counters.outstanding_reqs.clone(), - world_outstanding_reqs: self.world_counters.outstanding_reqs.clone(), - tg_outstanding_reqs: tg_outstanding_reqs.clone(), + team_counters: self.team_counters.clone(), + world_counters: self.world_counters.clone(), + tg_counters: task_group_cnts, user_handle: AtomicU8::new(1), scheduler: self.scheduler.clone(), }); @@ -1535,8 +1546,8 @@ impl LamellarTeamRT { sub_id: 0, }; - self.world_counters.add_send_req(self.num_pes); - self.team_counters.add_send_req(self.num_pes); + self.world_counters.inc_send_req(self.num_pes); + self.team_counters.inc_send_req(self.num_pes); // println!("cnts: t: {} w: {} tg: {:?}",self.team_counters.outstanding_reqs.load(Ordering::Relaxed),self.world_counters.outstanding_reqs.load(Ordering::Relaxed), tg_outstanding_reqs.as_ref().map(|x| x.load(Ordering::Relaxed))); // println!("team counter: {:?}", 
self.team_counters.outstanding_reqs); @@ -1557,9 +1568,10 @@ impl LamellarTeamRT { }; // event!(Level::TRACE, "submitting request to scheduler"); // println!("[{:?}] team exec all", std::thread::current().id()); - self.scheduler.submit_am(Am::All(req_data, func)); + // self.scheduler.submit_am(Am::All(req_data, func)); MultiAmHandle { inner: req, + am: Some((Am::All(req_data, func), self.num_pes)), _phantom: PhantomData, } } @@ -1577,21 +1589,18 @@ impl LamellarTeamRT { // println!("team exec am all num_pes {:?}", self.num_pes); // trace!("[{:?}] team exec am all request", self.world_pe); // event!(Level::TRACE, "team exec am all request"); - let tg_outstanding_reqs = match task_group_cnts { - Some(task_group_cnts) => { - task_group_cnts.add_send_req(self.num_pes); - Some(task_group_cnts.outstanding_reqs.clone()) - } - None => None, - }; + if let Some(task_group_cnts) = task_group_cnts.as_ref() { + task_group_cnts.inc_send_req(1); + } + let req = Arc::new(MultiAmHandleInner { cnt: AtomicUsize::new(self.num_pes), arch: self.arch.clone(), data: Mutex::new(HashMap::new()), waker: Mutex::new(None), - team_outstanding_reqs: self.team_counters.outstanding_reqs.clone(), - world_outstanding_reqs: self.world_counters.outstanding_reqs.clone(), - tg_outstanding_reqs: tg_outstanding_reqs.clone(), + team_counters: self.team_counters.clone(), + world_counters: self.world_counters.clone(), + tg_counters: task_group_cnts, user_handle: AtomicU8::new(1), scheduler: self.scheduler.clone(), }); @@ -1607,8 +1616,8 @@ impl LamellarTeamRT { sub_id: 0, }; - self.world_counters.add_send_req(self.num_pes); - self.team_counters.add_send_req(self.num_pes); + self.world_counters.inc_send_req(self.num_pes); + self.team_counters.inc_send_req(self.num_pes); // println!("cnts: t: {} w: {} tg: {:?}",self.team_counters.outstanding_reqs.load(Ordering::Relaxed),self.world_counters.outstanding_reqs.load(Ordering::Relaxed), tg_outstanding_reqs.as_ref().map(|x| x.load(Ordering::Relaxed))); // println!("team counter: {:?}", self.team_counters.outstanding_reqs); @@ -1629,9 +1638,10 @@ impl LamellarTeamRT { }; // event!(Level::TRACE, "submitting request to scheduler"); // println!("[{:?}] team am group exec all", std::thread::current().id()); - self.scheduler.submit_am(Am::All(req_data, func)); + // self.scheduler.submit_am(Am::All(req_data, func)); MultiAmHandle { inner: req, + am: Some((Am::All(req_data, func), self.num_pes)), _phantom: PhantomData, } } @@ -1655,22 +1665,18 @@ impl LamellarTeamRT { F: RemoteActiveMessage + LamellarAM + crate::Serialize + 'static, { // println!("team exec am pe tg"); - let tg_outstanding_reqs = match task_group_cnts { - Some(task_group_cnts) => { - task_group_cnts.add_send_req(1); - Some(task_group_cnts.outstanding_reqs.clone()) - } - None => None, - }; + if let Some(task_group_cnts) = task_group_cnts.as_ref() { + task_group_cnts.inc_send_req(1); + } assert!(pe < self.arch.num_pes()); let req = Arc::new(AmHandleInner { ready: AtomicBool::new(false), data: Cell::new(None), waker: Mutex::new(None), - team_outstanding_reqs: self.team_counters.outstanding_reqs.clone(), - world_outstanding_reqs: self.world_counters.outstanding_reqs.clone(), - tg_outstanding_reqs: tg_outstanding_reqs.clone(), + team_counters: self.team_counters.clone(), + world_counters: self.world_counters.clone(), + tg_counters: task_group_cnts, user_handle: AtomicU8::new(1), scheduler: self.scheduler.clone(), }); @@ -1681,8 +1687,8 @@ impl LamellarTeamRT { id: req_ptr as usize, sub_id: 0, }; - self.world_counters.add_send_req(1); - 
self.team_counters.add_send_req(1); + self.world_counters.inc_send_req(1); + self.team_counters.inc_send_req(1); // println!( // "req_id: {:?} tc: {:?} wc: {:?}", // id, @@ -1709,7 +1715,7 @@ impl LamellarTeamRT { }; // println!("[{:?}] team exec am pe tg", std::thread::current().id()); - self.scheduler.submit_am(Am::Remote(req_data, func)); + // self.scheduler.submit_am(Am::Remote(req_data, func)); // Box::new(LamellarRequestHandle { // inner: req, @@ -1717,6 +1723,7 @@ impl LamellarTeamRT { // }) AmHandle { inner: req, + am: Some((Am::Remote(req_data, func), 1)), _phantom: PhantomData, } .into() @@ -1734,22 +1741,18 @@ impl LamellarTeamRT { O: AmDist + 'static, { // println!("team exec am pe tg"); - let tg_outstanding_reqs = match task_group_cnts { - Some(task_group_cnts) => { - task_group_cnts.add_send_req(1); - Some(task_group_cnts.outstanding_reqs.clone()) - } - None => None, - }; + if let Some(task_group_cnts) = task_group_cnts.as_ref() { + task_group_cnts.inc_send_req(1); + } assert!(pe < self.arch.num_pes()); let req = Arc::new(AmHandleInner { ready: AtomicBool::new(false), data: Cell::new(None), waker: Mutex::new(None), - team_outstanding_reqs: self.team_counters.outstanding_reqs.clone(), - world_outstanding_reqs: self.world_counters.outstanding_reqs.clone(), - tg_outstanding_reqs: tg_outstanding_reqs.clone(), + team_counters: self.team_counters.clone(), + world_counters: self.world_counters.clone(), + tg_counters: task_group_cnts, user_handle: AtomicU8::new(1), scheduler: self.scheduler.clone(), }); @@ -1760,8 +1763,8 @@ impl LamellarTeamRT { id: req_ptr as usize, sub_id: 0, }; - self.world_counters.add_send_req(1); - self.team_counters.add_send_req(1); + self.world_counters.inc_send_req(1); + self.team_counters.inc_send_req(1); // println!( // "req_id: {:?} tc: {:?} wc: {:?}", // id, @@ -1791,7 +1794,7 @@ impl LamellarTeamRT { // "[{:?}] team am group exec am pe tg", // std::thread::current().id() // ); - self.scheduler.submit_am(Am::Remote(req_data, func)); + // self.scheduler.submit_am(Am::Remote(req_data, func)); // Box::new(LamellarRequestHandle { // inner: req, @@ -1799,6 +1802,7 @@ impl LamellarTeamRT { // }) AmHandle { inner: req, + am: Some((Am::Remote(req_data, func), 1)), _phantom: PhantomData, } } @@ -1813,21 +1817,17 @@ impl LamellarTeamRT { F: AmDist, { // println!("team exec arc am pe"); - let tg_outstanding_reqs = match task_group_cnts { - Some(task_group_cnts) => { - task_group_cnts.add_send_req(self.num_pes); - Some(task_group_cnts.outstanding_reqs.clone()) - } - None => None, - }; + if let Some(task_group_cnts) = task_group_cnts.as_ref() { + task_group_cnts.inc_send_req(1); + } let req = Arc::new(MultiAmHandleInner { cnt: AtomicUsize::new(self.num_pes), arch: self.arch.clone(), waker: Mutex::new(None), data: Mutex::new(HashMap::new()), - team_outstanding_reqs: self.team_counters.outstanding_reqs.clone(), - world_outstanding_reqs: self.world_counters.outstanding_reqs.clone(), - tg_outstanding_reqs: tg_outstanding_reqs.clone(), + team_counters: self.team_counters.clone(), + world_counters: self.world_counters.clone(), + tg_counters: task_group_cnts, user_handle: AtomicU8::new(1), scheduler: self.scheduler.clone(), }); @@ -1841,8 +1841,8 @@ impl LamellarTeamRT { id: req_ptr as usize, sub_id: 0, }; - self.world_counters.add_send_req(self.num_pes); - self.team_counters.add_send_req(self.num_pes); + self.world_counters.inc_send_req(self.num_pes); + self.team_counters.inc_send_req(self.num_pes); // println!("cnts: t: {} w: {} tg: 
{:?}",self.team_counters.outstanding_reqs.load(Ordering::Relaxed),self.world_counters.outstanding_reqs.load(Ordering::Relaxed), tg_outstanding_reqs.as_ref().map(|x| x.load(Ordering::Relaxed))); let world = if let Some(world) = &self.world { @@ -1864,10 +1864,11 @@ impl LamellarTeamRT { // "[{:?}] team arc exec am all tg", // std::thread::current().id() // ); - self.scheduler.submit_am(Am::All(req_data, am)); + // self.scheduler.submit_am(Am::All(req_data, am)); MultiAmHandle { inner: req, + am: Some((Am::All(req_data, am), self.num_pes)), _phantom: PhantomData, } } @@ -1883,21 +1884,17 @@ impl LamellarTeamRT { F: AmDist, { // println!("team exec arc am pe"); - let tg_outstanding_reqs = match task_group_cnts { - Some(task_group_cnts) => { - task_group_cnts.add_send_req(1); - Some(task_group_cnts.outstanding_reqs.clone()) - } - None => None, - }; + if let Some(task_group_cnts) = task_group_cnts.as_ref() { + task_group_cnts.inc_send_req(1); + } assert!(pe < self.arch.num_pes()); let req = Arc::new(AmHandleInner { ready: AtomicBool::new(false), data: Cell::new(None), waker: Mutex::new(None), - team_outstanding_reqs: self.team_counters.outstanding_reqs.clone(), - world_outstanding_reqs: self.world_counters.outstanding_reqs.clone(), - tg_outstanding_reqs: tg_outstanding_reqs.clone(), + team_counters: self.team_counters.clone(), + world_counters: self.world_counters.clone(), + tg_counters: task_group_cnts, user_handle: AtomicU8::new(1), scheduler: self.scheduler.clone(), }); @@ -1907,8 +1904,8 @@ impl LamellarTeamRT { id: req_ptr as usize, sub_id: 0, }; - self.world_counters.add_send_req(1); - self.team_counters.add_send_req(1); + self.world_counters.inc_send_req(1); + self.team_counters.inc_send_req(1); // println!("cnts: t: {} w: {} tg: {:?}",self.team_counters.outstanding_reqs.load(Ordering::Relaxed),self.world_counters.outstanding_reqs.load(Ordering::Relaxed), tg_outstanding_reqs.as_ref().map(|x| x.load(Ordering::Relaxed))); let world = if let Some(world) = &self.world { @@ -1927,7 +1924,7 @@ impl LamellarTeamRT { }; // println!("[{:?}] team arc exec am pe", std::thread::current().id()); - self.scheduler.submit_am(Am::Remote(req_data, am)); + // self.scheduler.submit_am(Am::Remote(req_data, am)); // Box::new(LamellarRequestHandle { // inner: req, @@ -1935,6 +1932,7 @@ impl LamellarTeamRT { // }) AmHandle { inner: req, + am: Some((Am::Remote(req_data, am), 1)), _phantom: PhantomData, } .into() @@ -1951,21 +1949,17 @@ impl LamellarTeamRT { F: AmDist, { // println!("team exec arc am pe"); - let tg_outstanding_reqs = match task_group_cnts { - Some(task_group_cnts) => { - task_group_cnts.add_send_req(1); - Some(task_group_cnts.outstanding_reqs.clone()) - } - None => None, - }; + if let Some(task_group_cnts) = task_group_cnts.as_ref() { + task_group_cnts.inc_send_req(1); + } assert!(pe < self.arch.num_pes()); let req = Arc::new(AmHandleInner { ready: AtomicBool::new(false), data: Cell::new(None), waker: Mutex::new(None), - team_outstanding_reqs: self.team_counters.outstanding_reqs.clone(), - world_outstanding_reqs: self.world_counters.outstanding_reqs.clone(), - tg_outstanding_reqs: tg_outstanding_reqs.clone(), + team_counters: self.team_counters.clone(), + world_counters: self.world_counters.clone(), + tg_counters: task_group_cnts, user_handle: AtomicU8::new(1), scheduler: self.scheduler.clone(), }); @@ -1975,8 +1969,8 @@ impl LamellarTeamRT { id: req_ptr as usize, sub_id: 0, }; - self.world_counters.add_send_req(1); - self.team_counters.add_send_req(1); + self.world_counters.inc_send_req(1); + 
self.team_counters.inc_send_req(1); // println!("cnts: t: {} w: {} tg: {:?}",self.team_counters.outstanding_reqs.load(Ordering::Relaxed),self.world_counters.outstanding_reqs.load(Ordering::Relaxed), tg_outstanding_reqs.as_ref().map(|x| x.load(Ordering::Relaxed))); let world = if let Some(world) = &self.world { @@ -2003,6 +1997,7 @@ impl LamellarTeamRT { // }) AmHandle { inner: req, + am: None, _phantom: PhantomData, } .into() @@ -2026,20 +2021,16 @@ impl LamellarTeamRT { F: LamellarActiveMessage + LocalAM + 'static, { // println!("team exec am local"); - let tg_outstanding_reqs = match task_group_cnts { - Some(task_group_cnts) => { - task_group_cnts.add_send_req(1); - Some(task_group_cnts.outstanding_reqs.clone()) - } - None => None, - }; + if let Some(task_group_cnts) = task_group_cnts.as_ref() { + task_group_cnts.inc_send_req(1); + } let req = Arc::new(AmHandleInner { ready: AtomicBool::new(false), data: Cell::new(None), waker: Mutex::new(None), - team_outstanding_reqs: self.team_counters.outstanding_reqs.clone(), - world_outstanding_reqs: self.world_counters.outstanding_reqs.clone(), - tg_outstanding_reqs: tg_outstanding_reqs.clone(), + team_counters: self.team_counters.clone(), + world_counters: self.world_counters.clone(), + tg_counters: task_group_cnts, user_handle: AtomicU8::new(1), scheduler: self.scheduler.clone(), }); @@ -2050,8 +2041,8 @@ impl LamellarTeamRT { sub_id: 0, }; - self.world_counters.add_send_req(1); - self.team_counters.add_send_req(1); + self.world_counters.inc_send_req(1); + self.team_counters.inc_send_req(1); // println!("cnts: t: {} w: {} tg: {:?}",self.team_counters.outstanding_reqs.load(Ordering::Relaxed),self.world_counters.outstanding_reqs.load(Ordering::Relaxed), tg_outstanding_reqs.as_ref().map(|x| x.load(Ordering::Relaxed))); let func: LamellarArcLocalAm = Arc::new(am); @@ -2071,7 +2062,7 @@ impl LamellarTeamRT { team_addr: self.remote_ptr_addr, }; // println!("[{:?}] team exec am local", std::thread::current().id()); - self.scheduler.submit_am(Am::Local(req_data, func)); + // self.scheduler.submit_am(Am::Local(req_data, func)); // Box::new(LamellarLocalRequestHandle { // inner: req, @@ -2079,6 +2070,7 @@ impl LamellarTeamRT { // }) LocalAmHandle { inner: req, + am: Some((Am::Local(req_data, func), 1)), _phantom: PhantomData, } } diff --git a/src/lib.rs b/src/lib.rs index c50eb5a3..96a9f9f1 100755 --- a/src/lib.rs +++ b/src/lib.rs @@ -272,6 +272,8 @@ mod utils; //#[doc(hidden)] pub use utils::*; +pub(crate) mod warnings; + pub mod env_var; pub use env_var::config; diff --git a/src/scheduler.rs b/src/scheduler.rs index cffaecd9..3d7bb1c5 100755 --- a/src/scheduler.rs +++ b/src/scheduler.rs @@ -5,6 +5,7 @@ use crate::active_messaging::registered_active_message::RegisteredActiveMessages use crate::active_messaging::*; use crate::env_var::config; use crate::lamellae::{Des, Lamellae, SerializedData}; +use crate::warnings::RuntimeWarning; use enum_dispatch::enum_dispatch; use futures_util::Future; @@ -103,15 +104,7 @@ pub struct LamellarTask { impl LamellarTask { pub fn block(self) -> T { - if std::thread::current().id() != *crate::MAIN_THREAD { - println!( - "[LAMELLAR WARNING] trying to call block on within a worker thread {:?} this may result in deadlock. - Typically this means you are running within an async context. If you have something like: - world.block_on(my_future) you can simply change to my_future.await. 
If this is not the case,
-                please file an issue on github.",
-                std::backtrace::Backtrace::capture()
-            )
-        }
+        RuntimeWarning::BlockingCall("LamellarTask::block", ".await").print();
         self.executor.clone().block_on(self)
     }
 }
@@ -509,15 +502,7 @@ impl Scheduler {
     }
 
     pub(crate) fn block_on(&self, task: F) -> F::Output {
-        if std::thread::current().id() != *crate::MAIN_THREAD {
-            println!(
-                "[LAMELLAR WARNING] trying to call block on within a worker thread {:?} this may result in deadlock.
-                Typically this means you are running within an async context. If you have something like:
-                world.block_on(my_future) you can simply change to my_future.await. If this is not the case,
-                please file an issue on github.",
-                std::backtrace::Backtrace::capture()
-            )
-        }
+        RuntimeWarning::BlockOn.print();
         self.executor.block_on(task)
     }
 
diff --git a/src/warnings.rs b/src/warnings.rs
new file mode 100644
index 00000000..accb47a8
--- /dev/null
+++ b/src/warnings.rs
@@ -0,0 +1,99 @@
+use crate::config;
+
+pub(crate) enum RuntimeWarning<'a> {
+    UnspanedTask(&'a str),
+    DroppedHandle(&'a str),
+    BlockingCall(&'a str, &'a str),
+    BlockOn,
+    // AsyncDeadlockCustom(&'a str),
+    BarrierTimeout(f64),
+}
+
+impl<'a> RuntimeWarning<'a> {
+    fn print_warning(&self) -> bool {
+        match self {
+            RuntimeWarning::UnspanedTask(_) => match config().unpspawned_task_warning {
+                Some(true) => true,
+                Some(false) => false,
+                None => true,
+            },
+            RuntimeWarning::DroppedHandle(_) => match config().dropped_unused_handle_warning {
+                Some(true) => true,
+                Some(false) => false,
+                None => true,
+            },
+            RuntimeWarning::BlockingCall(_, _) | RuntimeWarning::BlockOn => {
+                if std::thread::current().id() != *crate::MAIN_THREAD {
+                    match config().blocking_call_warning {
+                        Some(true) => true,
+                        Some(false) => false,
+                        None => true,
+                    }
+                } else {
+                    false
+                }
+            }
+            RuntimeWarning::BarrierTimeout(elapsed) => elapsed > &config().deadlock_timeout,
+        }
+    }
+
+    fn panic(&self, msg: &str) {
+        match self {
+            RuntimeWarning::BarrierTimeout(_) => {}
+            _ => panic!("{msg}
+            Note this warning causes a panic because you have compiled lamellar with the `runtime-warnings-panic` feature.
+            Recompile without this feature to only print warnings, rather than panic.
+            To disable runtime warnings completely, recompile lamellar with the `disable-runtime-warnings` feature.
+            To view backtrace set RUST_LIB_BACKTRACE=1.
+            {}",
+            std::backtrace::Backtrace::capture()),
+        }
+    }
+
+    pub(crate) fn print(self) {
+        #[cfg(not(feature = "disable-runtime-warnings"))]
+        if self.print_warning() {
+            let msg = match self {
+                RuntimeWarning::UnspanedTask(msg) => {
+                    format!("[LAMELLAR WARNING] you have called {msg}.
+                    This typically means you forgot to call spawn() on the handle returned from calls such as exec_am_* or various array operations.
+                    If this is your intended behavior, set LAMELLAR_UNSPAWNED_TASK_WARNING=0 to disable this warning.")
+                }
+                RuntimeWarning::DroppedHandle(msg) => {
+                    format!("[LAMELLAR WARNING] You are dropping {msg} that has not been 'await'ed, 'spawn()'ed or 'block()'ed on a PE.
+                    This means any work associated with the AM will not be performed. Set LAMELLAR_DROPPED_UNUSED_HANDLE_WARNING=0 to disable this warning.")
+                }
+                RuntimeWarning::BlockingCall(func, async_func) => {
+                    format!("[LAMELLAR WARNING] You are calling {func} from within an async context, this may result in deadlock!
+                    Using '{async_func}' is likely a better choice. 
Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning.")
+                }
+                RuntimeWarning::BlockOn => {
+                    format!("[LAMELLAR WARNING] You are calling block_on from within an async context, this may result in deadlock!
+                    If you have something like: `world.block_on(my_future)` you can simply change to my_future.await.
+                    If this is not the case, please file an issue on github. Set LAMELLAR_BLOCKING_CALL_WARNING=0 to disable this warning.")
+                }
+                RuntimeWarning::BarrierTimeout(_) => {
+                    format!("[LAMELLAR WARNING][{:?}] You have encountered a barrier timeout. Potential deadlock detected.
+                    Barrier is a collective operation requiring all PEs associated with the distributed object to enter the barrier call.
+                    Please refer to https://docs.rs/lamellar/latest/lamellar/index.html?search=barrier for more information.
+                    Note that barriers are often called internally for many collective operations, including constructing new LamellarTeams, LamellarArrays, and Darcs, as well as distributed iteration.
+                    You may be seeing this message if you have called barrier within an async context (meaning it was executed on a worker thread).
+                    A full list of collective operations is found at https://docs.rs/lamellar/latest/lamellar/index.html?search=collective .
+                    The deadlock timeout can be set via the LAMELLAR_DEADLOCK_WARNING_TIMEOUT environment variable, the current timeout is {} seconds, setting this to 0 will disable this warning.",
+                    std::thread::current().id(), config().deadlock_timeout)
+                }
+            };
+
+            #[cfg(feature = "runtime-warnings-panic")]
+            self.panic(&msg);
+            println!(
+                "{msg}
+                Note that this warning is informative only, and will not terminate your application.
+                To disable runtime warnings completely, recompile lamellar with the `disable-runtime-warnings` feature.
+                To view backtrace set RUST_LIB_BACKTRACE=1.
+                {}",
+                std::backtrace::Backtrace::capture()
+            );
+        }
+    }
+}
From 1376a0d2a45f451b28ab5d4521e3704cd1937273 Mon Sep 17 00:00:00 2001
From: "Ryan D. 
Friese" Date: Fri, 25 Oct 2024 10:31:11 -0700 Subject: [PATCH 101/116] implement dropped and unused warnings for various handle types, trying to ensure consistent behavior across lamellar operations --- .../array_consumer_schedules.rs | 249 +++++++-- examples/array_examples/array_ops.rs | 22 +- .../array_examples/atomic_compare_exchange.rs | 3 +- impl/src/array_ops.rs | 12 +- run_examples.sh | 2 +- src/active_messaging/handle.rs | 8 +- src/array.rs | 53 +- src/array/atomic.rs | 11 + src/array/generic_atomic/rdma.rs | 5 + src/array/global_lock_atomic.rs | 1 + src/array/global_lock_atomic/handle.rs | 19 +- src/array/global_lock_atomic/rdma.rs | 5 + src/array/handle.rs | 63 ++- .../distributed_iterator/consumer/collect.rs | 124 ++--- .../distributed_iterator/consumer/count.rs | 120 ++--- .../distributed_iterator/consumer/for_each.rs | 116 ++--- .../distributed_iterator/consumer/reduce.rs | 268 ++-------- .../distributed_iterator/consumer/sum.rs | 182 ++++--- .../local_iterator/consumer/collect.rs | 117 ++--- .../iterator/local_iterator/consumer/count.rs | 103 ++-- .../local_iterator/consumer/for_each.rs | 112 ++-- .../local_iterator/consumer/reduce.rs | 112 ++-- .../iterator/local_iterator/consumer/sum.rs | 159 +++--- src/array/local_lock_atomic.rs | 7 + src/array/local_lock_atomic/handle.rs | 24 +- src/array/local_lock_atomic/iteration.rs | 21 +- src/array/local_lock_atomic/rdma.rs | 7 +- src/array/native_atomic/rdma.rs | 5 + src/array/operations/handle.rs | 482 +++++++++++------- src/array/unsafe.rs | 100 +++- src/array/unsafe/iteration/local.rs | 6 +- src/array/unsafe/operations.rs | 4 +- src/array/unsafe/rdma.rs | 5 + src/barrier.rs | 89 +--- src/darc.rs | 2 + src/darc/global_rw_darc.rs | 2 + src/darc/handle.rs | 128 ++++- src/darc/local_rw_darc.rs | 2 + src/lamellar_task_group.rs | 77 ++- src/lamellar_team.rs | 104 ++-- src/lamellar_world.rs | 24 +- src/scheduler.rs | 29 +- src/scheduler/work_stealing.rs | 16 +- src/warnings.rs | 58 ++- 44 files changed, 1690 insertions(+), 1368 deletions(-) diff --git a/examples/array_examples/array_consumer_schedules.rs b/examples/array_examples/array_consumer_schedules.rs index f54b9890..91ce1048 100644 --- a/examples/array_examples/array_consumer_schedules.rs +++ b/examples/array_examples/array_consumer_schedules.rs @@ -17,17 +17,60 @@ fn for_each_with_schedule( array .local_iter() .filter(|e| e.load() % 2 == 0) - .for_each_with_schedule(schedule, move |e| { + .for_each_with_schedule(schedule.clone(), move |e| { std::thread::sleep(Duration::from_millis((e.load() * 1) as u64)); *tc.lock().entry(std::thread::current().id()).or_insert(0) += 1; }) .block(); array.barrier(); - println!("elapsed time {:?}", timer.elapsed().as_secs_f64()); + println!( + "for_each {schedule:?} block elapsed time {:?}", + timer.elapsed().as_secs_f64() + ); println!("counts {:?}", thread_cnts.lock()); thread_cnts.lock().clear(); array.barrier(); + + let timer = Instant::now(); + let tc = thread_cnts.clone(); + let _handle = array + .local_iter() + .filter(|e| e.load() % 2 == 0) + .for_each_with_schedule(schedule.clone(), move |e| { + std::thread::sleep(Duration::from_millis((e.load() * 1) as u64)); + *tc.lock().entry(std::thread::current().id()).or_insert(0) += 1; + }) + .spawn(); + array.wait_all(); + array.barrier(); + println!( + "for_each {schedule:?} spawn elapsed time {:?}", + timer.elapsed().as_secs_f64() + ); + println!("counts {:?}", thread_cnts.lock()); + + thread_cnts.lock().clear(); + array.barrier(); + + array.block_on(async move { + let timer = Instant::now(); + 
let tc = thread_cnts.clone(); + let _ = array + .local_iter() + .filter(|e| e.load() % 2 == 0) + .for_each_with_schedule(schedule.clone(), move |e| { + std::thread::sleep(Duration::from_millis((e.load() * 1) as u64)); + *tc.lock().entry(std::thread::current().id()).or_insert(0) += 1; + }) + .await; + array.async_barrier().await; + println!( + "for_each {schedule:?} await elapsed await {:?}", + timer.elapsed().as_secs_f64() + ); + println!("counts {:?}", thread_cnts.lock()); + }); } fn reduce_with_schedule( @@ -37,16 +80,54 @@ fn reduce_with_schedule( ) { let timer = Instant::now(); let _tc = thread_cnts.clone(); - let result = array.block_on( - array - .local_iter() - .filter(|e| e.load() % 2 == 0) - .map(|e| e.load()) - .reduce_with_schedule(schedule, |e1, e2| e1 + e2), + let result = array + .local_iter() + .filter(|e| e.load() % 2 == 0) + .map(|e| e.load()) + .reduce_with_schedule(schedule.clone(), |e1, e2| e1 + e2) + .block(); + array.barrier(); + println!( + "reduce {schedule:?} block elapsed time {:?}", + timer.elapsed().as_secs_f64() ); + println!("reduced {:?}", result); + + let timer = Instant::now(); + let _tc = thread_cnts.clone(); + let result_handle = array + .local_iter() + .filter(|e| e.load() % 2 == 0) + .map(|e| e.load()) + .reduce_with_schedule(schedule.clone(), |e1, e2| e1 + e2) + .spawn(); + + println!("about to wait all"); + array.wait_all(); array.barrier(); - println!("elapsed time {:?}", timer.elapsed().as_secs_f64()); + let result = result_handle.block(); + println!( + "reduce {schedule:?} spawn elapsed time {:?}", + timer.elapsed().as_secs_f64() + ); println!("reduced {:?}", result); + + array.block_on(async move { + let timer = Instant::now(); + let _tc = thread_cnts.clone(); + let result = array + .local_iter() + .filter(|e| e.load() % 2 == 0) + .map(|e| e.load()) + .reduce_with_schedule(schedule.clone(), |e1, e2| e1 + e2) + .await; + array.async_barrier().await; + println!( + "reduce {schedule:?} await elapsed time {:?}", + timer.elapsed().as_secs_f64() + ); + println!("reduced {:?}", result); + }); } fn collect_with_schedule( @@ -56,16 +137,51 @@ fn collect_with_schedule( ) { let timer = Instant::now(); let _tc = thread_cnts.clone(); - let result = array.block_on( - array + let result = array + .local_iter() + .filter(|e| e.load() % 2 == 0) + .map(|e| e.load()) + .collect_with_schedule::>(schedule.clone(), Distribution::Block) + .block(); + array.barrier(); + println!( + "collect {schedule:?} block elapsed time {:?}", + timer.elapsed().as_secs_f64() + ); + println!("collect {:?}", result); + + let timer = Instant::now(); + let _tc = thread_cnts.clone(); + let result_handle = array + .local_iter() + .filter(|e| e.load() % 2 == 0) + .map(|e| e.load()) + .collect_with_schedule::>(schedule.clone(), Distribution::Block) + .spawn(); + array.wait_all(); + array.barrier(); + println!( + "collect {schedule:?} spawn elapsed time {:?}", + timer.elapsed().as_secs_f64() + ); + println!("collect {:?}", result_handle.block()); + + array.block_on(async move { + let timer = Instant::now(); + let _tc = thread_cnts.clone(); + let result = array .local_iter() .filter(|e| e.load() % 2 == 0) .map(|e| e.load()) - .collect_with_schedule::>(schedule, Distribution::Block), - ); - array.barrier(); - println!("elapsed time {:?}", timer.elapsed().as_secs_f64()); - println!("collect {:?}", result); + .collect_with_schedule::>(schedule.clone(), Distribution::Block) + .await; + array.async_barrier().await; + println!( + "collect {schedule:?} await elapsed time {:?}", + 
timer.elapsed().as_secs_f64() + ); + println!("collect {:?}", result); + }); } fn count_with_schedule( @@ -75,15 +191,48 @@ fn count_with_schedule( ) { let timer = Instant::now(); let _tc = thread_cnts.clone(); - let result = array.block_on( - array - .local_iter() - .filter(|e| e.load() % 2 == 0) - .count_with_schedule(schedule), - ); + let result = array + .local_iter() + .filter(|e| e.load() % 2 == 0) + .count_with_schedule(schedule.clone()) + .block(); array.barrier(); - println!("elapsed time {:?}", timer.elapsed().as_secs_f64()); + println!( + "count {schedule:?} block elapsed time {:?}", + timer.elapsed().as_secs_f64() + ); println!("count {:?}", result); + + let timer = Instant::now(); + let _tc = thread_cnts.clone(); + let result_handle = array + .local_iter() + .filter(|e| e.load() % 2 == 0) + .count_with_schedule(schedule.clone()) + .spawn(); + array.wait_all(); + array.barrier(); + println!( + "count {schedule:?} spawn elapsed time {:?}", + timer.elapsed().as_secs_f64() + ); + println!("count {:?}", result_handle.block()); + + array.block_on(async move { + let timer = Instant::now(); + let _tc = thread_cnts.clone(); + let result = array + .local_iter() + .filter(|e| e.load() % 2 == 0) + .count_with_schedule(schedule.clone()) + .await; + array.async_barrier().await; + println!( + "count {schedule:?} await elapsed time {:?}", + timer.elapsed().as_secs_f64() + ); + println!("count {:?}", result); + }); } fn sum_with_schedule( @@ -93,24 +242,60 @@ fn sum_with_schedule( ) { let timer = Instant::now(); let _tc = thread_cnts.clone(); - let result = array.block_on( - array + let result = array + .local_iter() + .map(|e| e.load()) + .filter(|e| e % 2 == 0) + .sum_with_schedule(schedule.clone()) + .block(); + array.barrier(); + println!( + "sum {schedule:?} block elapsed time {:?}", + timer.elapsed().as_secs_f64() + ); + println!("sum {:?}", result); + + let timer = Instant::now(); + let _tc = thread_cnts.clone(); + let result_handle = array + .local_iter() + .map(|e| e.load()) + .filter(|e| e % 2 == 0) + .sum_with_schedule(schedule.clone()) + .spawn(); + array.wait_all(); + array.barrier(); + println!( + "sum {schedule:?} spawn elapsed time {:?}", + timer.elapsed().as_secs_f64() + ); + println!("sum {:?}", result_handle.block()); + + array.block_on(async move { + let timer = Instant::now(); + let _tc = thread_cnts.clone(); + let result = array .local_iter() .map(|e| e.load()) .filter(|e| e % 2 == 0) - .sum_with_schedule(schedule), - ); - array.barrier(); - println!("elapsed time {:?}", timer.elapsed().as_secs_f64()); - println!("sum {:?}", result); + .sum_with_schedule(schedule.clone()) + .await; + array.async_barrier().await; + println!( + "sum {schedule:?} await elapsed time {:?}", + timer.elapsed().as_secs_f64() + ); + println!("sum {:?}", result); + }); } fn main() { let world = lamellar::LamellarWorldBuilder::new().build(); println!("world created"); let _my_pe = world.my_pe(); - let _num_pes = world.num_pes(); - let block_array = AtomicArray::::new(world.team(), ARRAY_LEN, Distribution::Block); + let num_pes = world.num_pes(); + let block_array = + AtomicArray::::new(world.team(), ARRAY_LEN * num_pes, Distribution::Block); println!("array created"); block_array.print(); let _ = block_array diff --git a/examples/array_examples/array_ops.rs b/examples/array_examples/array_ops.rs index af52afa4..61763bf5 100644 --- a/examples/array_examples/array_ops.rs +++ b/examples/array_examples/array_ops.rs @@ -100,7 +100,7 @@ fn test_add( array.barrier(); let mut reqs = vec![]; for i in 
0..array.len() { - reqs.push(array.fetch_add(i, add_val)); + reqs.push(array.fetch_add(i, add_val).spawn()); } for (i, req) in reqs.drain(0..).enumerate() { println!( @@ -137,7 +137,7 @@ fn test_sub( array.barrier(); let mut reqs = vec![]; for i in 0..array.len() { - reqs.push(array.fetch_sub(i, sub_val)); + reqs.push(array.fetch_sub(i, sub_val).spawn()); } for (i, req) in reqs.drain(0..).enumerate() { println!("i: {:?} {:?}", i, array.block_on(req)); @@ -168,7 +168,7 @@ fn test_mul( array.barrier(); let mut reqs = vec![]; for i in 0..array.len() { - reqs.push(array.fetch_mul(i, mul_val)); + reqs.push(array.fetch_mul(i, mul_val).spawn()); } for (i, req) in reqs.drain(0..).enumerate() { println!("i: {:?} {:?}", i, array.block_on(req)); @@ -200,7 +200,7 @@ fn test_div( array.barrier(); let mut reqs = vec![]; for i in 0..array.len() { - reqs.push(array.fetch_div(i, div_val)); + reqs.push(array.fetch_div(i, div_val).spawn()); } for (i, req) in reqs.drain(0..).enumerate() { println!("i: {:?} {:?}", i, array.block_on(req)); @@ -232,7 +232,7 @@ fn test_rem( array.barrier(); let mut reqs = vec![]; for i in 0..array.len() { - reqs.push(array.fetch_rem(i, rem_val)); + reqs.push(array.fetch_rem(i, rem_val).spawn()); } for (i, req) in reqs.drain(0..).enumerate() { println!("i: {:?} {:?}", i, array.block_on(req)); @@ -270,7 +270,7 @@ fn test_and( array.barrier(); let mut reqs = vec![]; for i in 0..array.len() { - reqs.push(array.fetch_bit_or(i, or_val)); + reqs.push(array.fetch_bit_or(i, or_val).spawn()); } for (i, req) in reqs.drain(0..).enumerate() { println!("i: {:?} {:?}", i, array.block_on(req)); @@ -346,7 +346,7 @@ fn test_xor( array.barrier(); let mut reqs = vec![]; for i in 0..array.len() { - reqs.push(array.fetch_bit_xor(i, xor_val)); + reqs.push(array.fetch_bit_xor(i, xor_val).spawn()); } for (i, req) in reqs.drain(0..).enumerate() { println!("i: {:?} {:?}", i, array.block_on(req)); @@ -381,7 +381,7 @@ fn test_store_load( let mut reqs = vec![]; for i in 0..array.len() { - reqs.push(array.load(i)); + reqs.push(array.load(i).spawn()); } for (i, req) in reqs.drain(0..).enumerate() { println!("i: {:?} {:?}", i, array.block_on(req)); @@ -413,7 +413,7 @@ fn test_shl( array.barrier(); let mut reqs = vec![]; for i in 0..array.len() { - reqs.push(array.fetch_shl(i, shl_val)); + reqs.push(array.fetch_shl(i, shl_val).spawn()); } for (i, req) in reqs.drain(0..).enumerate() { println!("i: {:?} {:?}", i, array.block_on(req)); @@ -445,7 +445,7 @@ fn test_shr( array.barrier(); let mut reqs = vec![]; for i in 0..array.len() { - reqs.push(array.fetch_shr(i, shr_val)); + reqs.push(array.fetch_shr(i, shr_val).spawn()); } for (i, req) in reqs.drain(0..).enumerate() { println!("i: {:?} {:?}", i, array.block_on(req)); diff --git a/examples/array_examples/atomic_compare_exchange.rs b/examples/array_examples/atomic_compare_exchange.rs index e9fa04dd..452168b2 100644 --- a/examples/array_examples/atomic_compare_exchange.rs +++ b/examples/array_examples/atomic_compare_exchange.rs @@ -25,7 +25,6 @@ fn main() { let array = AtomicArray::::new(world.team(), num_pes * 2, Distribution::Block); array.dist_iter_mut().for_each(|x| x.store(0)).block(); //initialize array -- use atomic store - array.wait_all(); array.barrier(); // array.print(); @@ -55,7 +54,9 @@ fn main() { let old = 0.0; let new = (my_pe + 1) as f32; let epsilon = 0.00001; + println!("here 1"); let res = world.block_on(array_2.batch_compare_exchange_epsilon(indices, old, new, epsilon)); //should not fail + println!("here 2"); array_2.barrier(); let (num_failed, 
num_ok) = res.iter().fold((0, 0), |acc, x| { diff --git a/impl/src/array_ops.rs b/impl/src/array_ops.rs index f39452ce..7dbedf24 100644 --- a/impl/src/array_ops.rs +++ b/impl/src/array_ops.rs @@ -861,6 +861,7 @@ fn create_buf_ops( #[#am(AmGroup(false))] impl LamellarAM for #multi_val_multi_idx_am_buf_name{ //eventually we can return fetchs here too... async fn exec(&self) { + // println!("in multi val multi idx exec"); #slice match self.index_size{ 1 => { @@ -926,6 +927,7 @@ fn create_buf_ops( #[#am(AmGroup(false))] impl LamellarAM for #single_val_multi_idx_am_buf_name{ //eventually we can return fetchs here too... async fn exec(&self) { + // println!("in single val multi idx exec"); #slice let val = self.val; match self.index_size{ @@ -995,6 +997,7 @@ fn create_buf_ops( #[#am(AmGroup(false))] impl LamellarAM for #multi_val_single_idx_am_buf_name{ //eventually we can return fetchs here too... async fn exec(&self) { + // println!("in multi val single idx exec"); #slice let vals = unsafe {std::slice::from_raw_parts(self.vals.as_ptr() as *const #typeident, self.vals.len()/std::mem::size_of::<#typeident>())}; let index = self.index; @@ -1036,6 +1039,7 @@ fn create_buf_ops( #[#am(AmGroup(false))] impl LamellarAM for #multi_val_multi_idx_am_buf_result_name{ //eventually we can return fetchs here too... async fn exec(&self) -> Vec> { + // println!("in multi val multi idx result exec"); #slice let mut res = Vec::new(); match self.index_size{ @@ -1103,6 +1107,7 @@ fn create_buf_ops( #[#am(AmGroup(false))] impl LamellarAM for #single_val_multi_idx_am_buf_result_name{ //eventually we can return fetchs here too... async fn exec(&self) -> Vec> { + // println!("in single val multi idx result exec"); #slice let val = self.val; let mut res = Vec::new(); @@ -1138,6 +1143,7 @@ fn create_buf_ops( } } } + // println!("done in in single val multi idx result exec"); res } } @@ -1173,6 +1179,7 @@ fn create_buf_ops( #[#am(AmGroup(false))] impl LamellarAM for #multi_val_single_idx_am_buf_result_name{ //eventually we can return fetchs here too... async fn exec(&self) -> Vec> { + // println!("in multi val single idx result exec"); #slice let vals = unsafe {std::slice::from_raw_parts(self.vals.as_ptr() as *const #typeident, self.vals.len()/std::mem::size_of::<#typeident>())}; let index = self.index; @@ -1217,6 +1224,7 @@ fn create_buf_ops( #[#am(AmGroup(false))] impl LamellarAM for #multi_val_multi_idx_am_buf_fetch_name{ //eventually we can return fetchs here too... async fn exec(&self) -> Vec<#typeident> { + // println!("in multi val multi idx fetch exec"); #slice let mut res = Vec::new(); match self.index_size{ @@ -1287,7 +1295,7 @@ fn create_buf_ops( #[#am(AmGroup(false))] impl LamellarAM for #single_val_multi_idx_am_buf_fetch_name{ //eventually we can return fetchs here too... async fn exec(&self) -> Vec<#typeident>{ - // println!("in single val multi idx exec"); + // println!("in single val multi idx fetch exec"); #slice let val = self.val; let mut res; @@ -1329,6 +1337,7 @@ fn create_buf_ops( } } } + // println!("done with exec"); res } } @@ -1367,6 +1376,7 @@ fn create_buf_ops( #[#am(AmGroup(false))] impl LamellarAM for #multi_val_single_idx_am_buf_fetch_name{ //eventually we can return fetchs here too... 
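A minimal sketch of the pattern the updated tests adopt: element-wise array operations now return handles that are explicitly `.spawn()`ed onto the work queue (or awaited / blocked), with results collected later via `block_on`. The `AtomicArray` type and the prelude import are assumptions based on the surrounding examples, not paths confirmed by this patch.

use lamellar::array::prelude::*;

fn fetch_add_all(array: &AtomicArray<usize>, add_val: usize) {
    let mut reqs = vec![];
    for i in 0..array.len() {
        // `.spawn()` launches the fetch_add immediately and returns a task handle
        reqs.push(array.fetch_add(i, add_val).spawn());
    }
    for (i, req) in reqs.drain(0..).enumerate() {
        // block on each spawned task to retrieve the fetched (previous) value
        println!("i: {:?} {:?}", i, array.block_on(req));
    }
    array.barrier();
}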
async fn exec(&self) -> Vec<#typeident> { + // println!("in multi val single idx fetch exec"); #slice let vals = unsafe {std::slice::from_raw_parts(self.vals.as_ptr() as *const #typeident, self.vals.len()/std::mem::size_of::<#typeident>())}; let index = self.index; diff --git a/run_examples.sh b/run_examples.sh index 2cefa297..4009a457 100755 --- a/run_examples.sh +++ b/run_examples.sh @@ -18,7 +18,7 @@ mkdir -p ${results_dir} ln -s ${output_dir}/rofiverbs_lamellae rofiverbs_lamellae -cargo build --release --features enable-rofi --features tokio-executor --examples -j 20 +cargo build --release --features enable-rofi --features tokio-executor --features runtime-warnings-panic --examples -j 20 cd rofiverbs_lamellae/${local_results_dir} diff --git a/src/active_messaging/handle.rs b/src/active_messaging/handle.rs index 1108751b..b26cd372 100644 --- a/src/active_messaging/handle.rs +++ b/src/active_messaging/handle.rs @@ -140,6 +140,7 @@ impl AmHandle { tg_counters.inc_launched(num_pes); } self.inner.scheduler.submit_am(am); + // println!("am spawned"); } } /// This method will spawn the associated Active Message on the work queue, @@ -149,7 +150,7 @@ impl AmHandle { #[must_use = "this function returns a future used to poll for completion. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] pub fn spawn(mut self) -> LamellarTask { self.launch_am_if_needed(); - self.inner.scheduler.clone().spawn_task(self) + self.inner.scheduler.clone().spawn_task(self, Vec::new()) //AM handles counters } /// This method will block the calling thread until the associated Array Operation completes pub fn block(mut self) -> T { @@ -172,6 +173,7 @@ impl LamellarRequest for AmHandle { fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { self.launch_am_if_needed(); let mut cur_waker = self.inner.waker.lock(); + if self.inner.ready.load(Ordering::SeqCst) { true } else { @@ -276,7 +278,7 @@ impl LocalAmHandle { #[must_use = "this function returns a future used to poll for completion. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] pub fn spawn(mut self) -> LamellarTask { self.launch_am_if_needed(); - self.inner.scheduler.clone().spawn_task(self) + self.inner.scheduler.clone().spawn_task(self, Vec::new()) //AM handles counters) } /// This method will block the calling thread until the associated Array Operation completes pub fn block(mut self) -> T { @@ -471,7 +473,7 @@ impl MultiAmHandle { #[must_use = "this function returns a future used to poll for completion. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. 
Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] pub fn spawn(mut self) -> LamellarTask> { self.launch_am_if_needed(); - self.inner.scheduler.clone().spawn_task(self) + self.inner.scheduler.clone().spawn_task(self, Vec::new()) //AM handles counters } /// This method will block the calling thread until the associated Array Operation completes pub fn block(mut self) -> Vec { diff --git a/src/array.rs b/src/array.rs index 68cb8f00..e58dcefc 100644 --- a/src/array.rs +++ b/src/array.rs @@ -190,31 +190,31 @@ pub struct ReduceKey { } crate::inventory::collect!(ReduceKey); -impl Dist for bool {} -lamellar_impl::generate_reductions_for_type_rt!(true, u8, usize, isize); -lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, usize, isize); +// impl Dist for bool {} +// lamellar_impl::generate_reductions_for_type_rt!(true, u8, usize); +// lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, usize); // lamellar_impl::generate_reductions_for_type_rt!(false, f32); // lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32); // lamellar_impl::generate_reductions_for_type_rt!(false, u128); // lamellar_impl::generate_ops_for_type_rt!(true, false, true, u128); -// //------------------------------------ +// // //------------------------------------ -// lamellar_impl::generate_reductions_for_type_rt!(true, u8, u16, u32, u64, usize); -// lamellar_impl::generate_reductions_for_type_rt!(false, u128); -// lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, u16, u32, u64, usize); -// lamellar_impl::generate_ops_for_type_rt!(true, false, true, u128); +lamellar_impl::generate_reductions_for_type_rt!(true, u8, u16, u32, u64, usize); +lamellar_impl::generate_reductions_for_type_rt!(false, u128); +lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, u16, u32, u64, usize); +lamellar_impl::generate_ops_for_type_rt!(true, false, true, u128); -// lamellar_impl::generate_reductions_for_type_rt!(true, i8, i16, i32, i64, isize); -// lamellar_impl::generate_reductions_for_type_rt!(false, i128); -// lamellar_impl::generate_ops_for_type_rt!(true, true, true, i8, i16, i32, i64, isize); -// lamellar_impl::generate_ops_for_type_rt!(true, false, true, i128); +lamellar_impl::generate_reductions_for_type_rt!(true, i8, i16, i32, i64, isize); +lamellar_impl::generate_reductions_for_type_rt!(false, i128); +lamellar_impl::generate_ops_for_type_rt!(true, true, true, i8, i16, i32, i64, isize); +lamellar_impl::generate_ops_for_type_rt!(true, false, true, i128); -// lamellar_impl::generate_reductions_for_type_rt!(false, f32, f64); -// lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32, f64); +lamellar_impl::generate_reductions_for_type_rt!(false, f32, f64); +lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32, f64); -// lamellar_impl::generate_ops_for_bool_rt!(); +lamellar_impl::generate_ops_for_bool_rt!(); impl Dist for Option {} impl ArrayOps for Option {} @@ -656,6 +656,29 @@ impl LamellarByteArray { LamellarByteArray::GlobalLockArray(array) => array.array.inner.data.team(), } } + pub(crate) fn dec_outstanding(&self, num: usize) { + match self { + LamellarByteArray::UnsafeArray(array) => { + array.inner.data.array_counters.dec_outstanding(num) + } + LamellarByteArray::ReadOnlyArray(array) => { + array.array.inner.data.array_counters.dec_outstanding(num) + } + LamellarByteArray::AtomicArray(array) => array.dec_outstanding(num), + LamellarByteArray::NativeAtomicArray(array) => { + 
array.array.inner.data.array_counters.dec_outstanding(num) + } + LamellarByteArray::GenericAtomicArray(array) => { + array.array.inner.data.array_counters.dec_outstanding(num) + } + LamellarByteArray::LocalLockArray(array) => { + array.array.inner.data.array_counters.dec_outstanding(num) + } + LamellarByteArray::GlobalLockArray(array) => { + array.array.inner.data.array_counters.dec_outstanding(num) + } + } + } } impl crate::active_messaging::DarcSerde for LamellarReadArray { diff --git a/src/array/atomic.rs b/src/array/atomic.rs index dd75b7d5..a3169e17 100644 --- a/src/array/atomic.rs +++ b/src/array/atomic.rs @@ -675,6 +675,17 @@ impl AtomicByteArray { AtomicByteArray::GenericAtomicByteArray(array) => array.array.inner.data.team(), } } + + pub(crate) fn dec_outstanding(&self, num: usize) { + match self { + AtomicByteArray::NativeAtomicByteArray(array) => { + array.array.inner.data.array_counters.dec_outstanding(num) + } + AtomicByteArray::GenericAtomicByteArray(array) => { + array.array.inner.data.array_counters.dec_outstanding(num) + } + } + } } impl crate::active_messaging::DarcSerde for AtomicByteArray { diff --git a/src/array/generic_atomic/rdma.rs b/src/array/generic_atomic/rdma.rs index 60d3ea08..20e89cbb 100644 --- a/src/array/generic_atomic/rdma.rs +++ b/src/array/generic_atomic/rdma.rs @@ -20,6 +20,7 @@ impl LamellarArrayInternalGet for GenericAtomicArray { ArrayRdmaHandle { array: self.as_lamellar_byte_array(), reqs: VecDeque::from([req.into()]), + spawned: false, } } unsafe fn internal_at(&self, index: usize) -> ArrayRdmaAtHandle { @@ -33,6 +34,7 @@ impl LamellarArrayInternalGet for GenericAtomicArray { array: self.as_lamellar_byte_array(), req: Some(req), buf: buf, + spawned: false, } } } @@ -48,6 +50,7 @@ impl LamellarArrayGet for GenericAtomicArray { Err(_) => ArrayRdmaHandle { array: self.as_lamellar_byte_array(), reqs: VecDeque::new(), + spawned: false, }, } } @@ -70,6 +73,7 @@ impl LamellarArrayInternalPut for GenericAtomicArray { ArrayRdmaHandle { array: self.as_lamellar_byte_array(), reqs: VecDeque::from([req.into()]), + spawned: false, } } } @@ -85,6 +89,7 @@ impl LamellarArrayPut for GenericAtomicArray { Err(_) => ArrayRdmaHandle { array: self.as_lamellar_byte_array(), reqs: VecDeque::new(), + spawned: false, }, } } diff --git a/src/array/global_lock_atomic.rs b/src/array/global_lock_atomic.rs index 1a2f5cb1..51beb48e 100644 --- a/src/array/global_lock_atomic.rs +++ b/src/array/global_lock_atomic.rs @@ -1012,6 +1012,7 @@ impl ArrayPrint for GlobalLockArray { } //#[doc(hidden)] +// Dropped Handle Warning triggered by AmHandle #[pin_project] pub struct GlobalLockArrayReduceHandle { req: AmHandle>, diff --git a/src/array/global_lock_atomic/handle.rs b/src/array/global_lock_atomic/handle.rs index 823ff0bd..4beec56a 100644 --- a/src/array/global_lock_atomic/handle.rs +++ b/src/array/global_lock_atomic/handle.rs @@ -18,7 +18,7 @@ use super::{ }; #[must_use = "GlobalLockArray lock handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] -#[pin_project] +#[pin_project] //unused drop warning triggered by GlobalRwDarcReadHandle /// Handle used to retrieve the aquired read lock of a GlobalLockArray /// /// This handle must be awaited or blocked on to acquire the lock @@ -111,7 +111,7 @@ impl Future for GlobalLockReadHandle { } #[must_use = "GlobalLockArray lock handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] -#[pin_project] +#[pin_project] //unused drop warning triggered by GlobalRwDarcReadHandle /// Handle used to 
retrieve the aquired local data [GlobalLockLocalData] of a GlobalLockArray /// /// This handle must be awaited or blocked on to acquire the lock @@ -184,7 +184,7 @@ impl GlobalLockLocalDataHandle { /// println!("local data: {:?}",local_data); ///``` #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] - pub fn spawn(mut self) -> LamellarTask> { + pub fn spawn(self) -> LamellarTask> { self.array.lock.darc.team().spawn(self) } } @@ -207,7 +207,7 @@ impl Future for GlobalLockLocalDataHandle { } #[must_use = "GlobalLockArray lock handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] -#[pin_project] +#[pin_project] //unused drop warning triggered by GlobalRwDarcWriteHandle /// Handle used to retrieve the aquired write lock of a GlobalLockArray /// /// This handle must be awaited or blocked on to acquire the lock @@ -280,7 +280,7 @@ impl GlobalLockWriteHandle { /// let guard = task.block(); ///``` #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] - pub fn spawn(mut self) -> LamellarTask> { + pub fn spawn(self) -> LamellarTask> { self.array.lock.darc.team().spawn(self) } } @@ -298,9 +298,8 @@ impl Future for GlobalLockWriteHandle { } } } - #[must_use = "GlobalLockArray lock handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] -#[pin_project] +#[pin_project] //unused drop warning triggered by GlobalRwDarcWriteHandle /// Handle used to retrieve the aquired mutable local data [GlobalLockMutLocalData] of a GlobalLockArray /// /// This handle must be awaited or blocked on to acquire the lock @@ -374,7 +373,7 @@ impl GlobalLockMutLocalDataHandle { /// local_data.iter_mut().for_each(|elem| *elem += my_pe); ///``` #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] - pub fn spawn(mut self) -> LamellarTask> { + pub fn spawn(self) -> LamellarTask> { self.array.lock.darc.team().spawn(self) } } @@ -397,7 +396,7 @@ impl Future for GlobalLockMutLocalDataHandle { } #[must_use = "GlobalLockArray lock handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] -#[pin_project] +#[pin_project] //unused drop warning triggered by GlobalRwDarcCollectiveWriteHandle /// Handle used to retrieve the aquired mutable local data [GlobalLockMutLocalData] of a GlobalLockArray with all PEs collectively accessing their local data /// /// This handle must be awaited or blocked on to acquire the lock @@ -472,7 +471,7 @@ impl GlobalLockCollectiveMutLocalDataHandle { /// local_data.iter_mut().for_each(|elem| *elem += my_pe); ///``` #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. 
Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] - pub fn spawn(mut self) -> LamellarTask> { + pub fn spawn(self) -> LamellarTask> { self.array.lock.darc.team().spawn(self) } } diff --git a/src/array/global_lock_atomic/rdma.rs b/src/array/global_lock_atomic/rdma.rs index dfa53282..c38adfa3 100644 --- a/src/array/global_lock_atomic/rdma.rs +++ b/src/array/global_lock_atomic/rdma.rs @@ -30,6 +30,7 @@ impl LamellarArrayInternalGet for GlobalLockArray { ArrayRdmaHandle { array: self.as_lamellar_byte_array(), reqs: VecDeque::from([req.into()]), + spawned: false, } } unsafe fn internal_at(&self, index: usize) -> ArrayRdmaAtHandle { @@ -43,6 +44,7 @@ impl LamellarArrayInternalGet for GlobalLockArray { array: self.as_lamellar_byte_array(), req: Some(req), buf: buf, + spawned: false, } } } @@ -58,6 +60,7 @@ impl LamellarArrayGet for GlobalLockArray { Err(_) => ArrayRdmaHandle { array: self.as_lamellar_byte_array(), reqs: VecDeque::new(), + spawned: false, }, } } @@ -80,6 +83,7 @@ impl LamellarArrayInternalPut for GlobalLockArray { ArrayRdmaHandle { array: self.as_lamellar_byte_array(), reqs: VecDeque::from([req.into()]), + spawned: false, } } } @@ -95,6 +99,7 @@ impl LamellarArrayPut for GlobalLockArray { Err(_) => ArrayRdmaHandle { array: self.as_lamellar_byte_array(), reqs: VecDeque::new(), + spawned: false, }, } } diff --git a/src/array/handle.rs b/src/array/handle.rs index e0578df3..c80eb0af 100644 --- a/src/array/handle.rs +++ b/src/array/handle.rs @@ -5,16 +5,33 @@ use std::{ task::{Context, Poll, Waker}, }; -use pin_project::pin_project; +use pin_project::{pin_project, pinned_drop}; use crate::{ - active_messaging::{AmHandle, LocalAmHandle}, array::LamellarByteArray, lamellar_request::LamellarRequest, scheduler::LamellarTask, warnings::RuntimeWarning, Dist, OneSidedMemoryRegion, RegisteredMemoryRegion + active_messaging::{AmHandle, LocalAmHandle}, + array::LamellarByteArray, + lamellar_request::LamellarRequest, + scheduler::LamellarTask, + warnings::RuntimeWarning, + Dist, OneSidedMemoryRegion, RegisteredMemoryRegion, }; /// a task handle for an array rdma (put/get) operation pub struct ArrayRdmaHandle { pub(crate) array: LamellarByteArray, //prevents prematurely performing a local drop pub(crate) reqs: VecDeque>, + pub(crate) spawned: bool, +} + +impl Drop for ArrayRdmaHandle { + fn drop(&mut self) { + if !self.spawned { + RuntimeWarning::disable_warnings(); + for _ in self.reqs.drain(0..) {} + RuntimeWarning::enable_warnings(); + RuntimeWarning::DroppedHandle("an ArrayRdmaHandle").print(); + } + } } impl ArrayRdmaHandle { @@ -40,6 +57,7 @@ impl ArrayRdmaHandle { impl LamellarRequest for ArrayRdmaHandle { fn blocking_wait(mut self) -> Self::Output { + self.spawned = true; for req in self.reqs.drain(0..) 
{ req.blocking_wait(); } @@ -49,6 +67,7 @@ impl LamellarRequest for ArrayRdmaHandle { for req in self.reqs.iter_mut() { ready &= req.ready_or_set_waker(waker); } + self.spawned = true; ready } fn val(&self) -> Self::Output { @@ -61,6 +80,12 @@ impl LamellarRequest for ArrayRdmaHandle { impl Future for ArrayRdmaHandle { type Output = (); fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + if !self.spawned { + for req in self.reqs.iter_mut() { + req.ready_or_set_waker(cx.waker()); + } + self.spawned = true; + } while let Some(mut req) = self.reqs.pop_front() { if !req.ready_or_set_waker(cx.waker()) { self.reqs.push_front(req); @@ -72,11 +97,24 @@ impl Future for ArrayRdmaHandle { } /// a task handle for an array rdma 'at' operation -#[pin_project] +#[pin_project(PinnedDrop)] pub struct ArrayRdmaAtHandle { pub(crate) array: LamellarByteArray, //prevents prematurely performing a local drop pub(crate) req: Option>, pub(crate) buf: OneSidedMemoryRegion, + pub(crate) spawned: bool, +} + +#[pinned_drop] +impl PinnedDrop for ArrayRdmaAtHandle { + fn drop(mut self: Pin<&mut Self>) { + if !self.spawned { + RuntimeWarning::disable_warnings(); + let _ = self.req.take(); + RuntimeWarning::enable_warnings(); + RuntimeWarning::DroppedHandle("an ArrayRdmaAtHandle").print(); + } + } } impl ArrayRdmaAtHandle { @@ -85,12 +123,14 @@ impl ArrayRdmaAtHandle { /// /// This function returns a handle that can be used to wait for the operation to complete #[must_use = "this function returns a future used to poll for completion and retrieve the result. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] - pub fn spawn(self) -> LamellarTask { + pub fn spawn(mut self) -> LamellarTask { + self.spawned = true; self.array.team().spawn(self) } /// This method will block the calling thread until the associated Array RDMA at Operation completes - pub fn block(self) -> T { + pub fn block(mut self) -> T { + self.spawned = true; RuntimeWarning::BlockingCall( "ArrayRdmaAtHandle::block", ".spawn() or .await", @@ -101,14 +141,19 @@ impl ArrayRdmaAtHandle { } impl LamellarRequest for ArrayRdmaAtHandle { - fn blocking_wait(self) -> Self::Output { - match self.req { - Some(req) => req.blocking_wait(), - None => {} //this means we did a blocking_get (With respect to RDMA) on either Unsafe or ReadOnlyArray so data is here + fn blocking_wait(mut self) -> Self::Output { + self.spawned = true; + if let Some(req) = self.req.take() { + req.blocking_wait(); } + // match self.req { + // Some(req) => req.blocking_wait(), + // None => {} //this means we did a blocking_get (With respect to RDMA) on either Unsafe or ReadOnlyArray so data is here + // } unsafe { self.buf.as_slice().expect("Data should exist on PE")[0] } } fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { + self.spawned = true; if let Some(req) = &mut self.req { req.ready_or_set_waker(waker) } else { diff --git a/src/array/iterator/distributed_iterator/consumer/collect.rs b/src/array/iterator/distributed_iterator/consumer/collect.rs index 64ce3f02..1a6abdf2 100644 --- a/src/array/iterator/distributed_iterator/consumer/collect.rs +++ b/src/array/iterator/distributed_iterator/consumer/collect.rs @@ -15,11 +15,11 @@ use crate::warnings::RuntimeWarning; use core::marker::PhantomData; use futures_util::{ready, Future}; -use pin_project::pin_project; +use 
pin_project::{pin_project, pinned_drop}; use std::collections::VecDeque; use std::pin::Pin; use std::sync::Arc; -use std::task::{Context, Poll, Waker}; +use std::task::{Context, Poll}; #[derive(Clone, Debug)] pub(crate) struct Collect { @@ -74,6 +74,7 @@ where distribution: self.distribution, team, state: InnerState::ReqsPending(Vec::new()), + spawned: false, } } fn max_elems(&self, in_elems: usize) -> usize { @@ -135,6 +136,7 @@ where distribution: self.distribution, team, state: InnerState::ReqsPending(Vec::new()), + spawned: false, } } fn max_elems(&self, in_elems: usize) -> usize { @@ -163,6 +165,7 @@ pub(crate) struct InnerDistIterCollectHandle { pub(crate) reqs: VecDeque>>, pub(crate) distribution: Distribution, pub(crate) team: Pin>, + spawned: bool, state: InnerState, } @@ -195,7 +198,13 @@ impl, Distribution)> + SyncSend + ' for InnerDistIterCollectHandle { type Output = A; - fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + if !self.spawned { + for req in self.reqs.iter_mut() { + req.ready_or_set_waker(cx.waker()); + } + self.spawned = true; + } let mut this = self.project(); match &mut this.state { InnerState::ReqsPending(ref mut vals) => { @@ -234,49 +243,27 @@ impl, Distribution)> + SyncSend + ' } } -impl, Distribution)> + SyncSend + 'static> - LamellarRequest for InnerDistIterCollectHandle -{ - fn blocking_wait(mut self) -> Self::Output { - // let mut num_local_vals = 0; - let mut temp_vals = vec![]; - for req in self.reqs.drain(0..) { - let v = req.blocking_wait(); - temp_vals.extend(v); - } - temp_vals.sort_by(|a, b| a.0.cmp(&b.0)); - let local_vals = temp_vals.into_iter().map(|v| v.1).collect(); - self.create_array(local_vals) - } - fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { - for req in self.reqs.iter_mut() { - if !req.ready_or_set_waker(waker) { - //only need to wait on the next unready req - return false; - } - } - true - } - fn val(&self) -> Self::Output { - // let mut num_local_vals = 0; - let mut temp_vals = vec![]; - for req in self.reqs.iter() { - let v = req.val(); - temp_vals.extend(v); - } - temp_vals.sort_by(|a, b| a.0.cmp(&b.0)); - let local_vals = temp_vals.into_iter().map(|v| v.1).collect(); - self.create_array(local_vals) - } -} - -#[pin_project] +#[pin_project(PinnedDrop)] pub struct DistIterCollectHandle { - team: Pin>, + array: UnsafeArrayInner, + launched: bool, #[pin] state: State, } +#[pinned_drop] +impl PinnedDrop for DistIterCollectHandle { + fn drop(self: Pin<&mut Self>) { + if !self.launched { + let mut this = self.project(); + RuntimeWarning::disable_warnings(); + *this.state = State::Dropped; + RuntimeWarning::enable_warnings(); + RuntimeWarning::DroppedHandle("a DistIterCollectHandle").print(); + } + } +} + impl DistIterCollectHandle where T: Dist + ArrayOps, @@ -288,19 +275,21 @@ where array: &UnsafeArrayInner, ) -> Self { Self { - team: array.data.team.clone(), + array: array.clone(), + launched: false, state: State::Barrier(barrier_handle, inner), } } /// This method will block until the associated Collect operation completes and returns the result - pub fn block(self) -> A { + pub fn block(mut self) -> A { + self.launched = true; RuntimeWarning::BlockingCall( "DistIterCollectHandle::block", ".spawn() or .await", ) .print(); - self.team.clone().block_on(self) + self.array.clone().block_on(self) } /// This method will spawn the associated Collect Operation on the work queue, @@ -308,8 +297,9 @@ where /// /// This function returns a handle that can 
be used to wait for the operation to complete #[must_use = "this function returns a future used to poll for completion and retrieve the result. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] - pub fn spawn(self) -> LamellarTask { - self.team.clone().scheduler.spawn_task(self) + pub fn spawn(mut self) -> LamellarTask { + self.launched = true; + self.array.clone().spawn(self) } } @@ -320,6 +310,7 @@ enum State { Pin> + Send>>, ), Reqs(#[pin] InnerDistIterCollectHandle), + Dropped, } impl Future for DistIterCollectHandle where @@ -327,7 +318,8 @@ where A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + 'static, { type Output = A; - fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + self.launched = true; let mut this = self.project(); match this.state.as_mut().project() { StateProj::Barrier(barrier, inner) => { @@ -345,43 +337,7 @@ where let val = ready!(inner.poll(cx)); Poll::Ready(val) } - } - } -} - -//#[doc(hidden)] -impl LamellarRequest for DistIterCollectHandle -where - T: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + 'static, -{ - fn blocking_wait(self) -> Self::Output { - match self.state { - State::Barrier(barrier, reqs) => { - barrier.blocking_wait(); - self.team.block_on(reqs).blocking_wait() - } - State::Reqs(inner) => inner.blocking_wait(), - } - } - fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { - match &mut self.state { - State::Barrier(barrier, _) => { - if !barrier.ready_or_set_waker(waker) { - return false; - } - waker.wake_by_ref(); - false - } - State::Reqs(inner) => inner.ready_or_set_waker(waker), - } - } - fn val(&self) -> Self::Output { - match &self.state { - State::Barrier(_barrier, _reqs) => { - unreachable!("should never be in barrier state when val is called"); - } - State::Reqs(inner) => inner.val(), + StateProj::Dropped => panic!("called `Future::poll()` on a future that was dropped"), } } } diff --git a/src/array/iterator/distributed_iterator/consumer/count.rs b/src/array/iterator/distributed_iterator/consumer/count.rs index 6588000c..ca826655 100644 --- a/src/array/iterator/distributed_iterator/consumer/count.rs +++ b/src/array/iterator/distributed_iterator/consumer/count.rs @@ -13,16 +13,15 @@ use crate::scheduler::LamellarTask; use crate::warnings::RuntimeWarning; use crate::Darc; -use async_trait::async_trait; use futures_util::{ready, Future}; -use pin_project::pin_project; +use pin_project::{pin_project, pinned_drop}; use std::collections::VecDeque; use std::pin::Pin; use std::sync::{ atomic::{AtomicUsize, Ordering}, Arc, }; -use std::task::{Context, Poll, Waker}; +use std::task::{Context, Poll}; #[derive(Clone, Debug)] pub(crate) struct Count { @@ -68,6 +67,7 @@ where reqs, team, state: InnerState::ReqsPending(0), + spawned: false, } } fn max_elems(&self, in_elems: usize) -> usize { @@ -81,6 +81,7 @@ pub(crate) struct InnerDistIterCountHandle { pub(crate) reqs: VecDeque>, team: Pin>, state: InnerState, + spawned: bool, } enum InnerState { @@ -131,7 +132,13 @@ impl InnerDistIterCountHandle { impl Future for InnerDistIterCountHandle { type Output = usize; - fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + if !self.spawned { + for req in self.reqs.iter_mut() { + 
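A minimal, self-contained sketch of the dropped-handle warning pattern introduced for `ArrayRdmaHandle`, `ArrayRdmaAtHandle`, and the distributed-iterator handles: each handle tracks whether it was ever spawned/launched, and its drop path emits a warning if it was not. The struct and the `eprintln!` stand in for the crate-internal `RuntimeWarning` machinery; all names here are placeholders.

struct OpHandle {
    spawned: bool, // set by spawn()/block()/ready_or_set_waker()
}

impl Drop for OpHandle {
    fn drop(&mut self) {
        if !self.spawned {
            // the real handles suppress nested warnings, drain their inner
            // requests, then print RuntimeWarning::DroppedHandle(...)
            eprintln!("warning: an OpHandle was dropped without being spawned, awaited, or blocked on");
        }
    }
}

fn main() {
    let _ = OpHandle { spawned: false }; // dropped immediately: triggers the warning path
}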
req.ready_or_set_waker(cx.waker()); + } + self.spawned = true; + } let mut this = self.project(); match &mut this.state { InnerState::ReqsPending(cnt) => { @@ -162,49 +169,27 @@ impl Future for InnerDistIterCountHandle { } } -//#[doc(hidden)] -#[async_trait] -impl LamellarRequest for InnerDistIterCountHandle { - fn blocking_wait(mut self) -> Self::Output { - self.team.tasking_barrier(); - let cnt = Darc::new(&self.team, AtomicUsize::new(0)).unwrap(); - let count = self - .reqs - .drain(..) - .map(|req| req.blocking_wait()) - .into_iter() - .sum::(); - self.reduce_remote_counts(count, cnt) - } - fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { - for req in self.reqs.iter_mut() { - if !req.ready_or_set_waker(waker) { - //only need to wait on the next unready req - return false; - } - } - true - } - fn val(&self) -> Self::Output { - self.team.tasking_barrier(); - let cnt = Darc::new(&self.team, AtomicUsize::new(0)).unwrap(); - let count = self - .reqs - .iter() - .map(|req| req.val()) - .into_iter() - .sum::(); - self.reduce_remote_counts(count, cnt) - } -} - -#[pin_project] +#[pin_project(PinnedDrop)] pub struct DistIterCountHandle { - team: Pin>, + array: UnsafeArrayInner, + launched: bool, #[pin] state: State, } +#[pinned_drop] +impl PinnedDrop for DistIterCountHandle { + fn drop(self: Pin<&mut Self>) { + if !self.launched { + let mut this = self.project(); + RuntimeWarning::disable_warnings(); + *this.state = State::Dropped; + RuntimeWarning::enable_warnings(); + RuntimeWarning::DroppedHandle("a DistIterCountHandle").print(); + } + } +} + impl DistIterCountHandle { pub(crate) fn new( barrier_handle: BarrierHandle, @@ -212,19 +197,21 @@ impl DistIterCountHandle { array: &UnsafeArrayInner, ) -> Self { Self { - team: array.data.team.clone(), + array: array.clone(), + launched: false, state: State::Barrier(barrier_handle, inner), } } /// This method will block until the associated Count operation completes and returns the result - pub fn block(self) -> usize { + pub fn block(mut self) -> usize { + self.launched = true; RuntimeWarning::BlockingCall( "DistIterCountHandle::block", ".spawn() or .await", ) .print(); - self.team.clone().block_on(self) + self.array.clone().block_on(self) } /// This method will spawn the associated Count Operation on the work queue, @@ -232,8 +219,9 @@ impl DistIterCountHandle { /// /// This function returns a handle that can be used to wait for the operation to complete #[must_use = "this function returns a future used to poll for completion and retrieve the result. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. 
Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] - pub fn spawn(self) -> LamellarTask { - self.team.clone().scheduler.spawn_task(self) + pub fn spawn(mut self) -> LamellarTask { + self.launched = true; + self.array.clone().spawn(self) } } @@ -244,10 +232,12 @@ enum State { Pin + Send>>, ), Reqs(#[pin] InnerDistIterCountHandle), + Dropped, } impl Future for DistIterCountHandle { type Output = usize; - fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + self.launched = true; let mut this = self.project(); match this.state.as_mut().project() { StateProj::Barrier(barrier, inner) => { @@ -265,39 +255,7 @@ impl Future for DistIterCountHandle { let val = ready!(inner.poll(cx)); Poll::Ready(val) } - } - } -} - -//#[doc(hidden)] -impl LamellarRequest for DistIterCountHandle { - fn blocking_wait(self) -> Self::Output { - match self.state { - State::Barrier(barrier, reqs) => { - barrier.blocking_wait(); - self.team.block_on(reqs).blocking_wait() - } - State::Reqs(inner) => inner.blocking_wait(), - } - } - fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { - match &mut self.state { - State::Barrier(barrier, _) => { - if !barrier.ready_or_set_waker(waker) { - return false; - } - waker.wake_by_ref(); - false - } - State::Reqs(inner) => inner.ready_or_set_waker(waker), - } - } - fn val(&self) -> Self::Output { - match &self.state { - State::Barrier(_barrier, _reqs) => { - unreachable!("should never be in barrier state when val is called"); - } - State::Reqs(inner) => inner.val(), + StateProj::Dropped => panic!("should never be in dropped state"), } } } diff --git a/src/array/iterator/distributed_iterator/consumer/for_each.rs b/src/array/iterator/distributed_iterator/consumer/for_each.rs index 08df2314..750100ee 100644 --- a/src/array/iterator/distributed_iterator/consumer/for_each.rs +++ b/src/array/iterator/distributed_iterator/consumer/for_each.rs @@ -11,11 +11,11 @@ use crate::scheduler::LamellarTask; use crate::warnings::RuntimeWarning; use futures_util::{ready, Future}; -use pin_project::pin_project; +use pin_project::{pin_project, pinned_drop}; use std::collections::VecDeque; use std::pin::Pin; use std::sync::Arc; -use std::task::{Context, Poll, Waker}; +use std::task::{Context, Poll}; #[derive(Clone, Debug)] pub(crate) struct ForEach @@ -70,7 +70,10 @@ where _team: Pin>, reqs: VecDeque>, ) -> Self::Handle { - InnerDistIterForEachHandle { reqs } + InnerDistIterForEachHandle { + reqs, + spawned: false, + } } fn max_elems(&self, in_elems: usize) -> usize { self.iter.elems(in_elems) @@ -135,7 +138,10 @@ where _team: Pin>, reqs: VecDeque>, ) -> Self::Handle { - InnerDistIterForEachHandle { reqs } + InnerDistIterForEachHandle { + reqs, + spawned: false, + } } fn max_elems(&self, in_elems: usize) -> usize { self.iter.elems(in_elems) @@ -158,11 +164,18 @@ where pub(crate) struct InnerDistIterForEachHandle { pub(crate) reqs: VecDeque>, + spawned: bool, } impl Future for InnerDistIterForEachHandle { type Output = (); fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + if !self.spawned { + for req in self.reqs.iter_mut() { + req.ready_or_set_waker(cx.waker()); + } + self.spawned = true; + } while let Some(mut req) = self.reqs.pop_front() { if !req.ready_or_set_waker(cx.waker()) { self.reqs.push_front(req); @@ -174,37 +187,27 @@ impl Future for InnerDistIterForEachHandle { } //#[doc(hidden)] -impl LamellarRequest for InnerDistIterForEachHandle { - fn blocking_wait(mut self) -> 
Self::Output { - for req in self.reqs.drain(..) { - req.blocking_wait(); - } - } - fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { - for req in self.reqs.iter_mut() { - if !req.ready_or_set_waker(waker) { - //only need to wait on the next unready req - return false; - } - } - true - } - fn val(&self) -> Self::Output { - for req in self.reqs.iter() { - req.val(); - } - } -} - -//#[doc(hidden)] -#[pin_project] +#[pin_project(PinnedDrop)] pub struct DistIterForEachHandle { - // pub(crate) reqs: VecDeque>, - team: Pin>, + array: UnsafeArrayInner, + launched: bool, #[pin] state: State, } +#[pinned_drop] +impl PinnedDrop for DistIterForEachHandle { + fn drop(self: Pin<&mut Self>) { + if !self.launched { + let mut this = self.project(); + RuntimeWarning::disable_warnings(); + *this.state = State::Dropped; + RuntimeWarning::enable_warnings(); + RuntimeWarning::DroppedHandle("a DistIterForEachHandle").print(); + } + } +} + impl DistIterForEachHandle { pub(crate) fn new( barrier: BarrierHandle, @@ -212,27 +215,30 @@ impl DistIterForEachHandle { array: &UnsafeArrayInner, ) -> Self { DistIterForEachHandle { - team: array.data.team.clone(), + array: array.clone(), + launched: false, state: State::Barrier(barrier, reqs), } } /// This method will block until the associated For Each operation completes and returns the result - pub fn block(self) { + pub fn block(mut self) { + self.launched = true; RuntimeWarning::BlockingCall( "DistIterForEachHandle::block", ".spawn() or .await", ) .print(); - self.team.clone().block_on(self); + self.array.clone().block_on(self); } /// This method will spawn the associated For Each Operation on the work queue, /// initiating the remote operation. /// /// This function returns a handle that can be used to wait for the operation to complete #[must_use = "this function returns a future used to poll for completion and retrieve the result. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. 
Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] - pub fn spawn(self) -> LamellarTask<()> { - self.team.clone().scheduler.spawn_task(self) + pub fn spawn(mut self) -> LamellarTask<()> { + self.launched = true; + self.array.clone().spawn(self) } } @@ -243,11 +249,13 @@ enum State { Pin + Send>>, ), Reqs(#[pin] InnerDistIterForEachHandle, usize), + Dropped, } impl Future for DistIterForEachHandle { type Output = (); - fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + self.launched = true; let mut this = self.project(); match this.state.as_mut().project() { StateProj::Barrier(barrier, inner) => { @@ -288,41 +296,9 @@ impl Future for DistIterForEachHandle { Poll::Pending => Poll::Pending, } } - } - } -} - -//#[doc(hidden)] -impl LamellarRequest for DistIterForEachHandle { - fn blocking_wait(self) -> Self::Output { - match self.state { - State::Barrier(barrier, reqs) => { - barrier.blocking_wait(); - self.team.block_on(reqs).blocking_wait(); - } - State::Reqs(inner, _) => { - inner.blocking_wait(); - } - } - } - fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { - match &mut self.state { - State::Barrier(barrier, _) => { - if !barrier.ready_or_set_waker(waker) { - return false; - } - waker.wake_by_ref(); - false - } - State::Reqs(inner, _) => inner.ready_or_set_waker(waker), - } - } - fn val(&self) -> Self::Output { - match &self.state { - State::Barrier(_barrier, _reqs) => { - unreachable!("should never be in barrier state when val is called"); + StateProj::Dropped => { + panic!("called `Future::poll` on a dropped `DistIterForEachHandle`") } - State::Reqs(inner, _) => inner.val(), } } } diff --git a/src/array/iterator/distributed_iterator/consumer/reduce.rs b/src/array/iterator/distributed_iterator/consumer/reduce.rs index fb06f9e6..5cbac26b 100644 --- a/src/array/iterator/distributed_iterator/consumer/reduce.rs +++ b/src/array/iterator/distributed_iterator/consumer/reduce.rs @@ -14,11 +14,11 @@ use crate::warnings::RuntimeWarning; use crate::Dist; use futures_util::{ready, Future, StreamExt}; -use pin_project::pin_project; +use pin_project::{pin_project, pinned_drop}; use std::collections::VecDeque; use std::pin::Pin; use std::sync::Arc; -use std::task::{Context, Poll, Waker}; +use std::task::{Context, Poll}; #[derive(Clone, Debug)] pub(crate) struct Reduce { @@ -71,100 +71,13 @@ where reqs, team, state: InnerState::ReqsPending(None), + spawned: false, } } fn max_elems(&self, in_elems: usize) -> usize { self.iter.elems(in_elems) } } - -// #[derive(Debug)] -// pub(crate) struct ReduceAsync -// where -// I: DistributedIterator + 'static, -// I::Item: Future + Send + 'static, -// T: Dist + ArrayOps, -// F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, -// { -// pub(crate) iter: I, -// pub(crate) op: F, -// // pub(crate) _phantom: PhantomData, -// } - -// impl IterClone for ReduceAsync -// where -// I: DistributedIterator + 'static, -// I::Item: Future + SyncSend + 'static, -// T: Dist + Send + ArrayOps, -// F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, -// { -// fn iter_clone(&self, _: Sealed) -> Self { -// ReduceAsync { -// iter: self.iter.iter_clone(Sealed), -// op: self.op.clone(), -// } -// } -// } - -// impl IterConsumer for ReduceAsync -// where -// I: DistributedIterator + 'static, -// I::Item: Future + Send + 'static, -// T: Dist + Send + ArrayOps, -// F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, -// { -// type 
AmOutput = Option; -// type Output = Option; -// type Item = I::Item; -// type Handle = InnerDistIterReduceHandle; -// fn init(&self, start: usize, cnt: usize) -> Self { -// ReduceAsync { -// iter: self.iter.init(start, cnt), -// op: self.op.clone(), -// } -// } -// fn next(&mut self) -> Option { -// self.iter.next() -// } -// fn into_am(&self, schedule: IterSchedule) -> LamellarArcLocalAm { -// Arc::new(ReduceAsyncAm { -// iter: self.iter_clone(Sealed), -// op: self.op.clone(), -// schedule, -// }) -// } -// fn create_handle( -// self, -// team: Pin>, -// reqs: VecDeque>, -// ) -> Self::Handle { -// InnerDistIterReduceHandle { -// op: self.op, -// reqs, -// team, -// state: InnerState::ReqsPending(None), -// } -// } -// fn max_elems(&self, in_elems: usize) -> usize { -// self.iter.elems(in_elems) -// } -// } - -// impl Clone for ReduceAsync -// where -// I: DistributedIterator + Clone, -// I::Item: Future + Send + 'static, -// T: Dist + Send + ArrayOps, -// F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, -// { -// fn clone(&self) -> Self { -// ReduceAsync { -// iter: self.iter.clone(), -// op: self.op.clone(), -// } -// } -// } - //#[doc(hidden)] #[pin_project] pub(crate) struct InnerDistIterReduceHandle { @@ -172,6 +85,7 @@ pub(crate) struct InnerDistIterReduceHandle { pub(crate) op: F, pub(crate) team: Pin>, state: InnerState, + spawned: bool, } enum InnerState { @@ -230,7 +144,13 @@ where F: Fn(T, T) -> T + SyncSend + Clone + 'static, { type Output = Option; - fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + if !self.spawned { + for req in self.reqs.iter_mut() { + req.ready_or_set_waker(cx.waker()); + } + self.spawned = true; + } let mut this = self.project(); match &mut this.state { InnerState::ReqsPending(mut val) => { @@ -273,55 +193,27 @@ where } } -//#[doc(hidden)] -impl LamellarRequest for InnerDistIterReduceHandle -where - T: Dist + ArrayOps, - F: Fn(T, T) -> T + SyncSend + Clone + 'static, -{ - fn blocking_wait(mut self) -> Self::Output { - let local_val = self - .reqs - .drain(..) 
- .filter_map(|req| req.blocking_wait()) - .reduce(self.op.clone()); - if let Some(val) = local_val { - self.reduce_remote_vals(val) - } else { - None - } - } - fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { - for req in self.reqs.iter_mut() { - if !req.ready_or_set_waker(waker) { - //only need to wait on the next unready req - return false; - } - } - true - } - - fn val(&self) -> Self::Output { - let local_val = self - .reqs - .iter() - .filter_map(|req| req.val()) - .reduce(self.op.clone()); - if let Some(val) = local_val { - self.reduce_remote_vals(val) - } else { - None - } - } -} - -#[pin_project] +#[pin_project(PinnedDrop)] pub struct DistIterReduceHandle { - team: Pin>, + array: UnsafeArrayInner, + launched: bool, #[pin] state: State, } +#[pinned_drop] +impl PinnedDrop for DistIterReduceHandle { + fn drop(self: Pin<&mut Self>) { + if !self.launched { + let mut this = self.project(); + RuntimeWarning::disable_warnings(); + *this.state = State::Dropped; + RuntimeWarning::enable_warnings(); + RuntimeWarning::DroppedHandle("a DistIterReduceHandle").print(); + } + } +} + impl DistIterReduceHandle where T: Dist + ArrayOps, @@ -333,19 +225,21 @@ where array: &UnsafeArrayInner, ) -> Self { Self { - team: array.data.team.clone(), + array: array.clone(), + launched: false, state: State::Barrier(barrier, reqs), } } /// This method will block until the associated Reduce operation completes and returns the result - pub fn block(self) -> Option { + pub fn block(mut self) -> Option { + self.launched = true; RuntimeWarning::BlockingCall( "DistIterReduceHandle::block", ".spawn() or .await", ) .print(); - self.team.clone().block_on(self) + self.array.clone().block_on(self) } /// This method will spawn the associated Reduce Operation on the work queue, @@ -353,8 +247,9 @@ where /// /// This function returns a handle that can be used to wait for the operation to complete #[must_use = "this function returns a future used to poll for completion and retrieve the result. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. 
Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] - pub fn spawn(self) -> LamellarTask> { - self.team.clone().scheduler.spawn_task(self) + pub fn spawn(mut self) -> LamellarTask> { + self.launched = true; + self.array.clone().spawn(self) } } @@ -365,6 +260,7 @@ enum State { Pin> + Send>>, ), Reqs(#[pin] InnerDistIterReduceHandle), + Dropped, } impl Future for DistIterReduceHandle where @@ -372,7 +268,8 @@ where F: Fn(T, T) -> T + SyncSend + Clone + 'static, { type Output = Option; - fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + self.launched = true; let mut this = self.project(); match this.state.as_mut().project() { StateProj::Barrier(barrier, inner) => { @@ -390,43 +287,7 @@ where let val = ready!(inner.poll(cx)); Poll::Ready(val) } - } - } -} - -//#[doc(hidden)] -impl LamellarRequest for DistIterReduceHandle -where - T: Dist + ArrayOps, - F: Fn(T, T) -> T + SyncSend + Clone + 'static, -{ - fn blocking_wait(self) -> Self::Output { - match self.state { - State::Barrier(barrier, reqs) => { - barrier.blocking_wait(); - self.team.block_on(reqs).blocking_wait() - } - State::Reqs(inner) => inner.blocking_wait(), - } - } - fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { - match &mut self.state { - State::Barrier(barrier, _) => { - if !barrier.ready_or_set_waker(waker) { - return false; - } - waker.wake_by_ref(); - false - } - State::Reqs(inner) => inner.ready_or_set_waker(waker), - } - } - fn val(&self) -> Self::Output { - match &self.state { - State::Barrier(_barrier, _reqs) => { - unreachable!("should never be in barrier state when val is called"); - } - State::Reqs(inner) => inner.val(), + StateProj::Dropped => panic!("called `Future::poll()` on a dropped future."), } } } @@ -468,54 +329,3 @@ where } } } - -// #[lamellar_impl::AmLocalDataRT(Clone)] -// pub(crate) struct ReduceAsyncAm -// where -// I: DistributedIterator + 'static, -// I::Item: Future + Send + 'static, -// T: Dist + ArrayOps, -// F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, -// { -// pub(crate) op: F, -// pub(crate) iter: ReduceAsync, -// pub(crate) schedule: IterSchedule, -// } - -// impl IterClone for ReduceAsyncAm -// where -// I: DistributedIterator + 'static, -// I::Item: Future + Send + 'static, -// T: Dist + ArrayOps, -// F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, -// { -// fn iter_clone(&self, _: Sealed) -> Self { -// ReduceAsyncAm { -// op: self.op.clone(), -// iter: self.iter.iter_clone(Sealed), -// schedule: self.schedule.clone(), -// } -// } -// } - -// #[lamellar_impl::rt_am_local] -// impl LamellarAm for ReduceAsyncAm -// where -// I: DistributedIterator + 'static, -// I::Item: Future + Send + 'static, -// T: Dist + Send + ArrayOps, -// F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static, -// { -// async fn exec(&self) -> Option { -// let mut iter = self.schedule.init_iter(self.iter.iter_clone(Sealed)); -// match iter.next() { -// Some(mut accum) => { -// while let Some(elem) = iter.next() { -// accum = (self.op)(accum, elem); -// } -// Some(accum) -// } -// None => None, -// } -// } -// } diff --git a/src/array/iterator/distributed_iterator/consumer/sum.rs b/src/array/iterator/distributed_iterator/consumer/sum.rs index 15df895c..89bcb812 100644 --- a/src/array/iterator/distributed_iterator/consumer/sum.rs +++ b/src/array/iterator/distributed_iterator/consumer/sum.rs @@ -12,11 +12,11 @@ use crate::scheduler::LamellarTask; use 
crate::warnings::RuntimeWarning; use crate::Dist; use futures_util::{ready, Future}; -use pin_project::pin_project; +use pin_project::{pin_project, pinned_drop}; use std::collections::VecDeque; use std::pin::Pin; use std::sync::Arc; -use std::task::{Context, Poll, Waker}; +use std::task::{Context, Poll}; #[derive(Clone, Debug)] pub(crate) struct Sum { @@ -63,6 +63,7 @@ where reqs, team, state: InnerState::ReqsPending(None), + spawned: false, } } fn max_elems(&self, in_elems: usize) -> usize { @@ -76,6 +77,7 @@ pub(crate) struct InnerDistIterSumHandle { pub(crate) reqs: VecDeque>, pub(crate) team: Pin>, state: InnerState, + spawned: bool, } enum InnerState { @@ -125,7 +127,13 @@ where T: Dist + ArrayOps + std::iter::Sum, { type Output = T; - fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + if !self.spawned { + for req in self.reqs.iter_mut() { + req.ready_or_set_waker(cx.waker()); + } + self.spawned = true; + } let mut this = self.project(); match &mut this.state { InnerState::ReqsPending(local_sum) => { @@ -163,45 +171,59 @@ where } } //#[doc(hidden)] -impl LamellarRequest for InnerDistIterSumHandle -where - T: Dist + ArrayOps + std::iter::Sum, -{ - fn blocking_wait(mut self) -> Self::Output { - let local_sums = UnsafeArray::::new(&self.team, self.team.num_pes, Distribution::Block); - let local_sum = self - .reqs - .drain(..) - .map(|req| req.blocking_wait()) - .into_iter() - .sum(); - self.reduce_remote_vals(local_sum, local_sums) - } +// impl LamellarRequest for InnerDistIterSumHandle +// where +// T: Dist + ArrayOps + std::iter::Sum, +// { +// fn blocking_wait(mut self) -> Self::Output { +// let local_sums = UnsafeArray::::new(&self.team, self.team.num_pes, Distribution::Block); +// let local_sum = self +// .reqs +// .drain(..) 
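A hedged usage sketch for the reworked iterator handles (`DistIterForEachHandle`, `DistIterSumHandle`, and friends): consumers now return handles that must be blocked, spawned, or awaited, mirroring the `dist_iter_mut().for_each(...)` call in the `atomic_compare_exchange` example above. The world and array setup is borrowed from that example and is an assumption, not code from this patch.

use lamellar::array::prelude::*;

fn main() {
    let world = lamellar::LamellarWorldBuilder::new().build();
    let num_pes = world.num_pes();
    let array = AtomicArray::<usize>::new(world.team(), num_pes * 2, Distribution::Block);

    // drive the consumer to completion on the calling thread
    array.dist_iter_mut().for_each(|x| x.store(0)).block();

    // or launch it on the work queue and synchronize later
    let task = array.dist_iter_mut().for_each(|x| x.store(1)).spawn();
    task.block();
}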
+// .map(|req| req.blocking_wait()) +// .into_iter() +// .sum(); +// self.reduce_remote_vals(local_sum, local_sums) +// } - fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { - for req in self.reqs.iter_mut() { - if !req.ready_or_set_waker(waker) { - //only need to wait on the next unready req - return false; - } - } - true - } +// fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { +// for req in self.reqs.iter_mut() { +// if !req.ready_or_set_waker(waker) { +// //only need to wait on the next unready req +// return false; +// } +// } +// true +// } - fn val(&self) -> Self::Output { - let local_sums = UnsafeArray::::new(&self.team, self.team.num_pes, Distribution::Block); - let local_sum = self.reqs.iter().map(|req| req.val()).into_iter().sum(); - self.reduce_remote_vals(local_sum, local_sums) - } -} +// fn val(&self) -> Self::Output { +// let local_sums = UnsafeArray::::new(&self.team, self.team.num_pes, Distribution::Block); +// let local_sum = self.reqs.iter().map(|req| req.val()).into_iter().sum(); +// self.reduce_remote_vals(local_sum, local_sums) +// } +// } -#[pin_project] +#[pin_project(PinnedDrop)] pub struct DistIterSumHandle { - team: Pin>, + array: UnsafeArrayInner, + launched: bool, #[pin] state: State, } +#[pinned_drop] +impl PinnedDrop for DistIterSumHandle { + fn drop(self: Pin<&mut Self>) { + if !self.launched { + let mut this = self.project(); + RuntimeWarning::disable_warnings(); + *this.state = State::Dropped; + RuntimeWarning::enable_warnings(); + RuntimeWarning::DroppedHandle("a DistIterSumHandle").print(); + } + } +} + impl DistIterSumHandle where T: Dist + ArrayOps + std::iter::Sum, @@ -212,19 +234,21 @@ where array: &UnsafeArrayInner, ) -> Self { Self { - team: array.data.team.clone(), + array: array.clone(), + launched: false, state: State::Barrier(barrier_handle, inner), } } /// This method will block until the associated Sum operation completes and returns the result - pub fn block(self) -> T { + pub fn block(mut self) -> T { + self.launched = true; RuntimeWarning::BlockingCall( "DistIterSumHandle::block", ".spawn() or .await", ) .print(); - self.team.clone().block_on(self) + self.array.clone().block_on(self) } /// This method will spawn the associated Sum Operation on the work queue, @@ -232,8 +256,9 @@ where /// /// This function returns a handle that can be used to wait for the operation to complete #[must_use = "this function returns a future used to poll for completion and retrieve the result. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. 
Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] - pub fn spawn(self) -> LamellarTask { - self.team.clone().scheduler.spawn_task(self) + pub fn spawn(mut self) -> LamellarTask { + self.launched = true; + self.array.clone().spawn(self) } } @@ -244,13 +269,15 @@ enum State { Pin> + Send>>, ), Reqs(#[pin] InnerDistIterSumHandle), + Dropped, } impl Future for DistIterSumHandle where T: Dist + ArrayOps + std::iter::Sum, { type Output = T; - fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + self.launched = true; let mut this = self.project(); match this.state.as_mut().project() { StateProj::Barrier(barrier, inner) => { @@ -268,45 +295,52 @@ where let val = ready!(inner.poll(cx)); Poll::Ready(val) } + StateProj::Dropped => panic!("called `Future::poll()` on a future that was dropped"), } } } //#[doc(hidden)] -impl LamellarRequest for DistIterSumHandle -where - T: Dist + ArrayOps + std::iter::Sum, -{ - fn blocking_wait(self) -> Self::Output { - match self.state { - State::Barrier(barrier, reqs) => { - barrier.blocking_wait(); - self.team.block_on(reqs).blocking_wait() - } - State::Reqs(inner) => inner.blocking_wait(), - } - } - fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { - match &mut self.state { - State::Barrier(barrier, _) => { - if !barrier.ready_or_set_waker(waker) { - return false; - } - waker.wake_by_ref(); - false - } - State::Reqs(inner) => inner.ready_or_set_waker(waker), - } - } - fn val(&self) -> Self::Output { - match &self.state { - State::Barrier(_barrier, _reqs) => { - unreachable!("should never be in barrier state when val is called"); - } - State::Reqs(inner) => inner.val(), - } - } -} +// impl LamellarRequest for DistIterSumHandle +// where +// T: Dist + ArrayOps + std::iter::Sum, +// { +// fn blocking_wait(mut self) -> Self::Output { +// self.launched = true; +// let state = std::mem::replace(&mut self.state, State::Dropped); +// match state { +// State::Barrier(barrier, reqs) => { +// barrier.blocking_wait(); +// self.team.block_on(reqs).blocking_wait() +// } +// State::Reqs(inner) => inner.blocking_wait(), +// State::Dropped => panic!("called `blocking_wait` on a future that was dropped"), +// } +// } +// fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { +// self.launched = true; +// match &mut self.state { +// State::Barrier(barrier, _) => { +// if !barrier.ready_or_set_waker(waker) { +// return false; +// } +// waker.wake_by_ref(); +// false +// } +// State::Reqs(inner) => inner.ready_or_set_waker(waker), +// State::Dropped => panic!("called `ready_or_set_waker` on a future that was dropped"), +// } +// } +// fn val(&self) -> Self::Output { +// match &self.state { +// State::Barrier(_barrier, _reqs) => { +// unreachable!("should never be in barrier state when val is called"); +// } +// State::Reqs(inner) => inner.val(), +// State::Dropped => panic!("called `val` on a future that was dropped"), +// } +// } +// } #[lamellar_impl::AmLocalDataRT(Clone)] pub(crate) struct SumAm { diff --git a/src/array/iterator/local_iterator/consumer/collect.rs b/src/array/iterator/local_iterator/consumer/collect.rs index f5111d8f..761f325d 100644 --- a/src/array/iterator/local_iterator/consumer/collect.rs +++ b/src/array/iterator/local_iterator/consumer/collect.rs @@ -14,11 +14,11 @@ use crate::warnings::RuntimeWarning; use core::marker::PhantomData; use futures_util::{ready, Future}; -use pin_project::pin_project; +use pin_project::{pin_project, pinned_drop}; use 
std::collections::VecDeque; use std::pin::Pin; use std::sync::Arc; -use std::task::{Context, Poll, Waker}; +use std::task::{Context, Poll}; #[derive(Clone, Debug)] pub(crate) struct Collect { @@ -74,6 +74,7 @@ where distribution: self.distribution, team, state: InnerState::ReqsPending(Vec::new()), + spawned: false, } } fn max_elems(&self, in_elems: usize) -> usize { @@ -135,6 +136,7 @@ where distribution: self.distribution, team, state: InnerState::ReqsPending(Vec::new()), + spawned: false, } } fn max_elems(&self, in_elems: usize) -> usize { @@ -164,6 +166,7 @@ pub(crate) struct InnerLocalIterCollectHandle { pub(crate) reqs: VecDeque>>, pub(crate) distribution: Distribution, pub(crate) team: Pin>, + spawned: bool, state: InnerState, } @@ -193,7 +196,13 @@ impl, Distribution)> + SyncSend + ' for InnerLocalIterCollectHandle { type Output = A; - fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + if !self.spawned { + for req in self.reqs.iter_mut() { + req.ready_or_set_waker(cx.waker()); + } + self.spawned = true; + } let mut this = self.project(); match &mut this.state { InnerState::ReqsPending(vals) => { @@ -230,48 +239,27 @@ impl, Distribution)> + SyncSend + ' } } -impl, Distribution)> + SyncSend + 'static> - LamellarRequest for InnerLocalIterCollectHandle -{ - fn blocking_wait(mut self) -> Self::Output { - // let mut num_local_vals = 0; - let mut temp_vals = vec![]; - for req in self.reqs.drain(0..) { - let v = req.blocking_wait(); - temp_vals.extend(v); - } - temp_vals.sort_by(|a, b| a.0.cmp(&b.0)); - let local_vals = temp_vals.into_iter().map(|v| v.1).collect(); - self.create_array(local_vals) - } - fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { - for req in self.reqs.iter_mut() { - if !req.ready_or_set_waker(waker) { - //only need to wait on the next unready req - return false; - } - } - true - } - fn val(&self) -> Self::Output { - // let mut num_local_vals = 0; - let mut temp_vals = vec![]; - for req in self.reqs.iter() { - let v = req.val(); - temp_vals.extend(v); - } - temp_vals.sort_by(|a, b| a.0.cmp(&b.0)); - let local_vals = temp_vals.into_iter().map(|v| v.1).collect(); - self.create_array(local_vals) - } -} -#[pin_project] +#[pin_project(PinnedDrop)] pub struct LocalIterCollectHandle { - team: Pin>, + array: UnsafeArrayInner, + launched: bool, #[pin] state: State, } +#[pinned_drop] +impl PinnedDrop for LocalIterCollectHandle { + fn drop(self: Pin<&mut Self>) { + if !self.launched { + let mut this = self.project(); + RuntimeWarning::disable_warnings(); + *this.state = State::Dropped; + RuntimeWarning::enable_warnings(); + RuntimeWarning::DroppedHandle("a DistIterSumHandle").print(); + } + } +} + impl LocalIterCollectHandle where T: Dist + ArrayOps, @@ -282,19 +270,21 @@ where array: &UnsafeArrayInner, ) -> Self { Self { - team: array.data.team.clone(), + array: array.clone(), + launched: false, state: State::Init(inner), } } /// This method will block until the associated Collect operation completes and returns the result - pub fn block(self) -> A { + pub fn block(mut self) -> A { + self.launched = true; RuntimeWarning::BlockingCall( "LocalIterCollectHandle::block", ".spawn() or .await", ) .print(); - self.team.clone().block_on(self) + self.array.clone().block_on(self) } /// This method will spawn the associated Collect Operation on the work queue, @@ -302,8 +292,9 @@ where /// /// This function returns a handle that can be used to wait for the operation to complete #[must_use = "this 
function returns a future used to poll for completion and retrieve the result. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] - pub fn spawn(self) -> LamellarTask { - self.team.clone().scheduler.spawn_task(self) + pub fn spawn(mut self) -> LamellarTask { + self.launched = true; + self.array.clone().spawn(self) } } @@ -311,6 +302,7 @@ where enum State { Init(Pin> + Send>>), Reqs(#[pin] InnerLocalIterCollectHandle), + Dropped, } impl Future for LocalIterCollectHandle where @@ -318,7 +310,8 @@ where A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + 'static, { type Output = A; - fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + self.launched = true; let mut this = self.project(); match this.state.as_mut().project() { StateProj::Init(inner) => { @@ -335,37 +328,7 @@ where let val = ready!(inner.poll(cx)); Poll::Ready(val) } - } - } -} - -//#[doc(hidden)] -impl LamellarRequest for LocalIterCollectHandle -where - T: Dist + ArrayOps, - A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + 'static, -{ - fn blocking_wait(self) -> Self::Output { - match self.state { - State::Init(reqs) => self.team.block_on(reqs).blocking_wait(), - State::Reqs(inner) => inner.blocking_wait(), - } - } - fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { - match &mut self.state { - State::Init(_) => { - waker.wake_by_ref(); - false - } - State::Reqs(inner) => inner.ready_or_set_waker(waker), - } - } - fn val(&self) -> Self::Output { - match &self.state { - State::Init(_reqs) => { - unreachable!("should never be in init state when val is called"); - } - State::Reqs(inner) => inner.val(), + StateProj::Dropped => panic!("called `Future::poll()` on a dropped future."), } } } diff --git a/src/array/iterator/local_iterator/consumer/count.rs b/src/array/iterator/local_iterator/consumer/count.rs index c8be3627..6fdc1275 100644 --- a/src/array/iterator/local_iterator/consumer/count.rs +++ b/src/array/iterator/local_iterator/consumer/count.rs @@ -9,11 +9,11 @@ use crate::scheduler::LamellarTask; use crate::warnings::RuntimeWarning; use futures_util::{ready, Future}; -use pin_project::pin_project; +use pin_project::{pin_project, pinned_drop}; use std::collections::VecDeque; use std::pin::Pin; use std::sync::Arc; -use std::task::{Context, Poll, Waker}; +use std::task::{Context, Poll}; #[derive(Clone, Debug)] pub(crate) struct Count { @@ -58,6 +58,7 @@ where InnerLocalIterCountHandle { reqs, state: InnerState::ReqsPending(0), + spawned: false, } } fn max_elems(&self, in_elems: usize) -> usize { @@ -70,6 +71,7 @@ where pub(crate) struct InnerLocalIterCountHandle { pub(crate) reqs: VecDeque>, state: InnerState, + spawned: bool, } enum InnerState { @@ -78,7 +80,13 @@ enum InnerState { impl Future for InnerLocalIterCountHandle { type Output = usize; - fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + if !self.spawned { + for req in self.reqs.iter_mut() { + req.ready_or_set_waker(cx.waker()); + } + self.spawned = true; + } let mut this = self.project(); match &mut this.state { InnerState::ReqsPending(cnt) => { @@ -95,59 +103,48 @@ impl Future for InnerLocalIterCountHandle { } } -//#[doc(hidden)] -impl LamellarRequest for InnerLocalIterCountHandle { - fn 
blocking_wait(mut self) -> Self::Output { - self.reqs - .drain(..) - .map(|req| req.blocking_wait()) - .into_iter() - .sum::() - } - fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { - for req in self.reqs.iter_mut() { - if !req.ready_or_set_waker(waker) { - //only need to wait on the next unready req - return false; - } - } - true - } - fn val(&self) -> Self::Output { - self.reqs - .iter() - .map(|req| req.val()) - .into_iter() - .sum::() - } -} - -#[pin_project] +#[pin_project(PinnedDrop)] pub struct LocalIterCountHandle { - team: Pin>, + array: UnsafeArrayInner, + launched: bool, #[pin] state: State, } +#[pinned_drop] +impl PinnedDrop for LocalIterCountHandle { + fn drop(self: Pin<&mut Self>) { + if !self.launched { + let mut this = self.project(); + RuntimeWarning::disable_warnings(); + *this.state = State::Dropped; + RuntimeWarning::enable_warnings(); + RuntimeWarning::DroppedHandle("a LocalIterCountHandle").print(); + } + } +} + impl LocalIterCountHandle { pub(crate) fn new( inner: Pin + Send>>, array: &UnsafeArrayInner, ) -> Self { Self { - team: array.data.team.clone(), + array: array.clone(), + launched: false, state: State::Init(inner), } } /// This method will block until the associated Count operation completes and returns the result - pub fn block(self) -> usize { + pub fn block(mut self) -> usize { + self.launched = true; RuntimeWarning::BlockingCall( "LocalIterCountHandle::block", ".spawn() or .await", ) .print(); - self.team.clone().block_on(self) + self.array.clone().block_on(self) } /// This method will spawn the associated Count Operation on the work queue, @@ -155,8 +152,9 @@ impl LocalIterCountHandle { /// /// This function returns a handle that can be used to wait for the operation to complete #[must_use = "this function returns a future used to poll for completion and retrieve the result. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. 
Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] - pub fn spawn(self) -> LamellarTask { - self.team.clone().scheduler.spawn_task(self) + pub fn spawn(mut self) -> LamellarTask { + self.launched = true; + self.array.clone().spawn(self) } } @@ -164,10 +162,12 @@ impl LocalIterCountHandle { enum State { Init(Pin + Send>>), Reqs(#[pin] InnerLocalIterCountHandle), + Dropped, } impl Future for LocalIterCountHandle { type Output = usize; - fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + self.launched = true; let mut this = self.project(); match this.state.as_mut().project() { StateProj::Init(inner) => { @@ -184,36 +184,11 @@ impl Future for LocalIterCountHandle { let val = ready!(inner.poll(cx)); Poll::Ready(val) } + StateProj::Dropped => panic!("called `Future::poll()` on a dropped future."), } } } -//#[doc(hidden)] -impl LamellarRequest for LocalIterCountHandle { - fn blocking_wait(self) -> Self::Output { - match self.state { - State::Init(reqs) => self.team.block_on(reqs).blocking_wait(), - State::Reqs(inner) => inner.blocking_wait(), - } - } - fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { - match &mut self.state { - State::Init(_reqs) => { - waker.wake_by_ref(); - false - } - State::Reqs(inner) => inner.ready_or_set_waker(waker), - } - } - fn val(&self) -> Self::Output { - match &self.state { - State::Init(_reqs) => { - unreachable!("should never be in init state when val is called"); - } - State::Reqs(inner) => inner.val(), - } - } -} #[lamellar_impl::AmLocalDataRT(Clone)] pub(crate) struct CountAm { pub(crate) iter: Count, diff --git a/src/array/iterator/local_iterator/consumer/for_each.rs b/src/array/iterator/local_iterator/consumer/for_each.rs index 88e860c1..d0eae122 100644 --- a/src/array/iterator/local_iterator/consumer/for_each.rs +++ b/src/array/iterator/local_iterator/consumer/for_each.rs @@ -10,11 +10,11 @@ use crate::scheduler::LamellarTask; use crate::warnings::RuntimeWarning; use futures_util::{ready, Future}; -use pin_project::pin_project; +use pin_project::{pin_project, pinned_drop}; use std::collections::VecDeque; use std::pin::Pin; use std::sync::Arc; -use std::task::{Context, Poll, Waker}; +use std::task::{Context, Poll}; #[derive(Clone, Debug)] pub(crate) struct ForEach @@ -72,7 +72,10 @@ where _team: Pin>, reqs: VecDeque>, ) -> Self::Handle { - InnerLocalIterForEachHandle { reqs } + InnerLocalIterForEachHandle { + reqs, + spawned: false, + } } fn max_elems(&self, in_elems: usize) -> usize { self.iter.elems(in_elems) @@ -137,7 +140,10 @@ where _team: Pin>, reqs: VecDeque>, ) -> Self::Handle { - InnerLocalIterForEachHandle { reqs } + InnerLocalIterForEachHandle { + reqs, + spawned: false, + } } fn max_elems(&self, in_elems: usize) -> usize { self.iter.elems(in_elems) @@ -161,11 +167,18 @@ where //#[doc(hidden)] pub(crate) struct InnerLocalIterForEachHandle { pub(crate) reqs: VecDeque>, + spawned: bool, } impl Future for InnerLocalIterForEachHandle { type Output = (); fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + if !self.spawned { + for req in self.reqs.iter_mut() { + req.ready_or_set_waker(cx.waker()); + } + self.spawned = true; + } while let Some(mut req) = self.reqs.pop_front() { if !req.ready_or_set_waker(cx.waker()) { self.reqs.push_front(req); @@ -177,56 +190,48 @@ impl Future for InnerLocalIterForEachHandle { } //#[doc(hidden)] -impl LamellarRequest for InnerLocalIterForEachHandle { - fn blocking_wait(mut self) -> 
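// Usage sketch (illustrative only; `array.local_iter().for_each(..)` stands in for
// any of the iterator consumers whose handles are reworked above):
//
//     let handle = array.local_iter().for_each(|e| { /* ... */ });
//     handle.block();                   // drive to completion on this thread
//     // or: let task = handle.spawn(); // enqueue on the work queue now,
//     //     task.block();              //   then wait (or rely on wait_all())
//     // or: handle.await;              // inside an async context
//
// Dropping the handle without doing any of these leaves the work unlaunched; the
// new PinnedDrop impls set State::Dropped and print a RuntimeWarning::DroppedHandle
// warning instead of silently discarding it.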
Self::Output { - for req in self.reqs.drain(..) { - req.blocking_wait(); - } - } - fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { - for req in self.reqs.iter_mut() { - if !req.ready_or_set_waker(waker) { - //only need to wait on the next unready req - return false; - } - } - true - } - fn val(&self) -> Self::Output { - for req in self.reqs.iter() { - req.val(); - } - } -} - -//#[doc(hidden)] -#[pin_project] +#[pin_project(PinnedDrop)] pub struct LocalIterForEachHandle { - // pub(crate) reqs: VecDeque>, - team: Pin>, + array: UnsafeArrayInner, + launched: bool, #[pin] state: State, } +#[pinned_drop] +impl PinnedDrop for LocalIterForEachHandle { + fn drop(self: Pin<&mut Self>) { + if !self.launched { + let mut this = self.project(); + RuntimeWarning::disable_warnings(); + *this.state = State::Dropped; + RuntimeWarning::enable_warnings(); + RuntimeWarning::DroppedHandle("a LocalIterForEachHandle").print(); + } + } +} + impl LocalIterForEachHandle { pub(crate) fn new( reqs: Pin + Send>>, array: &UnsafeArrayInner, ) -> Self { LocalIterForEachHandle { - team: array.data.team.clone(), + array: array.clone(), + launched: false, state: State::Init(reqs), } } /// This method will block until the associated For Each operation completes and returns the result - pub fn block(self) { + pub fn block(mut self) { + self.launched = true; RuntimeWarning::BlockingCall( "LocalIterForEachHandle::block", ".spawn() or .await", ) .print(); - self.team.clone().block_on(self); + self.array.clone().block_on(self); } /// This method will spawn the associated For Each Operation on the work queue, @@ -234,8 +239,9 @@ impl LocalIterForEachHandle { /// /// This function returns a handle that can be used to wait for the operation to complete #[must_use = "this function returns a future used to poll for completion and retrieve the result. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. 
Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] - pub fn spawn(self) -> LamellarTask<()> { - self.team.clone().scheduler.spawn_task(self) + pub fn spawn(mut self) -> LamellarTask<()> { + self.launched = true; + self.array.clone().spawn(self) } } @@ -243,10 +249,12 @@ impl LocalIterForEachHandle { enum State { Init(Pin + Send>>), Reqs(#[pin] InnerLocalIterForEachHandle), + Dropped, } impl Future for LocalIterForEachHandle { type Output = (); - fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + self.launched = true; let mut this = self.project(); match this.state.as_mut().project() { StateProj::Init(inner) => { @@ -263,37 +271,7 @@ impl Future for LocalIterForEachHandle { ready!(inner.poll(cx)); Poll::Ready(()) } - } - } -} - -//#[doc(hidden)] -impl LamellarRequest for LocalIterForEachHandle { - fn blocking_wait(self) -> Self::Output { - match self.state { - State::Init(reqs) => { - self.team.block_on(reqs).blocking_wait(); - } - State::Reqs(inner) => { - inner.blocking_wait(); - } - } - } - fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { - match &mut self.state { - State::Init(_reqs) => { - waker.wake_by_ref(); - false - } - State::Reqs(inner) => inner.ready_or_set_waker(waker), - } - } - fn val(&self) -> Self::Output { - match &self.state { - State::Init(_reqs) => { - unreachable!("should never be in barrier state when val is called"); - } - State::Reqs(inner) => inner.val(), + StateProj::Dropped => panic!("called `Future::poll()` on a dropped future."), } } } diff --git a/src/array/iterator/local_iterator/consumer/reduce.rs b/src/array/iterator/local_iterator/consumer/reduce.rs index 34af94f9..8f41f5ff 100644 --- a/src/array/iterator/local_iterator/consumer/reduce.rs +++ b/src/array/iterator/local_iterator/consumer/reduce.rs @@ -10,11 +10,11 @@ use crate::scheduler::LamellarTask; use crate::warnings::RuntimeWarning; use futures_util::{ready, Future}; -use pin_project::pin_project; +use pin_project::{pin_project, pinned_drop}; use std::collections::VecDeque; use std::pin::Pin; use std::sync::Arc; -use std::task::{Context, Poll, Waker}; +use std::task::{Context, Poll}; #[derive(Clone, Debug)] pub(crate) struct Reduce { @@ -66,6 +66,7 @@ where op: self.op, reqs, state: InnerState::ReqsPending(None), + spawned: false, } } fn max_elems(&self, in_elems: usize) -> usize { @@ -79,6 +80,7 @@ pub(crate) struct InnerLocalIterReduceHandle { pub(crate) reqs: VecDeque>>, pub(crate) op: F, state: InnerState, + spawned: bool, } enum InnerState { @@ -91,7 +93,13 @@ where F: Fn(T, T) -> T + SyncSend + 'static, { type Output = Option; - fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + if !self.spawned { + for req in self.reqs.iter_mut() { + req.ready_or_set_waker(cx.waker()); + } + self.spawned = true; + } let mut this = self.project(); match &mut this.state { InnerState::ReqsPending(val) => { @@ -115,43 +123,27 @@ where } } -//#[doc(hidden)] -impl LamellarRequest for InnerLocalIterReduceHandle -where - T: SyncSend + Copy + 'static, - F: Fn(T, T) -> T + SyncSend + Clone + 'static, -{ - fn blocking_wait(mut self) -> Self::Output { - self.reqs - .drain(..) 
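// A self-contained sketch (hypothetical FakeReq/InnerHandle types, not the crate's)
// of the "register wakers on first poll" pattern the Inner*Handle futures above now
// share: the first call to poll() hands the task's waker to every queued sub-request
// via ready_or_set_waker(), and the new `spawned` flag ensures that registration
// happens exactly once.
use std::collections::VecDeque;
use std::future::Future;
use std::pin::Pin;
use std::task::{Context, Poll, Waker};

struct FakeReq {
    done: bool,
}
impl FakeReq {
    // Stand-in for AmHandle::ready_or_set_waker: returns true when finished,
    // otherwise records the waker (omitted here) and returns false.
    fn ready_or_set_waker(&mut self, _waker: &Waker) -> bool {
        self.done
    }
}

struct InnerHandle {
    reqs: VecDeque<FakeReq>,
    spawned: bool,
}

impl Future for InnerHandle {
    type Output = ();
    fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<()> {
        if !self.spawned {
            // First poll: make sure every outstanding sub-request knows how to wake us.
            for req in self.reqs.iter_mut() {
                req.ready_or_set_waker(cx.waker());
            }
            self.spawned = true;
        }
        // Then drain requests front to back, parking on the first unfinished one.
        while let Some(mut req) = self.reqs.pop_front() {
            if !req.ready_or_set_waker(cx.waker()) {
                self.reqs.push_front(req);
                return Poll::Pending;
            }
        }
        Poll::Ready(())
    }
}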
- .filter_map(|req| req.blocking_wait()) - .reduce(self.op) - } - fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { - for req in self.reqs.iter_mut() { - if !req.ready_or_set_waker(waker) { - //only need to wait on the next unready req - return false; - } - } - true - } - - fn val(&self) -> Self::Output { - self.reqs - .iter() - .filter_map(|req| req.val()) - .reduce(self.op.clone()) - } -} - -#[pin_project] +#[pin_project(PinnedDrop)] pub struct LocalIterReduceHandle { - team: Pin>, + array: UnsafeArrayInner, + launched: bool, #[pin] state: State, } +#[pinned_drop] +impl PinnedDrop for LocalIterReduceHandle { + fn drop(self: Pin<&mut Self>) { + if !self.launched { + let mut this = self.project(); + RuntimeWarning::disable_warnings(); + *this.state = State::Dropped; + RuntimeWarning::enable_warnings(); + RuntimeWarning::DroppedHandle("a LocalIterReduceHandle").print(); + } + } +} + impl LocalIterReduceHandle where T: SyncSend + Copy + 'static, @@ -162,19 +154,22 @@ where array: &UnsafeArrayInner, ) -> Self { Self { - team: array.data.team.clone(), + array: array.clone(), + launched: false, state: State::Init(reqs), } } /// This method will block until the associated Reduce operation completes and returns the result - pub fn block(self) -> Option { + pub fn block(mut self) -> Option { + self.launched = true; RuntimeWarning::BlockingCall( "LocalIterReduceHandle::block", ".spawn() or .await", ) .print(); - self.team.clone().block_on(self) + + self.array.clone().block_on(self) } /// This method will spawn the associated Reduce Operation on the work queue, @@ -183,8 +178,9 @@ where /// This function returns a handle that can be used to wait for the operation to complete #[must_use = "this function returns a future used to poll for completion and retrieve the result. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. 
Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] - pub fn spawn(self) -> LamellarTask> { - self.team.clone().scheduler.spawn_task(self) + pub fn spawn(mut self) -> LamellarTask> { + self.launched = true; + self.array.clone().spawn(self) } } @@ -192,6 +188,7 @@ where enum State { Init(Pin> + Send>>), Reqs(#[pin] InnerLocalIterReduceHandle), + Dropped, } impl Future for LocalIterReduceHandle where @@ -199,7 +196,8 @@ where F: Fn(T, T) -> T + SyncSend + Clone + 'static, { type Output = Option; - fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + self.launched = true; let mut this = self.project(); match this.state.as_mut().project() { StateProj::Init(inner) => { @@ -216,37 +214,7 @@ where let val = ready!(inner.poll(cx)); Poll::Ready(val) } - } - } -} - -//#[doc(hidden)] -impl LamellarRequest for LocalIterReduceHandle -where - T: SyncSend + Copy + 'static, - F: Fn(T, T) -> T + SyncSend + Clone + 'static, -{ - fn blocking_wait(self) -> Self::Output { - match self.state { - State::Init(reqs) => self.team.block_on(reqs).blocking_wait(), - State::Reqs(inner) => inner.blocking_wait(), - } - } - fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { - match &mut self.state { - State::Init(_) => { - waker.wake_by_ref(); - false - } - State::Reqs(inner) => inner.ready_or_set_waker(waker), - } - } - fn val(&self) -> Self::Output { - match &self.state { - State::Init(_reqs) => { - unreachable!("should never be in init state when val is called"); - } - State::Reqs(inner) => inner.val(), + StateProj::Dropped => panic!("called `Future::poll()` on a dropped future."), } } } diff --git a/src/array/iterator/local_iterator/consumer/sum.rs b/src/array/iterator/local_iterator/consumer/sum.rs index 40f2906a..6f5ed5a6 100644 --- a/src/array/iterator/local_iterator/consumer/sum.rs +++ b/src/array/iterator/local_iterator/consumer/sum.rs @@ -10,11 +10,11 @@ use crate::scheduler::LamellarTask; use crate::warnings::RuntimeWarning; use futures_util::{ready, Future}; -use pin_project::pin_project; +use pin_project::{pin_project, pinned_drop}; use std::collections::VecDeque; use std::pin::Pin; use std::sync::Arc; -use std::task::{Context, Poll, Waker}; +use std::task::{Context, Poll}; #[derive(Clone, Debug)] pub(crate) struct Sum { @@ -60,6 +60,7 @@ where InnerLocalIterSumHandle { reqs, state: InnerState::ReqsPending(None), + spawned: false, } } fn max_elems(&self, in_elems: usize) -> usize { @@ -72,6 +73,7 @@ where pub(crate) struct InnerLocalIterSumHandle { pub(crate) reqs: VecDeque>, state: InnerState, + spawned: bool, } enum InnerState { @@ -83,7 +85,13 @@ where T: SyncSend + for<'a> std::iter::Sum<&'a T> + std::iter::Sum + 'static, { type Output = T; - fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + if !self.spawned { + for req in self.reqs.iter_mut() { + req.ready_or_set_waker(cx.waker()); + } + self.spawned = true; + } let mut this = self.project(); match &mut this.state { InnerState::ReqsPending(local_sum) => { @@ -109,39 +117,53 @@ where } //#[doc(hidden)] -impl LamellarRequest for InnerLocalIterSumHandle -where - T: SyncSend + for<'a> std::iter::Sum<&'a T> + std::iter::Sum + 'static, -{ - fn blocking_wait(mut self) -> Self::Output { - self.reqs - .drain(..) 
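// A minimal sketch (hypothetical Handle/State types and a plain eprintln! in place
// of RuntimeWarning) of the drop-detection pattern added to the public handles
// above: `launched` starts false, every consuming entry point (block, spawn, poll)
// sets it, and PinnedDrop moves the state to Dropped and warns if the handle is
// discarded without ever being driven.
use pin_project::{pin_project, pinned_drop};
use std::pin::Pin;

enum State {
    Init,
    Dropped,
}

#[pin_project(PinnedDrop)]
struct Handle {
    launched: bool,
    state: State,
}

#[pinned_drop]
impl PinnedDrop for Handle {
    fn drop(self: Pin<&mut Self>) {
        if !self.launched {
            let this = self.project();
            // Park the state so no work can be launched from here on.
            *this.state = State::Dropped;
            // Stand-in for RuntimeWarning::DroppedHandle("...").print().
            eprintln!("warning: handle dropped without .spawn(), .block(), or .await");
        }
    }
}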
- .map(|req| req.blocking_wait()) - .sum::() - } +// impl LamellarRequest for InnerLocalIterSumHandle +// where +// T: SyncSend + for<'a> std::iter::Sum<&'a T> + std::iter::Sum + 'static, +// { +// fn blocking_wait(mut self) -> Self::Output { +// self.reqs +// .drain(..) +// .map(|req| req.blocking_wait()) +// .sum::() +// } - fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { - for req in self.reqs.iter_mut() { - if !req.ready_or_set_waker(waker) { - //only need to wait on the next unready req - return false; - } - } - true - } +// fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { +// for req in self.reqs.iter_mut() { +// if !req.ready_or_set_waker(waker) { +// //only need to wait on the next unready req +// return false; +// } +// } +// true +// } - fn val(&self) -> Self::Output { - self.reqs.iter().map(|req| req.val()).sum::() - } -} +// fn val(&self) -> Self::Output { +// self.reqs.iter().map(|req| req.val()).sum::() +// } +// } -#[pin_project] +#[pin_project(PinnedDrop)] pub struct LocalIterSumHandle { - team: Pin>, + array: UnsafeArrayInner, + launched: bool, #[pin] state: State, } +#[pinned_drop] +impl PinnedDrop for LocalIterSumHandle { + fn drop(self: Pin<&mut Self>) { + if !self.launched { + let mut this = self.project(); + RuntimeWarning::disable_warnings(); + *this.state = State::Dropped; + RuntimeWarning::enable_warnings(); + RuntimeWarning::DroppedHandle("a LocalIterSumHandle").print(); + } + } +} + impl LocalIterSumHandle where T: SyncSend + for<'a> std::iter::Sum<&'a T> + std::iter::Sum + 'static, @@ -151,27 +173,31 @@ where array: &UnsafeArrayInner, ) -> Self { Self { - team: array.data.team.clone(), + array: array.clone(), + launched: false, state: State::Init(inner), } } /// This method will block until the associated Sumoperation completes and returns the result - pub fn block(self) -> T { + pub fn block(mut self) -> T { + self.launched = true; RuntimeWarning::BlockingCall( "LocalIterSumHandle::block", ".spawn() or .await", ) .print(); - self.team.clone().block_on(self) + self.array.clone().block_on(self) } /// This method will spawn the associated Sum Operation on the work queue, /// initiating the remote operation. /// /// This function returns a handle that can be used to wait for the operation to complete #[must_use = "this function returns a future used to poll for completion and retrieve the result. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. 
Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] - pub fn spawn(self) -> LamellarTask { - self.team.clone().scheduler.spawn_task(self) + pub fn spawn(mut self) -> LamellarTask { + self.launched = true; + + self.array.clone().spawn(self) } } @@ -179,13 +205,15 @@ where enum State { Init(Pin> + Send>>), Reqs(#[pin] InnerLocalIterSumHandle), + Dropped, } impl Future for LocalIterSumHandle where T: SyncSend + for<'a> std::iter::Sum<&'a T> + std::iter::Sum + 'static, { type Output = T; - fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + self.launched = true; let mut this = self.project(); match this.state.as_mut().project() { StateProj::Init(inner) => { @@ -202,39 +230,46 @@ where let val = ready!(inner.poll(cx)); Poll::Ready(val) } + StateProj::Dropped => panic!("called `Future::poll()` on a dropped future."), } } } //#[doc(hidden)] -impl LamellarRequest for LocalIterSumHandle -where - T: SyncSend + for<'a> std::iter::Sum<&'a T> + std::iter::Sum + 'static, -{ - fn blocking_wait(self) -> Self::Output { - match self.state { - State::Init(reqs) => self.team.block_on(reqs).blocking_wait(), - State::Reqs(inner) => inner.blocking_wait(), - } - } - fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { - match &mut self.state { - State::Init(_) => { - waker.wake_by_ref(); - false - } - State::Reqs(inner) => inner.ready_or_set_waker(waker), - } - } - fn val(&self) -> Self::Output { - match &self.state { - State::Init(_reqs) => { - unreachable!("should never be in init state when val is called"); - } - State::Reqs(inner) => inner.val(), - } - } -} +// impl LamellarRequest for LocalIterSumHandle +// where +// T: SyncSend + for<'a> std::iter::Sum<&'a T> + std::iter::Sum + 'static, +// { +// fn blocking_wait(mut self) -> Self::Output { +// self.launched = true; +// let state = std::mem::replace(&mut self.state, State::Dropped); +// match state { +// State::Init(reqs) => self.team.block_on(reqs).blocking_wait(), +// State::Reqs(inner) => inner.blocking_wait(), +// State::Dropped => panic!("called `blocking_wait` on a future that was dropped"), +// } +// } +// fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { +// self.launched = true; +// match &mut self.state { +// State::Init(_) => { +// waker.wake_by_ref(); +// false +// } +// State::Reqs(inner) => inner.ready_or_set_waker(waker), +// State::Dropped => panic!("called `ready_or_set_waker` on a future that was dropped"), +// } +// } +// fn val(&self) -> Self::Output { +// match &self.state { +// State::Init(_reqs) => { +// unreachable!("should never be in init state when val is called"); +// } +// State::Reqs(inner) => inner.val(), +// State::Dropped => panic!("called `val` on a future that was dropped"), +// } +// } +// } #[lamellar_impl::AmLocalDataRT(Clone)] pub(crate) struct SumAm { diff --git a/src/array/local_lock_atomic.rs b/src/array/local_lock_atomic.rs index 3efdb517..891ee6e8 100644 --- a/src/array/local_lock_atomic.rs +++ b/src/array/local_lock_atomic.rs @@ -20,6 +20,7 @@ use crate::lamellar_request::LamellarRequest; use crate::lamellar_team::{IntoLamellarTeam, LamellarTeamRT}; use crate::memregion::Dist; use crate::scheduler::LamellarTask; +use crate::warnings::RuntimeWarning; // use parking_lot::{ // lock_api::{ArcRwLockReadGuard, ArcRwLockWriteGuard}, @@ -966,6 +967,7 @@ impl ArrayPrint for LocalLockArray { } //#[doc(hidden)] +// Dropped Handle Warning triggered by AmHandle #[pin_project] pub struct 
LocalLockArrayReduceHandle { req: AmHandle>, @@ -984,6 +986,11 @@ impl LocalLockArrayReduceHandle { /// This method will block the caller until the associated Array Reduce Operation completesRuntimeWarning::BlockingCall("LocalLockArrayReduceHandle::block", ".spawn() or .await").print(); pub fn block(self) -> Option { + RuntimeWarning::BlockingCall( + "LocalLockArrayReduceHandle::block", + ".spawn() or .await", + ) + .print(); self.lock_guard.array.clone().block_on(self) } } diff --git a/src/array/local_lock_atomic/handle.rs b/src/array/local_lock_atomic/handle.rs index e593b228..e863964d 100644 --- a/src/array/local_lock_atomic/handle.rs +++ b/src/array/local_lock_atomic/handle.rs @@ -17,7 +17,7 @@ use super::{ }; #[must_use = "LocalLockArray lock handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] -#[pin_project] +#[pin_project] //unused drop warning triggered by LocalRwDarcReadHandle /// Handle used to retrieve the aquired read lock of a LocalLockArray /// /// This handle must be awaited or blocked on to acquire the lock @@ -90,7 +90,7 @@ impl LocalLockReadHandle { /// let guard = task.block(); ///``` #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] - pub fn spawn(mut self) -> LamellarTask> { + pub fn spawn(self) -> LamellarTask> { self.array.lock.darc.team().spawn(self) } } @@ -110,7 +110,7 @@ impl Future for LocalLockReadHandle { } #[must_use = "LocalLockArray lock handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] -#[pin_project] +#[pin_project] //unused drop warning triggered by LocalRwDarcReadHandle /// Handle used to retrieve the aquired local data [LocalLockLocalData] of a LocalLockArray /// /// This handle must be awaited or blocked on to acquire the lock @@ -183,7 +183,7 @@ impl LocalLockLocalDataHandle { /// println!("local data: {:?}",local_data); ///``` #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] - pub fn spawn(mut self) -> LamellarTask> { + pub fn spawn(self) -> LamellarTask> { self.array.lock.darc.team().spawn(self) } } @@ -206,7 +206,7 @@ impl Future for LocalLockLocalDataHandle { } #[must_use = "LocalLockArray lock handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] -#[pin_project] +#[pin_project] //unused drop warning triggered by LocalRwDarcWriteHandle /// Handle used to retrieve the aquired write lock of a LocalLockArray /// /// This handle must be awaited or blocked on to acquire the lock @@ -279,7 +279,7 @@ impl LocalLockWriteHandle { /// task.block(); ///``` #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. 
Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] - pub fn spawn(mut self) -> LamellarTask> { + pub fn spawn(self) -> LamellarTask> { self.array.lock.darc.team().spawn(self) } } @@ -299,7 +299,7 @@ impl Future for LocalLockWriteHandle { } #[must_use = "LocalLockArray lock handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] -#[pin_project] +#[pin_project] // unused drop warning triggered by LocalRwDarcWriteHandle /// Handle used to retrieve the aquired mutable local data [LocalLockMutLocalData] of a LocalLockArray /// /// This handle must be awaited or blocked on to acquire the lock @@ -373,7 +373,7 @@ impl LocalLockMutLocalDataHandle { /// local_data.iter_mut().for_each(|elem| *elem += my_pe); ///``` #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] - pub fn spawn(mut self) -> LamellarTask> { + pub fn spawn(self) -> LamellarTask> { self.array.lock.darc.team().spawn(self) } } @@ -396,7 +396,7 @@ impl Future for LocalLockMutLocalDataHandle { } #[must_use = "LocalLockArray lock handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] -#[pin_project] +#[pin_project] //unused drop warning triggered by LocalRwDarcReadHandle /// Constructs a handle for immutably iterating over fixed sized chunks(slices) of the local data of this array. /// This handle must be either await'd in an async context or block'd in an non-async context. /// Awaiting or blocking will not return until the read lock has been acquired. @@ -476,7 +476,7 @@ impl LocalLockLocalChunksHandle { /// iter_task.block(); ///``` #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] - pub fn spawn(mut self) -> LamellarTask> { + pub fn spawn(self) -> LamellarTask> { self.array.lock.darc.team().spawn(self) } } @@ -499,7 +499,7 @@ impl Future for LocalLockLocalChunksHandle { } #[must_use = "LocalLockArray lock handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] -#[pin_project] +#[pin_project] // unused drop warning triggered by LocalRwDarcWriteHandle /// A handle for mutably iterating over fixed sized chunks(slices) of the local data of this array. /// This handle must be either await'd in an async context or block'd in an non-async context. /// Awaiting or blocking will not return until the write lock has been acquired. @@ -583,7 +583,7 @@ impl LocalLockLocalChunksMutHandle { /// iter_task.block(); ///``` #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. 
Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] - pub fn spawn(mut self) -> LamellarTask> { + pub fn spawn(self) -> LamellarTask> { self.array.lock.darc.team().spawn(self) } } diff --git a/src/array/local_lock_atomic/iteration.rs b/src/array/local_lock_atomic/iteration.rs index f57826b1..4ac62d9d 100644 --- a/src/array/local_lock_atomic/iteration.rs +++ b/src/array/local_lock_atomic/iteration.rs @@ -19,6 +19,7 @@ impl InnerArray for LocalLockArray { #[derive(Clone)] pub struct LocalLockDistIter<'a, T: Dist> { data: LocalLockArray, + // lock: Arc>, lock: Arc>, cur_i: usize, end_i: usize, @@ -351,8 +352,9 @@ impl LamellarArrayIterators for LocalLockArray { fn dist_iter(&self) -> Self::DistIter { // let the_array: LocalLockArray = self.clone(); - let lock: LocalRwDarc<()> = self.lock.clone(); - let lock = Arc::new(self.array.block_on(async move { lock.read().await })); + // let lock: LocalRwDarc<()> = self.lock.clone(); + // let lock = Arc::new(self.array.block_on(async move { lock.read().await })); + let lock = Arc::new(self.lock.read().block()); self.barrier(); LocalLockDistIter { data: self.clone(), @@ -364,8 +366,9 @@ impl LamellarArrayIterators for LocalLockArray { } fn local_iter(&self) -> Self::LocalIter { - let lock: LocalRwDarc<()> = self.lock.clone(); - let lock = Arc::new(self.array.block_on(async move { lock.read().await })); + // let lock: LocalRwDarc<()> = self.lock.clone(); + // let lock = Arc::new(self.array.block_on(async move { lock.read().await })); + let lock = Arc::new(self.lock.read().block()); LocalLockLocalIter { data: self.clone(), lock: lock, @@ -393,8 +396,9 @@ impl LamellarArrayMutIterators for LocalLockArray { type LocalIter = LocalLockLocalIterMut<'static, T>; fn dist_iter_mut(&self) -> Self::DistIter { - let lock: LocalRwDarc<()> = self.lock.clone(); - let lock = Arc::new(self.array.block_on(async move { lock.write().await })); + // let lock: LocalRwDarc<()> = self.lock.clone(); + // let lock = Arc::new(self.array.block_on(async move { lock.write().await })); + let lock = Arc::new(self.lock.write().block()); // self.barrier(); // println!("dist_iter thread {:?} got lock",std::thread::current().id()); LocalLockDistIterMut { @@ -408,8 +412,9 @@ impl LamellarArrayMutIterators for LocalLockArray { fn local_iter_mut(&self) -> Self::LocalIter { // println!("trying to get write lock for iter"); - let lock: LocalRwDarc<()> = self.lock.clone(); - let lock = Arc::new(self.array.block_on(async move { lock.write().await })); + // let lock: LocalRwDarc<()> = self.lock.clone(); + // let lock = Arc::new(self.array.block_on(async move { lock.write().await })); + let lock = Arc::new(self.lock.write().block()); // println!("got write lock for iter"); LocalLockLocalIterMut { data: self.clone(), diff --git a/src/array/local_lock_atomic/rdma.rs b/src/array/local_lock_atomic/rdma.rs index c6b44edf..796cb170 100644 --- a/src/array/local_lock_atomic/rdma.rs +++ b/src/array/local_lock_atomic/rdma.rs @@ -23,6 +23,7 @@ impl LamellarArrayInternalGet for LocalLockArray { ArrayRdmaHandle { array: self.as_lamellar_byte_array(), reqs: VecDeque::from([req.into()]), + spawned: false, } } unsafe fn internal_at(&self, index: usize) -> ArrayRdmaAtHandle { @@ -36,6 +37,7 @@ impl LamellarArrayInternalGet for LocalLockArray { array: self.as_lamellar_byte_array(), req: Some(req), buf: buf, + spawned: false, } } } @@ -51,6 +53,7 @@ impl LamellarArrayGet for LocalLockArray { Err(_) => ArrayRdmaHandle { array: self.as_lamellar_byte_array(), reqs: 
VecDeque::new(), + spawned: false, }, } } @@ -73,6 +76,7 @@ impl LamellarArrayInternalPut for LocalLockArray { ArrayRdmaHandle { array: self.as_lamellar_byte_array(), reqs: VecDeque::from([req.into()]), + spawned: false, } } } @@ -88,6 +92,7 @@ impl LamellarArrayPut for LocalLockArray { Err(_) => ArrayRdmaHandle { array: self.as_lamellar_byte_array(), reqs: VecDeque::new(), + spawned: false, }, } } @@ -169,7 +174,7 @@ impl LamellarAm for LocalLockRemoteGetAm { //because we need to guarantee the put operation is atomic (maybe iput would work?) async fn exec(self) -> Vec { // println!("in remotegetam {:?} {:?}",self.start_index,self.len); - let _lock = self.array.lock.read(); + let _lock = self.array.lock.read().await; unsafe { match self .array diff --git a/src/array/native_atomic/rdma.rs b/src/array/native_atomic/rdma.rs index 499ae56f..e2dc2a98 100644 --- a/src/array/native_atomic/rdma.rs +++ b/src/array/native_atomic/rdma.rs @@ -20,6 +20,7 @@ impl LamellarArrayInternalGet for NativeAtomicArray { ArrayRdmaHandle { array: self.as_lamellar_byte_array(), reqs: VecDeque::from([req.into()]), + spawned: false, } } unsafe fn internal_at(&self, index: usize) -> ArrayRdmaAtHandle { @@ -33,6 +34,7 @@ impl LamellarArrayInternalGet for NativeAtomicArray { array: self.as_lamellar_byte_array(), req: Some(req), buf: buf, + spawned: false, } } } @@ -47,6 +49,7 @@ impl LamellarArrayGet for NativeAtomicArray { Err(_) => ArrayRdmaHandle { array: self.as_lamellar_byte_array(), reqs: VecDeque::new(), + spawned: false, }, } } @@ -69,6 +72,7 @@ impl LamellarArrayInternalPut for NativeAtomicArray { ArrayRdmaHandle { array: self.as_lamellar_byte_array(), reqs: VecDeque::from([req.into()]), + spawned: false, } } } @@ -84,6 +88,7 @@ impl LamellarArrayPut for NativeAtomicArray { Err(_) => ArrayRdmaHandle { array: self.as_lamellar_byte_array(), reqs: VecDeque::new(), + spawned: false, }, } } diff --git a/src/array/operations/handle.rs b/src/array/operations/handle.rs index db4a49e0..4d4fefdf 100644 --- a/src/array/operations/handle.rs +++ b/src/array/operations/handle.rs @@ -1,22 +1,51 @@ use crate::{ - array::{AmDist, LamellarByteArray}, lamellar_request::LamellarRequest, scheduler::LamellarTask, warnings::RuntimeWarning, AmHandle + array::{AmDist, LamellarByteArray}, + lamellar_request::LamellarRequest, + scheduler::LamellarTask, + warnings::RuntimeWarning, + AmHandle, }; use std::{ collections::VecDeque, future::Future, pin::Pin, - task::{Context, Poll, Waker}, + task::{Context, Poll}, }; -use pin_project::pin_project; +use pin_project::{pin_project, pinned_drop}; /// a task handle for a batched array operation that doesnt return any values #[must_use = "Array operation handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called. Ignoring the resulting value with 'let _ = ...' will cause the operation to NOT BE executed."] pub struct ArrayBatchOpHandle { pub(crate) array: LamellarByteArray, //prevents prematurely performing a local drop - pub(crate) reqs: VecDeque<(AmHandle<()>, Vec)>, + pub(crate) state: BatchOpState, } + +pub(crate) enum BatchOpState { + Reqs(VecDeque<(AmHandle<()>, Vec)>), + Launched(VecDeque<(LamellarTask<()>, Vec)>), +} + +impl Drop for ArrayBatchOpHandle { + // fn drop(&mut self) { + // if self.reqs.len() > 0 { + // RuntimeWarning::disable_warnings(); + // for _ in self.reqs.drain(0..) 
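// Two points worth noting in the LocalLockArray changes above:
// - the dist/local iterator constructors now take the array's local lock through
//   the lock handle's own blocking entry point (`self.lock.read().block()` /
//   `self.lock.write().block()`) instead of wrapping the async acquire in
//   `self.array.block_on(..)`;
// - in LocalLockRemoteGetAm::exec the guard is now actually awaited
//   (`self.array.lock.read().await`); the removed line only constructed the lock
//   handle without awaiting it, so the read lock was not actually held while the
//   remote get copied local data.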
{} + // RuntimeWarning::enable_warnings(); + // RuntimeWarning::DroppedHandle("an ArrayBatchOpHandle").print(); + // } + // } + fn drop(&mut self) { + if let BatchOpState::Reqs(reqs) = &mut self.state { + RuntimeWarning::disable_warnings(); + for _ in reqs.drain(0..) {} + RuntimeWarning::enable_warnings(); + RuntimeWarning::DroppedHandle("an ArrayBatchOpHandle").print(); + } + } +} + /// a task handle for a single array operation that doesnt return any values pub type ArrayOpHandle = ArrayBatchOpHandle; @@ -26,36 +55,40 @@ impl ArrayBatchOpHandle { /// /// This function returns a handle that can be used to wait for the operation to complete #[must_use = "this function returns a future used to poll for completion. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] - pub fn spawn(self) -> LamellarTask<()> { - self.array.team().spawn(self) + pub fn spawn(mut self) -> LamellarTask<()> { + // let mut old_state = + // std::mem::replace(&mut self.state, BatchOpState::Launched(VecDeque::new())); + match &mut self.state { + BatchOpState::Reqs(reqs) => { + let launched = reqs + .drain(..) + .map(|(am, res)| (am.spawn(), res)) + .collect::, Vec)>>(); + self.state = BatchOpState::Launched(launched); + self.array.team().spawn(self) + } + _ => panic!("ArrayBatchOpHandle should already have been spawned"), + } } /// This method will block the calling thread until the associated Array Operation completes - pub fn block(self) -> () { + pub fn block(mut self) -> () { RuntimeWarning::BlockingCall( "ArrayBatchOpHandle::block", ".spawn() or .await", ) .print(); - self.array.team().block_on(self) - } -} - -impl LamellarRequest for ArrayBatchOpHandle { - fn blocking_wait(mut self) -> Self::Output { - for req in self.reqs.drain(0..) { - req.0.blocking_wait(); - } - } - fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { - let mut ready = true; - for req in self.reqs.iter_mut() { - ready &= req.0.ready_or_set_waker(waker); - } - ready - } - fn val(&self) -> Self::Output { - for req in self.reqs.iter() { - req.0.val(); + // let mut old_state = + // std::mem::replace(&mut self.state, BatchOpState::Launched(VecDeque::new())); + match &mut self.state { + BatchOpState::Reqs(reqs) => { + let launched = reqs + .drain(..) + .map(|(am, res)| (am.spawn(), res)) + .collect::, Vec)>>(); + self.state = BatchOpState::Launched(launched); + self.array.team().block_on(self) + } + _ => panic!("ArrayBatchOpHandle should already have been blocked on"), } } } @@ -63,11 +96,24 @@ impl LamellarRequest for ArrayBatchOpHandle { impl Future for ArrayBatchOpHandle { type Output = (); fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { - while let Some(mut req) = self.reqs.pop_front() { - if !req.0.ready_or_set_waker(cx.waker()) { - self.reqs.push_front(req); + match &mut self.state { + BatchOpState::Reqs(reqs) => { + let launched = reqs + .drain(..) 
+ .map(|(am, res)| (am.spawn(), res)) + .collect::, Vec)>>(); + self.state = BatchOpState::Launched(launched); + cx.waker().wake_by_ref(); return Poll::Pending; } + BatchOpState::Launched(reqs) => { + while let Some(mut req) = reqs.pop_front() { + if Future::poll(Pin::new(&mut req.0), cx).is_pending() { + reqs.push_front(req); + return Poll::Pending; + } + } + } } Poll::Ready(()) } @@ -76,8 +122,15 @@ impl Future for ArrayBatchOpHandle { /// a task handle for a single array operation that returns a value #[must_use = "Array operation handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called. Ignoring the resulting value with 'let _ = ...' will cause the operation to NOT BE executed."] pub struct ArrayFetchOpHandle { + //AmHandle triggers Handle Dropped warning pub(crate) array: LamellarByteArray, //prevents prematurely performing a local drop - pub(crate) req: AmHandle>, + pub(crate) state: FetchOpState, + // pub(crate) req: AmHandle>, +} + +pub(crate) enum FetchOpState { + Req(AmHandle>), + Launched(LamellarTask>), } impl ArrayFetchOpHandle { @@ -86,82 +139,136 @@ impl ArrayFetchOpHandle { /// /// This function returns a handle that can be used to wait for the operation to complete #[must_use = "this function returns a future used to poll for completion. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] - pub fn spawn(self) -> LamellarTask { - self.array.team().spawn(self) + pub fn spawn(mut self) -> LamellarTask { + match self.state { + FetchOpState::Req(req) => { + self.state = FetchOpState::Launched(req.spawn()); + self.array.team().spawn(self) + } + _ => panic!("ArrayBatchOpHandle should already have been spawned"), + } } /// This method will block the calling thread until the associated Array Operation completes - pub fn block(self) -> R { + pub fn block(mut self) -> R { RuntimeWarning::BlockingCall( "ArrayFetchOpHandle::block", ".spawn() or .await", ) .print(); - self.array.team().block_on(self) - } -} - -impl LamellarRequest for ArrayFetchOpHandle { - fn blocking_wait(self) -> Self::Output { - self.req - .blocking_wait() - .pop() - .expect("should have a single request") - } - fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { - self.req.ready_or_set_waker(waker) - } - fn val(&self) -> Self::Output { - self.req.val().pop().expect("should have a single request") + match self.state { + FetchOpState::Req(req) => { + self.state = FetchOpState::Launched(req.spawn()); + self.array.team().block_on(self) + } + _ => panic!("ArrayBatchOpHandle should already have been blocked_on"), + } } } impl Future for ArrayFetchOpHandle { type Output = R; fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { - if self.req.ready_or_set_waker(cx.waker()) { - return Poll::Ready(self.req.val().pop().expect("should have a single request")); + match &mut self.state { + FetchOpState::Req(req) => { + if req.ready_or_set_waker(cx.waker()) { + return Poll::Ready(req.val().pop().expect("should have a single request")); + } + } + FetchOpState::Launched(req) => { + if let Poll::Ready(mut res) = Future::poll(Pin::new(req), cx) { + return Poll::Ready(res.pop().expect("should have a single request")); + } + } } + // Poll::Pending } } /// a task handle for a batched array operation that return values -#[pin_project] +#[pin_project(PinnedDrop)] #[must_use = "Array operation handles 
do nothing unless polled or awaited, or 'spawn()' or 'block()' are called. Ignoring the resulting value with 'let _ = ...' will cause the operation to NOT BE executed."] pub struct ArrayFetchBatchOpHandle { pub(crate) array: LamellarByteArray, //prevents prematurely performing a local drop - pub(crate) reqs: VecDeque<(AmHandle>, Vec)>, + // pub(crate) reqs: VecDeque<(AmHandle>, Vec)>, + pub(crate) state: FetchBatchOpState, results: Vec, } +enum FetchBatchOpState { + Reqs(VecDeque<(AmHandle>, Vec)>), + Launched(VecDeque<(LamellarTask>, Vec)>), +} + +#[pinned_drop] +impl PinnedDrop for ArrayFetchBatchOpHandle { + fn drop(self: Pin<&mut Self>) { + let mut this = self.project(); + if let FetchBatchOpState::Reqs(reqs) = &mut this.state { + RuntimeWarning::disable_warnings(); + for _ in reqs.drain(0..) {} + RuntimeWarning::enable_warnings(); + RuntimeWarning::DroppedHandle("an ArrayFetchBatchOpHandle").print(); + } + } +} + impl ArrayFetchBatchOpHandle { /// This method will spawn the associated Array Operation on the work queue, /// initiating the remote operation. /// /// This function returns a handle that can be used to wait for the operation to complete #[must_use = "this function returns a future used to poll for completion. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] - pub fn spawn(self) -> LamellarTask> { - self.array.team().spawn(self) + pub fn spawn(mut self) -> LamellarTask> { + match &mut self.state { + FetchBatchOpState::Reqs(reqs) => { + let launched = reqs + .drain(..) + .map(|(am, res)| (am.spawn(), res)) + .collect::>, Vec)>>(); + self.state = FetchBatchOpState::Launched(launched); + self.array.team().spawn(self) + } + _ => panic!("ArrayFetchBatchOpHandle should already have been spawned"), + } } /// This method will block the calling thread until the associated Array Operation completes - pub fn block(self) -> Vec { + pub fn block(mut self) -> Vec { RuntimeWarning::BlockingCall( "ArrayFetchBatchOpHandle::block", ".spawn() or .await", ) .print(); - self.array.team().block_on(self) + match &mut self.state { + FetchBatchOpState::Reqs(reqs) => { + let launched = reqs + .drain(..) + .map(|(am, res)| (am.spawn(), res)) + .collect::>, Vec)>>(); + self.state = FetchBatchOpState::Launched(launched); + self.array.team().block_on(self) + } + _ => panic!("ArrayBatchOpHandle should already have been blocked on"), + } } } impl From> for ArrayFetchOpHandle { fn from(mut req: ArrayFetchBatchOpHandle) -> Self { - Self { - array: req.array, - req: req.reqs.pop_front().unwrap().0, - } + let handle = match &mut req.state { + FetchBatchOpState::Reqs(reqs) => Self { + array: req.array.clone(), + state: FetchOpState::Req(reqs.pop_front().unwrap().0), + }, + FetchBatchOpState::Launched(reqs) => Self { + array: req.array.clone(), + state: FetchOpState::Launched(reqs.pop_front().unwrap().0), + }, + }; + req.state = FetchBatchOpState::Launched(VecDeque::new()); + handle } } @@ -177,56 +284,39 @@ impl ArrayFetchBatchOpHandle { } Self { array: array, - reqs, + state: FetchBatchOpState::Reqs(reqs), results, } } } -impl LamellarRequest for ArrayFetchBatchOpHandle { - fn blocking_wait(mut self) -> Self::Output { - for req in self.reqs.drain(0..) 
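// A condensed sketch (hypothetical Req/Task stand-ins, not the crate's types) of the
// two-phase state machine the batched operation handles above now use: sub-requests
// sit in Reqs until spawn(), block(), or the first poll() converts all of them into
// spawned tasks exactly once; afterwards only the Launched queue is ever polled.
use std::collections::VecDeque;

struct Req;
struct Task;
impl Req {
    // Stand-in for AmHandle::spawn(), which hands the AM to the work queue.
    fn spawn(self) -> Task {
        Task
    }
}

enum BatchState {
    Reqs(VecDeque<(Req, Vec<usize>)>),
    Launched(VecDeque<(Task, Vec<usize>)>),
}

impl BatchState {
    // Mirrors the drain/map/collect conversion performed in spawn(), block(), and
    // poll() above; the panic mirrors their "should already have been spawned" arms.
    fn launch(&mut self) {
        match self {
            BatchState::Reqs(reqs) => {
                let launched = reqs
                    .drain(..)
                    .map(|(am, idxs)| (am.spawn(), idxs))
                    .collect();
                *self = BatchState::Launched(launched);
            }
            BatchState::Launched(_) => panic!("batch already launched"),
        }
    }
}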
{ - let mut res = req.0.blocking_wait(); - for (val, idx) in res.drain(..).zip(req.1.iter()) { - self.results[*idx] = val; - } - } - std::mem::take(&mut self.results) - } - fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { - let mut ready = true; - for req in self.reqs.iter_mut() { - ready &= req.0.ready_or_set_waker(waker); - } - ready - } - fn val(&self) -> Self::Output { - let mut results = Vec::with_capacity(self.results.len()); - unsafe { - results.set_len(self.results.len()); - } - for req in &self.reqs { - let mut res = req.0.val(); - for (val, idx) in res.drain(..).zip(req.1.iter()) { - results[*idx] = val; - } - } - results - } -} - impl Future for ArrayFetchBatchOpHandle { type Output = Vec; fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { let mut this = self.project(); - while let Some(mut req) = this.reqs.pop_front() { - if !req.0.ready_or_set_waker(cx.waker()) { - this.reqs.push_front(req); + match &mut this.state { + FetchBatchOpState::Reqs(reqs) => { + let launched = reqs + .drain(..) + .map(|(am, res)| (am.spawn(), res)) + .collect::>, Vec)>>(); + *this.state = FetchBatchOpState::Launched(launched); + cx.waker().wake_by_ref(); return Poll::Pending; - } else { - let mut res = req.0.val(); - for (val, idx) in res.drain(..).zip(req.1.iter()) { - this.results[*idx] = val; + } + FetchBatchOpState::Launched(reqs) => { + while let Some(mut req) = reqs.pop_front() { + match Future::poll(Pin::new(&mut req.0), cx) { + Poll::Pending => { + reqs.push_front(req); + return Poll::Pending; + } + Poll::Ready(mut res) => { + for (val, idx) in res.drain(..).zip(req.1.iter()) { + this.results[*idx] = val; + } + } + } } } } @@ -237,8 +327,14 @@ impl Future for ArrayFetchBatchOpHandle { /// a task handle for a single array operation that returns a result #[must_use = "Array operation handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called. Ignoring the resulting value with 'let _ = ...' will cause the operation to NOT BE executed."] pub struct ArrayResultOpHandle { + // dropped handle triggered by AmHandle pub(crate) array: LamellarByteArray, //prevents prematurely performing a local drop - pub(crate) req: AmHandle>>, + pub(crate) state: ResultOpState, +} + +pub(crate) enum ResultOpState { + Req(AmHandle>>), + Launched(LamellarTask>>), } impl ArrayResultOpHandle { @@ -247,82 +343,134 @@ impl ArrayResultOpHandle { /// /// This function returns a handle that can be used to wait for the operation to complete #[must_use = "this function returns a future used to poll for completion. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. 
Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] - pub fn spawn(self) -> LamellarTask> { - self.array.team().spawn(self) + pub fn spawn(mut self) -> LamellarTask> { + match self.state { + ResultOpState::Req(req) => { + self.state = ResultOpState::Launched(req.spawn()); + self.array.team().spawn(self) + } + _ => panic!("ArrayResultOpHandle should already have been spawned"), + } } /// This method will block the calling thread until the associated Array Operation completes - pub fn block(self) -> Result { + pub fn block(mut self) -> Result { RuntimeWarning::BlockingCall( "ArrayResultOpHandle::block", ".spawn() or .await", ) .print(); - self.array.team().block_on(self) - } -} - -impl LamellarRequest for ArrayResultOpHandle { - fn blocking_wait(self) -> Self::Output { - self.req - .blocking_wait() - .pop() - .expect("should have a single request") - } - fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { - self.req.ready_or_set_waker(waker) - } - fn val(&self) -> Self::Output { - self.req.val().pop().expect("should have a single request") + match self.state { + ResultOpState::Req(req) => { + self.state = ResultOpState::Launched(req.spawn()); + self.array.team().block_on(self) + } + _ => panic!("ArrayResultOpHandle should already have been spawned"), + } } } impl Future for ArrayResultOpHandle { type Output = Result; fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { - if self.req.ready_or_set_waker(cx.waker()) { - return Poll::Ready(self.req.val().pop().expect("should have a single request")); + match &mut self.state { + ResultOpState::Req(req) => { + if req.ready_or_set_waker(cx.waker()) { + return Poll::Ready(req.val().pop().expect("should have a single request")); + } + } + ResultOpState::Launched(req) => { + if let Poll::Ready(mut res) = Future::poll(Pin::new(req), cx) { + return Poll::Ready(res.pop().expect("should have a single request")); + } + } } Poll::Pending } } /// a task handle for a batched array operation that returns results -#[pin_project] +#[pin_project(PinnedDrop)] #[must_use = "Array operation handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called. Ignoring the resulting value with 'let _ = ...' will cause the operation to NOT BE executed."] pub struct ArrayResultBatchOpHandle { pub(crate) array: LamellarByteArray, //prevents prematurely performing a local drop - pub(crate) reqs: VecDeque<(AmHandle>>, Vec)>, + pub(crate) state: BatchResultOpState, //reqs: , results: Vec>, } +pub(crate) enum BatchResultOpState { + Reqs(VecDeque<(AmHandle>>, Vec)>), + Launched(VecDeque<(LamellarTask>>, Vec)>), +} + +#[pinned_drop] +impl PinnedDrop for ArrayResultBatchOpHandle { + fn drop(self: Pin<&mut Self>) { + let mut this = self.project(); + if let BatchResultOpState::Reqs(reqs) = &mut this.state { + RuntimeWarning::disable_warnings(); + for _ in reqs.drain(0..) {} + RuntimeWarning::enable_warnings(); + RuntimeWarning::DroppedHandle("an ArrayResultBatchOpHandle").print(); + } + } +} + impl ArrayResultBatchOpHandle { /// This method will spawn the associated Array Operation on the work queue, /// initiating the remote operation. /// /// This function returns a handle that can be used to wait for the operation to complete #[must_use = "this function returns a future used to poll for completion. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. 
Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] - pub fn spawn(self) -> LamellarTask>> { - self.array.team().spawn(self) + pub fn spawn(mut self) -> LamellarTask>> { + match &mut self.state { + BatchResultOpState::Reqs(reqs) => { + let launched = reqs + .drain(..) + .map(|(am, res)| (am.spawn(), res)) + .collect::>>, Vec)>>(); + self.state = BatchResultOpState::Launched(launched); + self.array.team().spawn(self) + } + _ => panic!("ArrayFetchBatchOpHandle should already have been spawned"), + } } /// This method will block the calling thread until the associated Array Operation completes - pub fn block(self) -> Vec> { + pub fn block(mut self) -> Vec> { RuntimeWarning::BlockingCall( "ArrayResultBatchOpHandle::block", ".spawn() or .await", ) .print(); - self.array.team().block_on(self) + match &mut self.state { + BatchResultOpState::Reqs(reqs) => { + let launched = reqs + .drain(..) + .map(|(am, res)| (am.spawn(), res)) + .collect::>>, Vec)>>(); + self.state = BatchResultOpState::Launched(launched); + self.array.team().block_on(self) + } + _ => panic!("ArrayBatchOpHandle should already have been blocked on"), + } } } impl From> for ArrayResultOpHandle { fn from(mut req: ArrayResultBatchOpHandle) -> Self { - Self { - array: req.array, - req: req.reqs.pop_front().unwrap().0, - } + let handle = match &mut req.state { + BatchResultOpState::Reqs(reqs) => Self { + array: req.array.clone(), + state: ResultOpState::Req(reqs.pop_front().unwrap().0), + }, + BatchResultOpState::Launched(reqs) => Self { + array: req.array.clone(), + state: ResultOpState::Launched(reqs.pop_front().unwrap().0), + }, + }; + req.state = BatchResultOpState::Launched(VecDeque::new()); + handle } } @@ -338,56 +486,42 @@ impl ArrayResultBatchOpHandle { } Self { array: array, - reqs, + state: BatchResultOpState::Reqs(reqs), results, } } } -impl LamellarRequest for ArrayResultBatchOpHandle { - fn blocking_wait(mut self) -> Self::Output { - for req in self.reqs.drain(0..) { - let mut res = req.0.blocking_wait(); - for (val, idx) in res.drain(..).zip(req.1.iter()) { - self.results[*idx] = val; - } - } - std::mem::take(&mut self.results) - } - fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { - let mut ready = true; - for req in self.reqs.iter_mut() { - ready &= req.0.ready_or_set_waker(waker); - } - ready - } - fn val(&self) -> Self::Output { - let mut results = Vec::with_capacity(self.results.len()); - unsafe { - results.set_len(self.results.len()); - } - for req in &self.reqs { - let mut res = req.0.val(); - for (val, idx) in res.drain(..).zip(req.1.iter()) { - results[*idx] = val; - } - } - results - } -} - impl Future for ArrayResultBatchOpHandle { type Output = Vec>; fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { let mut this = self.project(); - while let Some(mut req) = this.reqs.pop_front() { - if !req.0.ready_or_set_waker(cx.waker()) { - this.reqs.push_front(req); + match &mut this.state { + BatchResultOpState::Reqs(reqs) => { + // println!("launching sub ams"); + let launched = reqs + .drain(..) 
+ .map(|(am, res)| (am.spawn(), res)) + .collect::>>, Vec)>>(); + *this.state = BatchResultOpState::Launched(launched); + cx.waker().wake_by_ref(); return Poll::Pending; - } else { - let mut res = req.0.val(); - for (val, idx) in res.drain(..).zip(req.1.iter()) { - this.results[*idx] = val; + } + BatchResultOpState::Launched(reqs) => { + // println!("polling sub ams"); + while let Some(mut req) = reqs.pop_front() { + match Future::poll(Pin::new(&mut req.0), cx) { + Poll::Pending => { + reqs.push_front(req); + return Poll::Pending; + } + Poll::Ready(mut res) => { + // println!("res: {:?}", res.len()); + for (val, idx) in res.drain(..).zip(req.1.iter()) { + this.results[*idx] = val; + } + } + } } } } diff --git a/src/array/unsafe.rs b/src/array/unsafe.rs index 581aacd6..4c550946 100644 --- a/src/array/unsafe.rs +++ b/src/array/unsafe.rs @@ -535,7 +535,7 @@ impl UnsafeArray { .launched_req_cnt .load(Ordering::SeqCst) { - RuntimeWarning::UnspanedTask( + RuntimeWarning::UnspawnedTask( "`await_all` on an array before all operations, iterators, etc, created by the array have been spawned", ) .print(); @@ -1045,25 +1045,25 @@ impl ActiveMessaging for UnsafeArray { .exec_am_local_tg(am, Some(self.team_counters())) } fn wait_all(&self) { - if self - .inner - .data - .array_counters - .send_req_cnt - .load(Ordering::SeqCst) - != self - .inner - .data - .array_counters - .launched_req_cnt - .load(Ordering::SeqCst) - { - RuntimeWarning::UnspanedTask( - "`wait_all` on an array before all operations, iterators, etc, created by the array have been spawned", - ) - .print(); - } let mut temp_now = Instant::now(); + // println!( + // "in array wait_all cnt: {:?} {:?} {:?}", + // self.inner + // .data + // .array_counters + // .send_req_cnt + // .load(Ordering::SeqCst), + // self.inner + // .data + // .array_counters + // .outstanding_reqs + // .load(Ordering::SeqCst), + // self.inner + // .data + // .array_counters + // .launched_req_cnt + // .load(Ordering::SeqCst) + // ); // let mut first = true; while self .inner @@ -1098,6 +1098,42 @@ impl ActiveMessaging for UnsafeArray { // first = false; } } + if self + .inner + .data + .array_counters + .send_req_cnt + .load(Ordering::SeqCst) + != self + .inner + .data + .array_counters + .launched_req_cnt + .load(Ordering::SeqCst) + { + println!( + "in array wait_all cnt: {:?} {:?} {:?}", + self.inner + .data + .array_counters + .send_req_cnt + .load(Ordering::SeqCst), + self.inner + .data + .array_counters + .outstanding_reqs + .load(Ordering::SeqCst), + self.inner + .data + .array_counters + .launched_req_cnt + .load(Ordering::SeqCst) + ); + RuntimeWarning::UnspawnedTask( + "`wait_all` on an array before all operations, iterators, etc, created by the array have been spawned", + ) + .print(); + } self.inner.data.task_group.wait_all(); } fn await_all(&self) -> impl Future + Send { @@ -1114,7 +1150,14 @@ impl ActiveMessaging for UnsafeArray { F: Future + Send + 'static, F::Output: Send, { - self.inner.data.team.scheduler.spawn_task(f) + self.inner.data.team.scheduler.spawn_task( + f, + vec![ + self.inner.data.team.world_counters.clone(), + self.inner.data.team.team_counters.clone(), + self.inner.data.array_counters.clone(), + ], + ) } fn block_on(&self, f: F) -> F::Output { self.inner.data.team.scheduler.block_on(f) @@ -1572,6 +1615,23 @@ impl UnsafeArrayInnerWeak { } impl UnsafeArrayInner { + pub(crate) fn spawn(&self, f: F) -> LamellarTask + where + F: Future + Send + 'static, + F::Output: Send, + { + self.data.team.scheduler.spawn_task( + f, + vec![ + 
self.data.team.world_counters.clone(), + self.data.team.team_counters.clone(), + self.data.array_counters.clone(), + ], + ) + } + pub(crate) fn block_on(&self, f: F) -> F::Output { + self.data.team.scheduler.block_on(f) + } pub(crate) fn downgrade(array: &UnsafeArrayInner) -> UnsafeArrayInnerWeak { UnsafeArrayInnerWeak { data: Darc::downgrade(&array.data), diff --git a/src/array/unsafe/iteration/local.rs b/src/array/unsafe/iteration/local.rs index f691fab7..73022949 100644 --- a/src/array/unsafe/iteration/local.rs +++ b/src/array/unsafe/iteration/local.rs @@ -38,13 +38,15 @@ macro_rules! consumer_impl { { let am = $($am)*; let inner = self.clone(); - let reqs_future = Box::pin(async move{match sched { + let reqs_future = Box::pin(async move{ + match sched { Schedule::Static => inner.sched_static(am), Schedule::Dynamic => inner.sched_dynamic(am), Schedule::Chunk(size) => inner.sched_chunk(am,size), Schedule::Guided => inner.sched_guided(am), Schedule::WorkStealing => inner.sched_work_stealing(am), - }}); + } + }); $return_type::new(reqs_future,self) } } diff --git a/src/array/unsafe/operations.rs b/src/array/unsafe/operations.rs index 4e41cf6e..47f3b398 100644 --- a/src/array/unsafe/operations.rs +++ b/src/array/unsafe/operations.rs @@ -348,7 +348,7 @@ impl UnsafeArray { }; ArrayBatchOpHandle { array: byte_array, - reqs: res, + state: BatchOpState::Reqs(res), } } @@ -600,7 +600,7 @@ impl UnsafeArray { let num_per_batch = (config().am_size_threshold as f32 / std::mem::size_of::() as f32).ceil() as usize; - // println!("multi_val_one_index"); + println!("multi_val_one_index"); let cnt = Arc::new(AtomicUsize::new(0)); let futures = Arc::new(Mutex::new(VecDeque::new())); let (pe, local_index) = match self.pe_and_offset_for_global_index(index) { diff --git a/src/array/unsafe/rdma.rs b/src/array/unsafe/rdma.rs index 0baf9825..b67aa44b 100644 --- a/src/array/unsafe/rdma.rs +++ b/src/array/unsafe/rdma.rs @@ -633,6 +633,7 @@ impl UnsafeArray { Err(_) => ArrayRdmaHandle { array: self.as_lamellar_byte_array(), reqs: VecDeque::new(), + spawned: false, }, } } @@ -644,6 +645,7 @@ impl UnsafeArray { array: self.as_lamellar_byte_array(), req: None, buf: buf, + spawned: false, } } @@ -736,6 +738,7 @@ impl LamellarArrayInternalGet for UnsafeArray { ArrayRdmaHandle { array: self.as_lamellar_byte_array(), reqs: reqs, + spawned: false, } } @@ -757,6 +760,7 @@ impl LamellarArrayInternalPut for UnsafeArray { ArrayRdmaHandle { array: self.as_lamellar_byte_array(), reqs: reqs, + spawned: false, } } } @@ -772,6 +776,7 @@ impl LamellarArrayPut for UnsafeArray { Err(_) => ArrayRdmaHandle { array: self.as_lamellar_byte_array(), reqs: VecDeque::new(), + spawned: false, }, } } diff --git a/src/barrier.rs b/src/barrier.rs index 6aa157fe..f9dcdf81 100644 --- a/src/barrier.rs +++ b/src/barrier.rs @@ -7,7 +7,7 @@ use crate::scheduler::Scheduler; use crate::warnings::RuntimeWarning; use futures_util::Future; -use pin_project::pin_project; +use pin_project::{pin_project, pinned_drop}; use std::pin::Pin; use std::sync::atomic::{AtomicU8, AtomicUsize, Ordering}; use std::sync::Arc; @@ -308,6 +308,7 @@ impl Barrier { num_rounds: self.num_rounds, n: self.n, state: State::RoundInit(self.num_rounds), + launched: false, }; // println!("in barrier handle"); // self.print_bar(); @@ -348,73 +349,6 @@ impl Barrier { pub(crate) async fn async_barrier(&self) { self.barrier_handle().await; } - - // pub(crate) async fn async_barrier(&self) { - // let mut s = Instant::now(); - // if self.panic.load(Ordering::SeqCst) == 0 { - // if let 
Some(send_buf) = &self.send_buf { - // if let Ok(my_index) = self.arch.team_pe(self.my_pe) { - // let send_buf_slice = unsafe { - // // im the only thread (remote or local) that can write to this buff - // send_buf.as_mut_slice().expect("Data should exist on PE") - // }; - - // let barrier_id = self.barrier_cnt.fetch_add(1, Ordering::SeqCst); - // send_buf_slice[0] = barrier_id; - // let barrier_slice = &[barrier_id]; - - // for round in 0..self.num_rounds { - // for i in 1..=self.n { - // let team_send_pe = - // (my_index + i * (self.n + 1).pow(round as u32)) % self.num_pes; - // if team_send_pe != my_index { - // let send_pe = self.arch.single_iter(team_send_pe).next().unwrap(); - // unsafe { - // self.barrier_buf[i - 1].put_slice( - // send_pe, - // round, - // barrier_slice, - // ); - // //safe as we are the only ones writing to our index - // } - // } - // } - // for i in 1..=self.n { - // let team_recv_pe = ((my_index as isize - // - (i as isize * (self.n as isize + 1).pow(round as u32) as isize)) - // as isize) - // .rem_euclid(self.num_pes as isize) - // as isize; - // let recv_pe = - // self.arch.single_iter(team_recv_pe as usize).next().unwrap(); - // if team_recv_pe as usize != my_index { - // unsafe { - // //safe as each pe is only capable of writing to its own index - // while self.barrier_buf[i - 1] - // .as_mut_slice() - // .expect("Data should exist on PE")[round] - // < barrier_id - // { - // self.barrier_timeout( - // &mut s, - // my_index, - // round, - // i, - // team_recv_pe, - // recv_pe, - // send_buf_slice, - // ); - // self.lamellae.flush(); - // async_std::task::yield_now().await; - // } - // } - // } - // } - // } - // } - // } - // } - // } } // impl Drop for Barrier { @@ -426,7 +360,7 @@ impl Barrier { // } // } -#[pin_project] +#[pin_project(PinnedDrop)] pub struct BarrierHandle { barrier_buf: Arc>>, arch: Arc, @@ -438,6 +372,16 @@ pub struct BarrierHandle { num_rounds: usize, n: usize, state: State, + launched: bool, +} + +#[pinned_drop] +impl PinnedDrop for BarrierHandle { + fn drop(self: Pin<&mut Self>) { + if !self.launched { + RuntimeWarning::DroppedHandle("a BarrierHandle").print(); + } + } } enum State { @@ -490,8 +434,9 @@ impl BarrierHandle { impl Future for BarrierHandle { type Output = (); - fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { // let mut this = self.project(); + self.launched = true; match self.state { State::Waiting => { if self.barrier_id > self.cur_barrier_id.load(Ordering::SeqCst) { @@ -553,7 +498,8 @@ impl Future for BarrierHandle { } impl LamellarRequest for BarrierHandle { - fn blocking_wait(self) -> Self::Output { + fn blocking_wait(mut self) -> Self::Output { + self.launched = true; match self.state { State::Waiting => { while self.barrier_id > self.cur_barrier_id.load(Ordering::SeqCst) { @@ -612,6 +558,7 @@ impl LamellarRequest for BarrierHandle { } fn ready_or_set_waker(&mut self, _waker: &Waker) -> bool { + self.launched = true; match self.state { State::Waiting => false, State::RoundInit(round) => { diff --git a/src/darc.rs b/src/darc.rs index 4d58026d..1848ec07 100644 --- a/src/darc.rs +++ b/src/darc.rs @@ -1404,6 +1404,7 @@ impl Darc { IntoLocalRwDarcHandle { darc: self.into(), team, + launched: false, outstanding_future: Box::pin(async move { DarcInner::block_on_outstanding(wrapped_inner, DarcMode::LocalRw, 0).await; }), @@ -1439,6 +1440,7 @@ impl Darc { IntoGlobalRwDarcHandle { darc: self.into(), team, + launched: false, outstanding_future: 
Box::pin(async move { DarcInner::block_on_outstanding(wrapped_inner, DarcMode::GlobalRw, 0).await; }), diff --git a/src/darc/global_rw_darc.rs b/src/darc/global_rw_darc.rs index 01346a73..a6074f4f 100644 --- a/src/darc/global_rw_darc.rs +++ b/src/darc/global_rw_darc.rs @@ -797,6 +797,7 @@ impl GlobalRwDarc { IntoDarcHandle { darc: self.into(), team, + launched: false, outstanding_future: Box::pin(async move { DarcInner::block_on_outstanding(wrapped_inner, DarcMode::Darc, 0).await; }), @@ -833,6 +834,7 @@ impl GlobalRwDarc { IntoLocalRwDarcHandle { darc: self.into(), team, + launched: false, outstanding_future: Box::pin(async move { DarcInner::block_on_outstanding(wrapped_inner, DarcMode::LocalRw, 0).await; }), diff --git a/src/darc/handle.rs b/src/darc/handle.rs index 129ee169..c89f96dd 100644 --- a/src/darc/handle.rs +++ b/src/darc/handle.rs @@ -13,7 +13,7 @@ use crate::{GlobalRwDarc, LamellarTeamRT}; use async_lock::{RwLock, RwLockReadGuardArc, RwLockWriteGuardArc}; use futures_util::{ready, Future}; -use pin_project::pin_project; +use pin_project::{pin_project, pinned_drop}; use super::global_rw_darc::{ DistRwLock, GlobalRwDarcCollectiveWriteGuard, GlobalRwDarcReadGuard, GlobalRwDarcWriteGuard, @@ -26,10 +26,11 @@ enum State { Init, TryingRead(#[pin] Pin> + Send + 'static>>), TryingWrite(#[pin] Pin> + Send + 'static>>), + Dropped, } #[must_use = "LocalRwDarc lock handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] -#[pin_project] +#[pin_project(PinnedDrop)] /// Handle used to retrieve the aquired read lock from a LocalRwDarc /// /// This handle must be awaited or blocked on to acquire the lock @@ -72,14 +73,29 @@ enum State { ///``` pub struct LocalRwDarcReadHandle { darc: LocalRwDarc, + pub(crate) launched: bool, #[pin] state: State, } +#[pinned_drop] +impl PinnedDrop for LocalRwDarcReadHandle { + fn drop(self: Pin<&mut Self>) { + if !self.launched { + let mut this = self.project(); + RuntimeWarning::disable_warnings(); + *this.state = State::Dropped; + RuntimeWarning::enable_warnings(); + RuntimeWarning::DroppedHandle("a LocalRwDarcReadHandle").print(); + } + } +} + impl LocalRwDarcReadHandle { pub(crate) fn new(darc: LocalRwDarc) -> Self { Self { darc, + launched: false, state: State::Init, } } @@ -99,7 +115,8 @@ impl LocalRwDarcReadHandle { /// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); /// ///``` - pub fn block(self) -> LocalRwDarcReadGuard { + pub fn block(mut self) -> LocalRwDarcReadGuard { + self.launched = true; RuntimeWarning::BlockingCall( "LocalRwDarcReadHandle::block", ".spawn() or.await", @@ -115,7 +132,7 @@ impl LocalRwDarcReadHandle { .clone() .block_on(async move { inner_darc.read_arc().await }); LocalRwDarcReadGuard { - darc: self.darc, + darc: self.darc.clone(), lock: guard, } } @@ -140,14 +157,16 @@ impl LocalRwDarcReadHandle { /// ///``` #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. 
Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] - pub fn spawn(self) -> LamellarTask> { + pub fn spawn(mut self) -> LamellarTask> { + self.launched = true; self.darc.darc.team().spawn(self) } } impl Future for LocalRwDarcReadHandle { type Output = LocalRwDarcReadGuard; - fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + self.launched = true; let inner_darc = self.darc.darc.clone(); let mut this = self.project(); match this.state.as_mut().project() { @@ -170,7 +189,7 @@ impl Future for LocalRwDarcReadHandle { } #[must_use = "LocalRwDarc lock handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] -#[pin_project] +#[pin_project(PinnedDrop)] /// Handle used to retrieve the aquired write lock from a LocalRwDarc /// /// This handle must be awaited or blocked on to acquire the lock @@ -212,14 +231,29 @@ impl Future for LocalRwDarcReadHandle { ///``` pub struct LocalRwDarcWriteHandle { darc: LocalRwDarc, + pub(crate) launched: bool, #[pin] state: State, } +#[pinned_drop] +impl PinnedDrop for LocalRwDarcWriteHandle { + fn drop(self: Pin<&mut Self>) { + if !self.launched { + let mut this = self.project(); + RuntimeWarning::disable_warnings(); + *this.state = State::Dropped; + RuntimeWarning::enable_warnings(); + RuntimeWarning::DroppedHandle("a LocalRwDarcWriteHandle").print(); + } + } +} + impl LocalRwDarcWriteHandle { pub(crate) fn new(darc: LocalRwDarc) -> Self { Self { darc, + launched: false, state: State::Init, } } @@ -238,7 +272,8 @@ impl LocalRwDarcWriteHandle { /// let mut guard = handle.block(); //block until we get the write lock /// *guard += my_pe; ///``` - pub fn block(self) -> LocalRwDarcWriteGuard { + pub fn block(mut self) -> LocalRwDarcWriteGuard { + self.launched = true; RuntimeWarning::BlockingCall( "LocalRwDarcWriteHandle::block", ".spawn() or.await", @@ -254,7 +289,7 @@ impl LocalRwDarcWriteHandle { .clone() .block_on(async move { inner_darc.write_arc().await }); LocalRwDarcWriteGuard { - darc: self.darc, + darc: self.darc.clone(), lock: guard, } } @@ -278,14 +313,16 @@ impl LocalRwDarcWriteHandle { /// *guard += my_pe; ///``` #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. 
Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] - pub fn spawn(self) -> LamellarTask> { + pub fn spawn(mut self) -> LamellarTask> { + self.launched = true; self.darc.darc.team().spawn(self) } } impl Future for LocalRwDarcWriteHandle { type Output = LocalRwDarcWriteGuard; - fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + self.launched = true; let inner_darc = self.darc.darc.clone(); let mut this = self.project(); match this.state.as_mut().project() { @@ -308,7 +345,7 @@ impl Future for LocalRwDarcWriteHandle { } #[must_use = "GlobalRwDarc lock handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] -#[pin_project] +#[pin_project] //unused drop warning triggered by AmHandle /// Handle used to retrieve the aquired read lock from a GlobalRwDarc /// /// This handle must be awaited or blocked on to acquire the lock @@ -424,7 +461,7 @@ impl Future for GlobalRwDarcReadHandle { } #[must_use = "GlobalRwDarc lock handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] -#[pin_project] +#[pin_project] //unused drop warning triggered by AmHandle /// Handle used to retrieve the aquired write lock from a GlobalRwDarc /// /// This handle must be awaited or blocked on to acquire the lock @@ -538,7 +575,7 @@ impl Future for GlobalRwDarcWriteHandle { } #[must_use = "GlobalRwDarc lock handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] -#[pin_project] +#[pin_project] //unused drop warning triggered by AmHandle /// Handle used to retrieve the aquired collective write lock from a GlobalRwDarc /// /// This handle must be awaited or blocked on to actually acquire the lock @@ -702,7 +739,7 @@ impl OrigDarc { } #[must_use = " Darc 'into' handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] -#[pin_project] +#[pin_project(PinnedDrop)] #[doc(alias = "Collective")] /// This is a handle representing the operation of changing from a [LocalRwDarc] or [GlobalRwDarc] into a regular [Darc]. /// This handled must either be awaited in an async context or blocked on in a non-async context for the operation to be performed. @@ -731,10 +768,20 @@ impl OrigDarc { pub struct IntoDarcHandle { pub(crate) darc: OrigDarc, pub(crate) team: Pin>, + pub(crate) launched: bool, #[pin] pub(crate) outstanding_future: Pin + Send>>, } +#[pinned_drop] +impl PinnedDrop for IntoDarcHandle { + fn drop(self: Pin<&mut Self>) { + if !self.launched { + RuntimeWarning::DroppedHandle("a IntoDarcHandle").print(); + } + } +} + impl IntoDarcHandle { /// Used to drive to conversion of a [LocalRwDarc] or [GlobalRwDarc] into a [Darc] /// # Examples @@ -745,7 +792,8 @@ impl IntoDarcHandle { /// let world = LamellarWorldBuilder::new().build(); /// let five = LocalRwDarc::new(&world,5).expect("PE in world team"); /// let five_as_darc = five.into_darc().block(); - pub fn block(self) -> Darc { + pub fn block(mut self) -> Darc { + self.launched = true; RuntimeWarning::BlockingCall( "IntoDarcHandle::block", ".spawn() or .await", @@ -768,14 +816,16 @@ impl IntoDarcHandle { /// let five_as_darc = five_as_darc_task.block(); /// ``` #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. 
Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] - pub fn spawn(self) -> LamellarTask> { + pub fn spawn(mut self) -> LamellarTask> { + self.launched = true; self.team.clone().spawn(self) } } impl Future for IntoDarcHandle { type Output = Darc; - fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + self.launched = true; let mut this = self.project(); ready!(this.outstanding_future.as_mut().poll(cx)); this.darc.inc_local_cnt(); @@ -790,7 +840,7 @@ impl Future for IntoDarcHandle { } #[must_use = " Darc 'into' handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] -#[pin_project] +#[pin_project(PinnedDrop)] #[doc(alias = "Collective")] /// This is a handle representing the operation of changing from a [Darc] or [GlobalRwDarc] into a [LocalRwDarc]. /// This handled must either be awaited in an async context or blocked on in a non-async context for the operation to be performed. @@ -819,10 +869,20 @@ impl Future for IntoDarcHandle { pub struct IntoLocalRwDarcHandle { pub(crate) darc: OrigDarc, pub(crate) team: Pin>, + pub(crate) launched: bool, #[pin] pub(crate) outstanding_future: Pin + Send>>, } +#[pinned_drop] +impl PinnedDrop for IntoLocalRwDarcHandle { + fn drop(self: Pin<&mut Self>) { + if !self.launched { + RuntimeWarning::DroppedHandle("a IntoLocalRwDarcHandle").print(); + } + } +} + impl IntoLocalRwDarcHandle { /// Used to drive to conversion of a [Darc] or [GlobalRwDarc] into a [LocalRwDarc] /// # Examples @@ -833,7 +893,8 @@ impl IntoLocalRwDarcHandle { /// let world = LamellarWorldBuilder::new().build(); /// let five = GlobalRwDarc::new(&world,5).expect("PE in world team"); /// let five_as_localrw = five.into_localrw().block(); - pub fn block(self) -> LocalRwDarc { + pub fn block(mut self) -> LocalRwDarc { + self.launched = true; RuntimeWarning::BlockingCall( "IntoLocalRwDarcHandle::block", ".spawn() or.await", @@ -857,14 +918,16 @@ impl IntoLocalRwDarcHandle { /// let five_as_localrw = five_as_localrw_task.block(); /// ``` #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] - pub fn spawn(self) -> LamellarTask> { + pub fn spawn(mut self) -> LamellarTask> { + self.launched = true; self.team.clone().spawn(self) } } impl Future for IntoLocalRwDarcHandle { type Output = LocalRwDarc; - fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + self.launched = true; let mut this = self.project(); ready!(this.outstanding_future.as_mut().poll(cx)); this.darc.inc_local_cnt(); @@ -880,7 +943,7 @@ impl Future for IntoLocalRwDarcHandle { } #[must_use = " Darc 'into' handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] -#[pin_project] +#[pin_project(PinnedDrop)] #[doc(alias = "Collective")] /// This is a handle representing the operation of changing from a [Darc] or [LocalRwDarc] into a [GlobalRwDarc]. /// This handled must either be awaited in an async context or blocked on in a non-async context for the operation to be performed. 
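A minimal usage sketch of driving one of these conversion handles, assuming the `lamellar::darc::prelude` imports used by the crate's other doc examples (the exact prelude contents are not shown in this patch):

use lamellar::darc::prelude::*; // assumed to export LocalRwDarc/GlobalRwDarc as in the doc examples above

fn main() {
    let world = lamellar::LamellarWorldBuilder::new().build();

    // non-async context: block the calling thread until the conversion completes
    let five = LocalRwDarc::new(&world, 5).expect("PE in world team");
    let five_as_globalrw = five.into_globalrw().block();

    // alternatively, spawn the conversion as a task and retrieve the result later
    let six = LocalRwDarc::new(&world, 6).expect("PE in world team");
    let six_task = six.into_globalrw().spawn();
    let six_as_globalrw = six_task.block();

    let _ = (five_as_globalrw, six_as_globalrw);
}

Either path marks the handle as launched, so the PinnedDrop warning added in this patch for dropped, never-driven handles is not triggered.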
@@ -909,10 +972,20 @@ impl Future for IntoLocalRwDarcHandle { pub struct IntoGlobalRwDarcHandle { pub(crate) darc: OrigDarc, pub(crate) team: Pin>, + pub(crate) launched: bool, #[pin] pub(crate) outstanding_future: Pin + Send>>, } +#[pinned_drop] +impl PinnedDrop for IntoGlobalRwDarcHandle { + fn drop(self: Pin<&mut Self>) { + if !self.launched { + RuntimeWarning::DroppedHandle("a IntoGlobalRwDarcHandle").print(); + } + } +} + impl IntoGlobalRwDarcHandle { /// Used to drive to conversion of a [Darc] or [LocalRwDarc] into a [GlobalRwDarc] /// # Examples @@ -923,7 +996,8 @@ impl IntoGlobalRwDarcHandle { /// let world = LamellarWorldBuilder::new().build(); /// let five = LocalRwDarc::new(&world,5).expect("PE in world team"); /// let five_as_globalrw = five.into_globalrw().block(); - pub fn block(self) -> GlobalRwDarc { + pub fn block(mut self) -> GlobalRwDarc { + self.launched = true; RuntimeWarning::BlockingCall( "IntoGlobalRwDarcHandle::block", ".spawn() or.await", @@ -946,14 +1020,16 @@ impl IntoGlobalRwDarcHandle { /// let five_as_globalrw_task = five.into_globalrw().spawn(); /// let five_as_globalrw = five_as_globalrw_task.block(); #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] - pub fn spawn(self) -> LamellarTask> { + pub fn spawn(mut self) -> LamellarTask> { + self.launched = true; self.team.clone().spawn(self) } } impl Future for IntoGlobalRwDarcHandle { type Output = GlobalRwDarc; - fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + self.launched = true; let mut this = self.project(); ready!(this.outstanding_future.as_mut().poll(cx)); this.darc.inc_local_cnt(); diff --git a/src/darc/local_rw_darc.rs b/src/darc/local_rw_darc.rs index 16326798..e028a625 100644 --- a/src/darc/local_rw_darc.rs +++ b/src/darc/local_rw_darc.rs @@ -340,6 +340,7 @@ impl LocalRwDarc { IntoGlobalRwDarcHandle { darc: self.into(), team, + launched: false, outstanding_future: Box::pin(DarcInner::block_on_outstanding( wrapped_inner, DarcMode::GlobalRw, @@ -380,6 +381,7 @@ impl LocalRwDarc { IntoDarcHandle { darc: self.into(), team, + launched: false, outstanding_future: Box::pin(async move { DarcInner::block_on_outstanding(wrapped_inner, DarcMode::Darc, 0).await; }), diff --git a/src/lamellar_task_group.rs b/src/lamellar_task_group.rs index 4091bd76..635048ae 100644 --- a/src/lamellar_task_group.rs +++ b/src/lamellar_task_group.rs @@ -53,6 +53,9 @@ pub struct TaskGroupAmHandle { #[pinned_drop] impl PinnedDrop for TaskGroupAmHandle { fn drop(self: Pin<&mut Self>) { + if self.am.is_some() { + RuntimeWarning::DroppedHandle("an TaskGroupAmHandle").print(); + } self.inner.cnt.fetch_sub(1, Ordering::SeqCst); } } @@ -142,7 +145,7 @@ impl TaskGroupAmHandle { #[must_use = "this function returns a future used to poll for completion. 
If ignored/dropped the only way to ensure completion is calling 'wait_all()' on the world or array"] pub fn spawn(mut self) -> LamellarTask { self.launch_am_if_needed(); - self.inner.scheduler.clone().spawn_task(self) + self.inner.scheduler.clone().spawn_task(self, Vec::new()) //counters managed by AM } /// This method will block the calling thread until the associated Array Operation completes pub fn block(mut self) -> T { @@ -248,6 +251,9 @@ pub struct TaskGroupMultiAmHandle { #[pinned_drop] impl PinnedDrop for TaskGroupMultiAmHandle { fn drop(self: Pin<&mut Self>) { + if self.am.is_some() { + RuntimeWarning::DroppedHandle("an TaskGroupMultiAmHandle").print(); + } self.inner.cnt.fetch_sub(1, Ordering::SeqCst); } } @@ -340,7 +346,7 @@ impl TaskGroupMultiAmHandle { #[must_use = "this function returns a future used to poll for completion. If ignored/dropped the only way to ensure completion is calling 'wait_all()' on the world or array"] pub fn spawn(mut self) -> LamellarTask> { self.launch_am_if_needed(); - self.inner.scheduler.clone().spawn_task(self) + self.inner.scheduler.clone().spawn_task(self, Vec::new()) //counters managed by AM } /// This method will block the calling thread until the associated Array Operation completes pub fn block(mut self) -> Vec { @@ -464,6 +470,9 @@ pub struct TaskGroupLocalAmHandle { #[pinned_drop] impl PinnedDrop for TaskGroupLocalAmHandle { fn drop(self: Pin<&mut Self>) { + if self.am.is_some() { + RuntimeWarning::DroppedHandle("an TaskGroupLocalAmHandle").print(); + } self.inner.cnt.fetch_sub(1, Ordering::SeqCst); } } @@ -514,7 +523,7 @@ impl TaskGroupLocalAmHandle { #[must_use = "this function returns a future used to poll for completion. If ignored/dropped the only way to ensure completion is calling 'wait_all()' on the world or array"] pub fn spawn(mut self) -> LamellarTask { self.launch_am_if_needed(); - self.inner.scheduler.clone().spawn_task(self) + self.inner.scheduler.clone().spawn_task(self, Vec::new()) //counters managed by AM } /// This method will block the calling thread until the associated Array Operation completes pub fn block(mut self) -> T { @@ -708,7 +717,14 @@ impl ActiveMessaging for LamellarTaskGroup { F: Future + Send + 'static, F::Output: Send, { - self.team.scheduler.spawn_task(task) + self.team.scheduler.spawn_task( + task, + vec![ + self.team.world_counters.clone(), + self.team.team_counters.clone(), + self.counters.clone(), + ], + ) } fn block_on(&self, f: F) -> F::Output where @@ -724,10 +740,18 @@ impl ActiveMessaging for LamellarTaskGroup { ::Item: Future + Send + 'static, <::Item as Future>::Output: Send, { - self.team.scheduler.block_on(join_all( - iter.into_iter() - .map(|task| self.team.scheduler.spawn_task(task)), - )) + self.team + .scheduler + .block_on(join_all(iter.into_iter().map(|task| { + self.team.scheduler.spawn_task( + task, + vec![ + self.team.world_counters.clone(), + self.team.team_counters.clone(), + self.counters.clone(), + ], + ) + }))) } } @@ -801,17 +825,13 @@ impl LamellarTaskGroup { fn wait_all(&self) { RuntimeWarning::BlockingCall("wait_all", "await_all().await").print(); - - if self.counters.send_req_cnt.load(Ordering::SeqCst) - != self.counters.launched_req_cnt.load(Ordering::SeqCst) - || self.counters.send_req_cnt.load(Ordering::SeqCst) - != self.counters.launched_req_cnt.load(Ordering::SeqCst) - { - RuntimeWarning::UnspanedTask( - "`wait_all` on an active message group before all tasks/active messages create by the group have been spawned", - ) - .print(); - } + // println!( + // "in task group 
wait_all mype: {:?} cnt: {:?} {:?} {:?}", + // self.team.world_pe, + // self.counters.send_req_cnt.load(Ordering::SeqCst), + // self.counters.outstanding_reqs.load(Ordering::SeqCst), + // self.counters.launched_req_cnt.load(Ordering::SeqCst) + // ); let mut temp_now = Instant::now(); while self.counters.outstanding_reqs.load(Ordering::SeqCst) > 0 { // self.team.flush(); @@ -835,6 +855,23 @@ impl LamellarTaskGroup { temp_now = Instant::now(); } } + if self.counters.send_req_cnt.load(Ordering::SeqCst) + != self.counters.launched_req_cnt.load(Ordering::SeqCst) + || self.counters.send_req_cnt.load(Ordering::SeqCst) + != self.counters.launched_req_cnt.load(Ordering::SeqCst) + { + println!( + "in task group wait_all mype: {:?} cnt: {:?} {:?} {:?}", + self.team.world_pe, + self.counters.send_req_cnt.load(Ordering::SeqCst), + self.counters.outstanding_reqs.load(Ordering::SeqCst), + self.counters.launched_req_cnt.load(Ordering::SeqCst) + ); + RuntimeWarning::UnspawnedTask( + "`wait_all` on an active message group before all tasks/active messages create by the group have been spawned", + ) + .print(); + } } async fn await_all(&self) { @@ -843,7 +880,7 @@ impl LamellarTaskGroup { || self.counters.send_req_cnt.load(Ordering::SeqCst) != self.counters.launched_req_cnt.load(Ordering::SeqCst) { - RuntimeWarning::UnspanedTask( + RuntimeWarning::UnspawnedTask( "`await_all` on an active message group before all tasks/active messages created by the group have been spawned", ) .print(); diff --git a/src/lamellar_team.rs b/src/lamellar_team.rs index 12d80c46..ac75aba6 100644 --- a/src/lamellar_team.rs +++ b/src/lamellar_team.rs @@ -532,7 +532,13 @@ impl ActiveMessaging for Arc { F::Output: Send, { assert!(self.panic.load(Ordering::SeqCst) == 0); - self.team.scheduler.spawn_task(task) + self.team.scheduler.spawn_task( + task, + vec![ + self.team.world_counters.clone(), + self.team.team_counters.clone(), + ], + ) } fn block_on(&self, f: F) -> F::Output { @@ -550,10 +556,17 @@ impl ActiveMessaging for Arc { <::Item as Future>::Output: Send, { assert!(self.panic.load(Ordering::SeqCst) == 0); - self.team.scheduler.block_on(join_all( - iter.into_iter() - .map(|task| self.team.scheduler.spawn_task(task)), - )) + self.team + .scheduler + .block_on(join_all(iter.into_iter().map(|task| { + self.team.scheduler.spawn_task( + task, + vec![ + self.team.world_counters.clone(), + self.team.team_counters.clone(), + ], + ) + }))) } } @@ -1366,7 +1379,10 @@ impl LamellarTeamRT { F::Output: Send, { assert!(self.panic.load(Ordering::SeqCst) == 0); - self.scheduler.spawn_task(task) + self.scheduler.spawn_task( + task, + vec![self.world_counters.clone(), self.team_counters.clone()], + ) } //#[tracing::instrument(skip_all)] @@ -1374,24 +1390,11 @@ impl LamellarTeamRT { // println!("wait_all called on pe: {}", self.world_pe); RuntimeWarning::BlockingCall("wait_all", "await_all().await").print(); - if self.team_counters.send_req_cnt.load(Ordering::SeqCst) - != self.team_counters.launched_req_cnt.load(Ordering::SeqCst) - || self.world_counters.send_req_cnt.load(Ordering::SeqCst) - != self.world_counters.launched_req_cnt.load(Ordering::SeqCst) - { - RuntimeWarning::UnspanedTask( - "`wait_all` before all tasks/active messages have been spawned", - ) - .print(); - println!( - "in team wait_all mype: {:?} cnt: {:?} {:?} {:?}", - self.world_pe, - self.team_counters.send_req_cnt.load(Ordering::SeqCst), - self.team_counters.outstanding_reqs.load(Ordering::SeqCst), - self.team_counters.launched_req_cnt.load(Ordering::SeqCst) - ); - } + let mut 
temp_now = Instant::now(); + let mut orig_reqs = self.team_counters.send_req_cnt.load(Ordering::SeqCst); + let mut orig_launched = self.team_counters.launched_req_cnt.load(Ordering::SeqCst); + // println!( // "in team wait_all mype: {:?} cnt: {:?} {:?}", @@ -1400,10 +1403,14 @@ impl LamellarTeamRT { // self.team_counters.outstanding_reqs.load(Ordering::SeqCst), // ); while self.panic.load(Ordering::SeqCst) == 0 - && (self.team_counters.outstanding_reqs.load(Ordering::SeqCst) > 0 + && ((self.team_counters.outstanding_reqs.load(Ordering::SeqCst) > 0 + || orig_reqs != self.team_counters.send_req_cnt.load(Ordering::SeqCst) + || orig_launched != self.team_counters.launched_req_cnt.load(Ordering::SeqCst)) || (self.parent.is_none() && self.world_counters.outstanding_reqs.load(Ordering::SeqCst) > 0)) { + orig_reqs = self.team_counters.send_req_cnt.load(Ordering::SeqCst); + orig_launched = self.team_counters.launched_req_cnt.load(Ordering::SeqCst); // std::thread::yield_now(); // self.flush(); if std::thread::current().id() != *crate::MAIN_THREAD { @@ -1418,6 +1425,24 @@ impl LamellarTeamRT { ); temp_now = Instant::now(); } + + } + if self.team_counters.send_req_cnt.load(Ordering::SeqCst) + != self.team_counters.launched_req_cnt.load(Ordering::SeqCst) + || (self.parent.is_none() && self.world_counters.send_req_cnt.load(Ordering::SeqCst) + != self.world_counters.launched_req_cnt.load(Ordering::SeqCst)) + { + println!( + "in team wait_all mype: {:?} cnt: {:?} {:?} {:?}", + self.world_pe, + self.team_counters.send_req_cnt.load(Ordering::SeqCst), + self.team_counters.outstanding_reqs.load(Ordering::SeqCst), + self.team_counters.launched_req_cnt.load(Ordering::SeqCst) + ); + RuntimeWarning::UnspawnedTask( + "`wait_all` before all tasks/active messages have been spawned", + ) + .print(); } // println!( // "in team wait_all mype: {:?} cnt: {:?} {:?}", @@ -1427,16 +1452,7 @@ impl LamellarTeamRT { // ); } pub(crate) async fn await_all(&self) { - if self.team_counters.send_req_cnt.load(Ordering::SeqCst) - != self.team_counters.launched_req_cnt.load(Ordering::SeqCst) - || self.world_counters.send_req_cnt.load(Ordering::SeqCst) - != self.world_counters.launched_req_cnt.load(Ordering::SeqCst) - { - RuntimeWarning::UnspanedTask( - "`await_all` before all tasks/active messages have been spawned", - ) - .print(); - } + let mut temp_now = Instant::now(); while self.panic.load(Ordering::SeqCst) == 0 && (self.team_counters.outstanding_reqs.load(Ordering::SeqCst) > 0 @@ -1456,6 +1472,16 @@ impl LamellarTeamRT { temp_now = Instant::now(); } } + if self.team_counters.send_req_cnt.load(Ordering::SeqCst) + != self.team_counters.launched_req_cnt.load(Ordering::SeqCst) + || (self.parent.is_none() && self.world_counters.send_req_cnt.load(Ordering::SeqCst) + != self.world_counters.launched_req_cnt.load(Ordering::SeqCst)) + { + RuntimeWarning::UnspawnedTask( + "`await_all` before all tasks/active messages have been spawned", + ) + .print(); + } } pub(crate) fn block_on(&self, f: F) -> F::Output @@ -1478,9 +1504,13 @@ impl LamellarTeamRT { <::Item as Future>::Output: Send, { assert!(self.panic.load(Ordering::SeqCst) == 0); - self.scheduler.block_on(join_all( - iter.into_iter().map(|task| self.scheduler.spawn_task(task)), - )) + self.scheduler + .block_on(join_all(iter.into_iter().map(|task| { + self.scheduler.spawn_task( + task, + vec![self.world_counters.clone(), self.team_counters.clone()], + ) + }))) } //#[tracing::instrument(skip_all)] diff --git a/src/lamellar_world.rs b/src/lamellar_world.rs index 19793512..6e27f809 
100644 --- a/src/lamellar_world.rs +++ b/src/lamellar_world.rs @@ -91,7 +91,13 @@ impl ActiveMessaging for LamellarWorld { F: Future + Send + 'static, F::Output: Send, { - self.team_rt.scheduler.spawn_task(f) + self.team_rt.scheduler.spawn_task( + f, + vec![ + self.team_rt.world_counters.clone(), + self.team_rt.team_counters.clone(), + ], + ) } fn block_on(&self, f: F) -> F::Output @@ -110,10 +116,17 @@ impl ActiveMessaging for LamellarWorld { <::Item as Future>::Output: Send, { // trace_span!("block_on_all").in_scope(|| - self.team_rt.scheduler.block_on(join_all( - iter.into_iter() - .map(|task| self.team_rt.scheduler.spawn_task(task)), - )) + self.team_rt + .scheduler + .block_on(join_all(iter.into_iter().map(|task| { + self.team_rt.scheduler.spawn_task( + task, + vec![ + self.team_rt.world_counters.clone(), + self.team_rt.team_counters.clone(), + ], + ) + }))) // ) } } @@ -322,6 +335,7 @@ impl Drop for LamellarWorld { // self._counters.send_req_cnt.load(Ordering::SeqCst), // self._counters.outstanding_reqs.load(Ordering::SeqCst), // ); + self.barrier(); self.wait_all(); // println!( // "in team destroy mype: {:?} cnt: {:?} {:?}", diff --git a/src/scheduler.rs b/src/scheduler.rs index 3d7bb1c5..c16ee1df 100755 --- a/src/scheduler.rs +++ b/src/scheduler.rs @@ -255,7 +255,13 @@ impl Scheduler { let am_future = async move { // let start_tid = thread::current().id(); - // println!("[{:?}] submit work exec req {:?} {:?} TaskId: {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task); + // println!( + // "[{:?}] submit work exec req {:?} {:?} TaskId: {:?}", + // std::thread::current().id(), + // num_ams.load(Ordering::Relaxed), + // max_ams.load(Ordering::Relaxed), + // _am_id + // ); // println!("[{:?}] submit_am {:?}", std::thread::current().id(), am_id); ame.process_msg(am, am_stall_mark, false).await; num_ams.fetch_sub(1, Ordering::Relaxed); @@ -269,7 +275,14 @@ impl Scheduler { // std::thread::current().id(), // am_id // ); - // println!("[{:?}] submit work done req {:?} {:?} TaskId: {:?} {:?}", std::thread::current().id(),num_tasks.load(Ordering::Relaxed),max_tasks.load(Ordering::Relaxed),cur_task,reqs); + // println!( + // "[{:?}] submit work done req {:?} {:?} TaskId: {:?} ", + // std::thread::current().id(), + // num_ams.load(Ordering::Relaxed), + // max_ams.load(Ordering::Relaxed), + // _am_id, + // // reqs + // ); }; self.executor.submit_task(am_future); } @@ -379,7 +392,11 @@ impl Scheduler { self.executor.submit_task(am_future); } - pub(crate) fn spawn_task(&self, task: F) -> LamellarTask + pub(crate) fn spawn_task( + &self, + task: F, + outstanding_reqs: Vec>, + ) -> LamellarTask where F: Future + Send + 'static, F::Output: Send, @@ -387,10 +404,16 @@ impl Scheduler { let num_tasks = self.num_tasks.clone(); let max_tasks = self.max_tasks.clone(); num_tasks.fetch_add(1, Ordering::Relaxed); + for cntr in outstanding_reqs.iter() { + cntr.inc_outstanding(1); + } let _task_id = max_tasks.fetch_add(1, Ordering::Relaxed); let future = async move { let result = task.await; num_tasks.fetch_sub(1, Ordering::Relaxed); + for cntr in outstanding_reqs.iter() { + cntr.dec_outstanding(1); + } result }; self.executor.spawn_task(future, self.executor.clone()) diff --git a/src/scheduler/work_stealing.rs b/src/scheduler/work_stealing.rs index 264d359b..0c57fbb2 100644 --- a/src/scheduler/work_stealing.rs +++ b/src/scheduler/work_stealing.rs @@ -142,7 +142,13 @@ impl LamellarExecutor for WorkStealing { let schedule = move |runnable| 
work_inj.push(runnable); let (runnable, task) = Builder::new() .metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)) - .spawn(move |_task_id| async move { task.await }, schedule); + .spawn( + move |_task_id| async move { + let res = task.await; + res + }, + schedule, + ); runnable.schedule(); LamellarTask { @@ -210,7 +216,13 @@ impl LamellarExecutor for WorkStealing { let (runnable, mut task) = unsafe { Builder::new() .metadata(TASK_ID.fetch_add(1, Ordering::Relaxed)) - .spawn_unchecked(move |_task_id| async move { fut.await }, schedule) + .spawn_unchecked( + move |_task_id| async move { + let res = fut.await; + res + }, + schedule, + ) }; let waker = runnable.waker(); runnable.run(); //try to run immediately diff --git a/src/warnings.rs b/src/warnings.rs index accb47a8..76627492 100644 --- a/src/warnings.rs +++ b/src/warnings.rs @@ -1,7 +1,11 @@ +use std::sync::atomic::{AtomicBool, Ordering}; + use crate::config; +static ENABLED: AtomicBool = AtomicBool::new(true); + pub(crate) enum RuntimeWarning<'a> { - UnspanedTask(&'a str), + UnspawnedTask(&'a str), DroppedHandle(&'a str), BlockingCall(&'a str, &'a str), BlockOn, @@ -10,30 +14,40 @@ pub(crate) enum RuntimeWarning<'a> { } impl<'a> RuntimeWarning<'a> { + pub(crate) fn enable_warnings() { + ENABLED.store(true, Ordering::Relaxed); + } + pub(crate) fn disable_warnings() { + ENABLED.store(false, Ordering::Relaxed); + } fn print_warning(&self) -> bool { - match self { - RuntimeWarning::UnspanedTask(_) => match config().unpspawned_task_warning { - Some(true) => true, - Some(false) => false, - None => true, - }, - RuntimeWarning::DroppedHandle(_) => match config().dropped_unused_handle_warning { - Some(true) => true, - Some(false) => false, - None => true, - }, - RuntimeWarning::BlockingCall(_, _) | RuntimeWarning::BlockOn => { - if std::thread::current().id() != *crate::MAIN_THREAD { - match config().blocking_call_warning { - Some(true) => true, - Some(false) => false, - None => true, + if ENABLED.load(Ordering::Relaxed) { + match self { + RuntimeWarning::UnspawnedTask(_) => match config().unpspawned_task_warning { + Some(true) => true, + Some(false) => false, + None => true, + }, + RuntimeWarning::DroppedHandle(_) => match config().dropped_unused_handle_warning { + Some(true) => true, + Some(false) => false, + None => true, + }, + RuntimeWarning::BlockingCall(_, _) | RuntimeWarning::BlockOn => { + if std::thread::current().id() != *crate::MAIN_THREAD { + match config().blocking_call_warning { + Some(true) => true, + Some(false) => false, + None => true, + } + } else { + false } - } else { - false } + RuntimeWarning::BarrierTimeout(elapsed) => elapsed > &config().deadlock_timeout, } - RuntimeWarning::BarrierTimeout(elapsed) => elapsed > &config().deadlock_timeout, + } else { + false } } @@ -54,7 +68,7 @@ impl<'a> RuntimeWarning<'a> { #[cfg(not(feature = "disable-runtime-warnings"))] if self.print_warning() { let msg = match self { - RuntimeWarning::UnspanedTask(msg) => { + RuntimeWarning::UnspawnedTask(msg) => { format!("[LAMELLAR WARNING] you have called {msg}. This typically means you forgot to call spawn() on the handle returned from calls such as exec_am_* or various array operations. If this is your intended behavior, set LAMELLAR_UNSPAWNED_TASK_WARNING=0 to disable this warning.") From ccc4e7a42e00016268a8aeda6f0f876dacacfda1 Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Mon, 4 Nov 2024 14:01:02 -0800 Subject: [PATCH 102/116] update LocalLock and GlobalLock array to be consistent with how spawning iterators work --- .../array_examples/distributed_iteration.rs | 33 ++- examples/array_examples/global_lock_array.rs | 4 + examples/hello_world/hello_world_array.rs | 2 +- examples/kernels/cached_am_gemm.rs | 2 +- examples/misc/ping_pong.rs | 18 +- examples/misc/simple_ptp.rs | 2 +- src/active_messaging/handle.rs | 9 + src/array.rs | 7 + src/array/atomic/iteration.rs | 22 +- src/array/generic_atomic/iteration.rs | 20 +- src/array/generic_atomic/rdma.rs | 6 +- src/array/global_lock_atomic.rs | 30 +- src/array/global_lock_atomic/iteration.rs | 178 +++++++++--- src/array/global_lock_atomic/rdma.rs | 8 +- src/array/handle.rs | 24 +- src/array/iterator/consumer.rs | 4 +- src/array/iterator/distributed_iterator.rs | 27 +- .../iterator/distributed_iterator/chunks.rs | 14 +- .../distributed_iterator/consumer/collect.rs | 42 ++- .../distributed_iterator/consumer/count.rs | 42 ++- .../distributed_iterator/consumer/for_each.rs | 56 +++- .../distributed_iterator/consumer/reduce.rs | 40 ++- .../distributed_iterator/consumer/sum.rs | 115 +++----- .../distributed_iterator/enumerate.rs | 13 +- .../iterator/distributed_iterator/filter.rs | 13 +- .../distributed_iterator/filter_map.rs | 13 +- .../iterator/distributed_iterator/map.rs | 13 +- .../distributed_iterator/monotonic.rs | 13 +- .../iterator/distributed_iterator/skip.rs | 13 +- .../iterator/distributed_iterator/step_by.rs | 13 +- .../iterator/distributed_iterator/take.rs | 13 +- .../iterator/distributed_iterator/zip.rs | 11 +- src/array/iterator/local_iterator.rs | 27 +- src/array/iterator/local_iterator/chunks.rs | 23 +- src/array/iterator/local_iterator/consumer.rs | 265 ------------------ .../local_iterator/consumer/collect.rs | 40 ++- .../iterator/local_iterator/consumer/count.rs | 33 ++- .../local_iterator/consumer/for_each.rs | 33 ++- .../local_iterator/consumer/reduce.rs | 31 +- .../iterator/local_iterator/consumer/sum.rs | 95 ++----- .../iterator/local_iterator/enumerate.rs | 13 +- src/array/iterator/local_iterator/filter.rs | 13 +- .../iterator/local_iterator/filter_map.rs | 13 +- src/array/iterator/local_iterator/map.rs | 17 +- .../iterator/local_iterator/monotonic.rs | 13 +- src/array/iterator/local_iterator/skip.rs | 13 +- src/array/iterator/local_iterator/step_by.rs | 13 +- src/array/iterator/local_iterator/take.rs | 13 +- src/array/iterator/local_iterator/zip.rs | 22 +- src/array/iterator/mod.rs | 12 +- src/array/iterator/one_sided_iterator.rs | 9 +- .../iterator/one_sided_iterator/chunks.rs | 3 +- src/array/local_lock_atomic.rs | 6 +- src/array/local_lock_atomic/iteration.rs | 136 +++++++-- src/array/local_lock_atomic/local_chunks.rs | 20 +- src/array/local_lock_atomic/rdma.rs | 6 +- src/array/native_atomic/iteration.rs | 22 +- src/array/native_atomic/rdma.rs | 6 +- src/array/read_only/local_chunks.rs | 11 +- src/array/unsafe.rs | 6 +- src/array/unsafe/iteration/distributed.rs | 137 ++++----- src/array/unsafe/iteration/local.rs | 64 +++-- src/array/unsafe/local_chunks.rs | 20 +- src/array/unsafe/rdma.rs | 10 +- src/barrier.rs | 3 + src/darc.rs | 173 ++++++++---- src/darc/global_rw_darc.rs | 115 +++++--- src/darc/handle.rs | 3 + src/darc/local_rw_darc.rs | 2 +- src/lamellar_request.rs | 1 + src/lamellar_task_group.rs | 9 + src/lamellar_team.rs | 104 +++++-- tests/array/arithmetic_ops/fetch_add_test.rs | 36 +-- 73 files changed, 1390 insertions(+), 1021 deletions(-) diff --git 
a/examples/array_examples/distributed_iteration.rs b/examples/array_examples/distributed_iteration.rs index c14691e3..1209bc16 100644 --- a/examples/array_examples/distributed_iteration.rs +++ b/examples/array_examples/distributed_iteration.rs @@ -5,7 +5,7 @@ fn main() { let world = lamellar::LamellarWorldBuilder::new().build(); let my_pe = world.my_pe(); let _num_pes = world.num_pes(); - let block_array = AtomicArray::::new(world.team(), ARRAY_LEN, Distribution::Block); + let block_array = LocalLockArray::::new(world.team(), ARRAY_LEN, Distribution::Block); let cyclic_array = AtomicArray::::new(world.team(), ARRAY_LEN, Distribution::Cyclic); // We expose multiple ways to iterate over a lamellar array @@ -14,6 +14,7 @@ fn main() { // local to that pe, thus instantiating a distributed iterator introduces a synchronoization point. // distributed iterators are created by calling dist_iter() or dist_iter_mut() on a LamellarArray; + let lock = block_array.write_lock().block(); let block_dist_iter = block_array.dist_iter_mut(); let cyclic_dist_iter = cyclic_array.dist_iter_mut(); @@ -21,10 +22,13 @@ fn main() { // we currently provide the "for_each" driver which will execute a closure on every element in the distributed array (concurrently) //for example lets initialize our arrays, where we store the value of my_pe to each local element a pe owns - block_dist_iter + let iter = block_dist_iter .enumerate() - .for_each(move |(i, elem)| elem.store(i)) - .block(); + .for_each(move |(i, elem)| *elem = i) + .spawn(); + std::thread::sleep(std::time::Duration::from_secs(1)); + drop(lock); + iter.block(); cyclic_dist_iter .for_each(move |elem| elem.store(my_pe)) .block(); @@ -35,7 +39,7 @@ fn main() { println!("--------------------------------------------------------"); println!("block sum"); - let sum = block_array.block_on(block_array.dist_iter().map(|e| e.load()).sum()); + let sum = block_array.block_on(block_array.dist_iter().map(|e| *e).sum()); println!("result: {sum}"); world.barrier(); println!("--------------------------------------------------------"); @@ -140,7 +144,7 @@ fn main() { block_array .dist_iter() .enumerate() - .filter(|(_, elem)| elem.load() % 4 == 0) + .filter(|(_, elem)| *elem % 4 == 0) .for_each(move |(i, elem)| { println!( "[pe({:?})-{:?}] i: {:?} {:?}", @@ -158,8 +162,8 @@ fn main() { .dist_iter() .enumerate() .filter_map(|(i, elem)| { - if elem.load() % 4 == 0 { - Some((i, elem.load() as f32)) + if *elem % 4 == 0 { + Some((i, *elem as f32)) } else { None } @@ -180,15 +184,15 @@ fn main() { block_array .dist_iter() .filter_map(|elem| { - let e = elem.load(); + let e = *elem; if e % 8 == 0 { println!("e: {:?}", e); - Some(e as f32) + Some(e as u8) } else { None } }) - .collect::>(Distribution::Block), + .collect::>(Distribution::Block), ); new_block_array.print(); @@ -267,11 +271,6 @@ fn main() { println!("--------------------------------------------------------"); println!("block filter count"); - let count = block_array.block_on( - block_array - .dist_iter() - .filter(|e| e.load() % 2 == 0) - .count(), - ); + let count = block_array.block_on(block_array.dist_iter().filter(|e| *e % 2 == 0).count()); println!("result: {count}"); } diff --git a/examples/array_examples/global_lock_array.rs b/examples/array_examples/global_lock_array.rs index 82cbe5b6..6b86f4e4 100644 --- a/examples/array_examples/global_lock_array.rs +++ b/examples/array_examples/global_lock_array.rs @@ -61,6 +61,7 @@ fn main() { }) .block(); world.barrier(); + println!("6. 
PE{my_pe} time: {:?} done", s.elapsed().as_secs_f64()); let task = array .dist_iter_mut() @@ -68,6 +69,9 @@ fn main() { .for_each(|(i, elem)| *elem += i); world.block_on(task); world.barrier(); + println!("7. PE{my_pe} time: {:?} done", s.elapsed().as_secs_f64()); array.print(); + + println!("8. PE{my_pe} time: {:?} done", s.elapsed().as_secs_f64()); } diff --git a/examples/hello_world/hello_world_array.rs b/examples/hello_world/hello_world_array.rs index f483fbd8..1d0620b6 100644 --- a/examples/hello_world/hello_world_array.rs +++ b/examples/hello_world/hello_world_array.rs @@ -9,7 +9,7 @@ fn main() { let world = lamellar::LamellarWorldBuilder::new().build(); let num_pes = world.num_pes(); let _my_pe = world.my_pe(); - let local_length = 1_000_000_000; //if you want to ensure each thread processes data make this >= LAMELLAR_THREADS environment variable + let local_length = 1_000_000; //if you want to ensure each thread processes data make this >= LAMELLAR_THREADS environment variable let global_length = num_pes * local_length; let init_time = timer.elapsed(); println!("init_time: {:?}", init_time); diff --git a/examples/kernels/cached_am_gemm.rs b/examples/kernels/cached_am_gemm.rs index 4e5c7567..ce415a97 100644 --- a/examples/kernels/cached_am_gemm.rs +++ b/examples/kernels/cached_am_gemm.rs @@ -251,7 +251,7 @@ fn main() { c: c_block.clone(), a_pe_rows: a_pe_rows, block_size: block_size, - })); + }).spawn()); tasks += 1; } // for req in reqs { diff --git a/examples/misc/ping_pong.rs b/examples/misc/ping_pong.rs index 5a991406..74ae33f8 100644 --- a/examples/misc/ping_pong.rs +++ b/examples/misc/ping_pong.rs @@ -409,13 +409,17 @@ fn main() { // if my_pe == 0 { for _thread in 0..1 { //world.num_threads_per_pe() { - reqs.push(world.exec_am_local(MyAm { - indices: indices.clone(), - buffers: buffers.clone(), - buffer_size, - table_size_per_pe: table_size_per_pe, - comm_lock: comm_lock.clone(), - })); + reqs.push( + world + .exec_am_local(MyAm { + indices: indices.clone(), + buffers: buffers.clone(), + buffer_size, + table_size_per_pe: table_size_per_pe, + comm_lock: comm_lock.clone(), + }) + .spawn(), + ); } world.block_on_all(reqs); // } diff --git a/examples/misc/simple_ptp.rs b/examples/misc/simple_ptp.rs index 2689b328..08f2439b 100644 --- a/examples/misc/simple_ptp.rs +++ b/examples/misc/simple_ptp.rs @@ -62,7 +62,7 @@ fn main() { let mut reqs = Vec::new(); let num_tasks = 100; for _i in 0..num_tasks { - reqs.push(world.exec_am_pe(0, SyncAM {})); + reqs.push(world.exec_am_pe(0, SyncAM {}).spawn()); } world.wait_all(); world.barrier(); diff --git a/src/active_messaging/handle.rs b/src/active_messaging/handle.rs index b26cd372..62c3c1c9 100644 --- a/src/active_messaging/handle.rs +++ b/src/active_messaging/handle.rs @@ -162,6 +162,9 @@ impl AmHandle { } impl LamellarRequest for AmHandle { + fn launch(&mut self) { + self.launch_am_if_needed(); + } fn blocking_wait(mut self) -> T { self.launch_am_if_needed(); while !self.inner.ready.load(Ordering::SeqCst) { @@ -301,6 +304,9 @@ impl From> for AmHandle { } impl LamellarRequest for LocalAmHandle { + fn launch(&mut self) { + self.launch_am_if_needed(); + } fn blocking_wait(mut self) -> T { self.launch_am_if_needed(); while !self.inner.ready.load(Ordering::SeqCst) { @@ -485,6 +491,9 @@ impl MultiAmHandle { } impl LamellarRequest for MultiAmHandle { + fn launch(&mut self) { + self.launch_am_if_needed(); + } fn blocking_wait(mut self) -> Self::Output { self.launch_am_if_needed(); while self.inner.cnt.load(Ordering::SeqCst) > 0 { diff --git 
a/src/array.rs b/src/array.rs index e58dcefc..e848379d 100644 --- a/src/array.rs +++ b/src/array.rs @@ -1008,6 +1008,13 @@ pub(crate) mod private { self.team() .exec_am_pe_tg(pe, am, Some(self.team_counters())) } + fn spawn_am_pe_tg(&self, pe: usize, am: F) -> AmHandle + where + F: RemoteActiveMessage + LamellarAM + AmDist, + { + self.team() + .spawn_am_pe_tg(pe, am, Some(self.team_counters())) + } // fn exec_arc_am_pe(&self, pe: usize, am: LamellarArcAm) -> AmHandle // where // F: AmDist, diff --git a/src/array/atomic/iteration.rs b/src/array/atomic/iteration.rs index 3dd68d87..73c2a231 100644 --- a/src/array/atomic/iteration.rs +++ b/src/array/atomic/iteration.rs @@ -4,13 +4,15 @@ use crate::array::iterator::distributed_iterator::*; use crate::array::iterator::local_iterator::*; use crate::array::iterator::one_sided_iterator::OneSidedIter; use crate::array::iterator::{ - private::{IterClone, Sealed}, + private::{InnerIter, Sealed}, LamellarArrayIterators, LamellarArrayMutIterators, }; use crate::array::r#unsafe::private::UnsafeArrayInner; use crate::array::*; use crate::memregion::Dist; +use self::iterator::IterLockFuture; + impl InnerArray for AtomicArray { fn as_inner(&self) -> &UnsafeArrayInner { match &self { @@ -28,8 +30,11 @@ pub struct AtomicDistIter { end_i: usize, } -impl IterClone for AtomicDistIter { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for AtomicDistIter { + fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } + fn iter_clone(&self, _s: Sealed) -> Self { AtomicDistIter { data: self.data.clone(), cur_i: self.cur_i, @@ -69,8 +74,11 @@ pub struct AtomicLocalIter { end_i: usize, } -impl IterClone for AtomicLocalIter { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for AtomicLocalIter { + fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } + fn iter_clone(&self, _s: Sealed) -> Self { AtomicLocalIter { data: self.data.clone(), cur_i: self.cur_i, @@ -105,7 +113,7 @@ impl AtomicLocalIter { impl DistributedIterator for AtomicDistIter { type Item = AtomicElement; type Array = AtomicArray; - fn init(&self, start_i: usize, cnt: usize) -> Self { + fn init(&self, start_i: usize, cnt: usize, _s: Sealed) -> Self { let max_i = self.data.num_elems_local(); // println!("init dist iter start_i: {:?} cnt {:?} end_i: {:?} max_i: {:?}",start_i,cnt, start_i+cnt,max_i); // println!("num_elems_local: {:?}",self.data.num_elems_local()); @@ -144,7 +152,7 @@ impl IndexedDistributedIterator for AtomicDistIter { impl LocalIterator for AtomicLocalIter { type Item = AtomicElement; type Array = AtomicArray; - fn init(&self, start_i: usize, cnt: usize) -> Self { + fn init(&self, start_i: usize, cnt: usize, _s: Sealed) -> Self { let max_i = self.data.num_elems_local(); // println!("init atomic start_i: {:?} cnt {:?} end_i: {:?} max_i: {:?} {:?}",start_i,cnt, start_i+cnt,max_i,std::thread::current().id()); diff --git a/src/array/generic_atomic/iteration.rs b/src/array/generic_atomic/iteration.rs index e6fbab98..4b777940 100644 --- a/src/array/generic_atomic/iteration.rs +++ b/src/array/generic_atomic/iteration.rs @@ -7,6 +7,8 @@ use crate::array::iterator::{private::*, LamellarArrayIterators, LamellarArrayMu use crate::array::r#unsafe::private::UnsafeArrayInner; use crate::array::*; use crate::memregion::Dist; + +use self::iterator::IterLockFuture; // use parking_lot::{ // lock_api::{RwLockReadGuardArc, RwLockWriteGuardArc}, // RawRwLock, @@ -26,8 +28,11 @@ pub struct GenericAtomicDistIter { end_i: usize, } -impl IterClone for GenericAtomicDistIter { - fn 
iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for GenericAtomicDistIter { + fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } + fn iter_clone(&self, _s: Sealed) -> Self { GenericAtomicDistIter { data: self.data.clone(), cur_i: self.cur_i, @@ -56,8 +61,11 @@ pub struct GenericAtomicLocalIter { end_i: usize, } -impl IterClone for GenericAtomicLocalIter { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for GenericAtomicLocalIter { + fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } + fn iter_clone(&self, _s: Sealed) -> Self { GenericAtomicLocalIter { data: self.data.clone(), cur_i: self.cur_i, @@ -81,7 +89,7 @@ impl std::fmt::Debug for GenericAtomicLocalIter { impl DistributedIterator for GenericAtomicDistIter { type Item = GenericAtomicElement; type Array = GenericAtomicArray; - fn init(&self, start_i: usize, cnt: usize) -> Self { + fn init(&self, start_i: usize, cnt: usize, _s: Sealed) -> Self { let max_i = self.data.num_elems_local(); // println!("init dist iter start_i: {:?} cnt {:?} end_i: {:?} max_i: {:?}",start_i,cnt, start_i+cnt,max_i); GenericAtomicDistIter { @@ -129,7 +137,7 @@ impl IndexedDistributedIterator for GenericAtomicDistIter { impl LocalIterator for GenericAtomicLocalIter { type Item = GenericAtomicElement; type Array = GenericAtomicArray; - fn init(&self, start_i: usize, cnt: usize) -> Self { + fn init(&self, start_i: usize, cnt: usize, _s: Sealed) -> Self { let max_i = self.data.num_elems_local(); // println!("init generic_atomic start_i: {:?} cnt {:?} end_i: {:?} max_i: {:?} {:?}",start_i,cnt, start_i+cnt,max_i,std::thread::current().id()); GenericAtomicLocalIter { diff --git a/src/array/generic_atomic/rdma.rs b/src/array/generic_atomic/rdma.rs index 20e89cbb..1243e754 100644 --- a/src/array/generic_atomic/rdma.rs +++ b/src/array/generic_atomic/rdma.rs @@ -122,7 +122,7 @@ impl LamellarAm for InitGetAm { start_index: self.index, len: self.buf.len(), }; - reqs.push(self.array.exec_am_pe_tg(pe, remote_am)); + reqs.push(self.array.spawn_am_pe_tg(pe, remote_am)); } unsafe { match self.array.array.inner.distribution { @@ -253,7 +253,7 @@ impl LamellarAm for InitPutAm { [cur_index..(cur_index + u8_buf_len)] .to_vec(), }; - reqs.push(self.array.exec_am_pe_tg(pe, remote_am)); + reqs.push(self.array.spawn_am_pe_tg(pe, remote_am)); cur_index += u8_buf_len; } else { panic!("this should not be possible"); @@ -306,7 +306,7 @@ impl LamellarAm for InitPutAm { len: self.buf.len(), data: vec, }; - reqs.push(self.array.exec_am_pe_tg(pe, remote_am)); + reqs.push(self.array.spawn_am_pe_tg(pe, remote_am)); } } } diff --git a/src/array/global_lock_atomic.rs b/src/array/global_lock_atomic.rs index 51beb48e..d2822d9e 100644 --- a/src/array/global_lock_atomic.rs +++ b/src/array/global_lock_atomic.rs @@ -24,7 +24,7 @@ use crate::warnings::RuntimeWarning; use pin_project::pin_project; use std::ops::{Deref, DerefMut}; -use std::task::{Context, Poll, Waker}; +use std::task::{Context, Poll}; /// A safe abstraction of a distributed array, providing read/write access protected by locks. /// @@ -1025,7 +1025,8 @@ impl GlobalLockArrayReduceHandle { /// /// This function returns a handle that can be used to wait for the operation to complete #[must_use = "this function returns a future used to poll for completion and retrieve the result. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. 
Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] - pub fn spawn(self) -> LamellarTask> { + pub fn spawn(mut self) -> LamellarTask> { + self.req.launch(); self.lock_guard.array.clone().spawn(self) } @@ -1040,17 +1041,20 @@ impl GlobalLockArrayReduceHandle { } } -impl LamellarRequest for GlobalLockArrayReduceHandle { - fn blocking_wait(self) -> Self::Output { - self.req.blocking_wait() - } - fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { - self.req.ready_or_set_waker(waker) - } - fn val(&self) -> Self::Output { - self.req.val() - } -} +// impl LamellarRequest for GlobalLockArrayReduceHandle { +// fn launch(&mut self) { +// self.req.launch(); +// } +// fn blocking_wait(self) -> Self::Output { +// self.req.blocking_wait() +// } +// fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { +// self.req.ready_or_set_waker(waker) +// } +// fn val(&self) -> Self::Output { +// self.req.val() +// } +// } impl Future for GlobalLockArrayReduceHandle { type Output = Option; diff --git a/src/array/global_lock_atomic/iteration.rs b/src/array/global_lock_atomic/iteration.rs index a10d2376..b91669b9 100644 --- a/src/array/global_lock_atomic/iteration.rs +++ b/src/array/global_lock_atomic/iteration.rs @@ -1,10 +1,12 @@ +use parking_lot::Mutex; + use crate::array::global_lock_atomic::*; use crate::array::iterator::distributed_iterator::*; use crate::array::iterator::local_iterator::*; use crate::array::iterator::one_sided_iterator::OneSidedIter; use crate::array::iterator::{ - private::{IterClone, Sealed}, + private::{InnerIter, Sealed}, LamellarArrayIterators, LamellarArrayMutIterators, }; use crate::array::private::LamellarArrayPrivate; @@ -12,6 +14,8 @@ use crate::array::r#unsafe::private::UnsafeArrayInner; use crate::array::*; use crate::memregion::Dist; +use self::iterator::IterLockFuture; + impl InnerArray for GlobalLockArray { fn as_inner(&self) -> &UnsafeArrayInner { &self.array.inner @@ -21,16 +25,30 @@ impl InnerArray for GlobalLockArray { //#[doc(hidden)] #[derive(Clone)] pub struct GlobalLockDistIter { - array_guard: GlobalLockReadGuard, + data: GlobalLockArray, + lock: Arc>>>, cur_i: usize, end_i: usize, _marker: PhantomData<&'static T>, } -impl IterClone for GlobalLockDistIter { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for GlobalLockDistIter { + fn lock_if_needed(&self, _s: Sealed) -> Option { + if self.lock.lock().is_none() { + let lock_handle = self.data.lock.read(); + let lock = self.lock.clone(); + + Some(Box::pin(async move { + *lock.lock() = Some(lock_handle.await); + })) + } else { + None + } + } + fn iter_clone(&self, _s: Sealed) -> Self { GlobalLockDistIter { - array_guard: self.array_guard.clone(), + data: self.data.clone(), + lock: self.lock.clone(), cur_i: self.cur_i, end_i: self.end_i, _marker: PhantomData, @@ -43,7 +61,7 @@ impl std::fmt::Debug for GlobalLockDistIter { write!( f, "GlobalLockDistIter{{ data.len: {:?}, cur_i: {:?}, end_i: {:?} }}", - self.array_guard.array.len(), + self.data.len(), self.cur_i, self.end_i ) @@ -53,16 +71,29 @@ impl std::fmt::Debug for GlobalLockDistIter { //#[doc(hidden)] #[derive(Clone)] pub struct GlobalLockLocalIter { - array_guard: GlobalLockReadGuard, + data: GlobalLockArray, + lock: Arc>>>, cur_i: usize, end_i: usize, _marker: PhantomData<&'static T>, } -impl IterClone for GlobalLockLocalIter { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for GlobalLockLocalIter { + fn lock_if_needed(&self, _s: Sealed) -> Option { + if self.lock.lock().is_none() { + let lock_handle = 
self.data.lock.read(); + let lock = self.lock.clone(); + Some(Box::pin(async move { + *lock.lock() = Some(lock_handle.await); + })) + } else { + None + } + } + fn iter_clone(&self, _s: Sealed) -> Self { GlobalLockLocalIter { - array_guard: self.array_guard.clone(), + data: self.data.clone(), + lock: self.lock.clone(), cur_i: self.cur_i, end_i: self.end_i, _marker: PhantomData, @@ -75,7 +106,7 @@ impl std::fmt::Debug for GlobalLockLocalIter { write!( f, "GlobalLockLocalIter{{ data.len: {:?}, cur_i: {:?}, end_i: {:?} }}", - self.array_guard.array.len(), + self.data.len(), self.cur_i, self.end_i ) @@ -85,25 +116,25 @@ impl std::fmt::Debug for GlobalLockLocalIter { impl DistributedIterator for GlobalLockDistIter { type Item = &'static T; type Array = GlobalLockArray; - fn init(&self, start_i: usize, cnt: usize) -> Self { - let max_i = self.array_guard.array.num_elems_local(); + fn init(&self, start_i: usize, cnt: usize, _s: Sealed) -> Self { + let max_i = self.data.num_elems_local(); // println!("init dist iter start_i: {:?} cnt {:?} end_i: {:?} max_i: {:?}",start_i,cnt, start_i+cnt,max_i); GlobalLockDistIter { - array_guard: self.array_guard.clone(), + data: self.data.clone(), + lock: self.lock.clone(), cur_i: std::cmp::min(start_i, max_i), end_i: std::cmp::min(start_i + cnt, max_i), _marker: PhantomData, } } fn array(&self) -> Self::Array { - self.array_guard.array.clone() + self.data.clone() } fn next(&mut self) -> Option { if self.cur_i < self.end_i { self.cur_i += 1; unsafe { - self.array_guard - .array + self.data .array .local_as_ptr() .offset((self.cur_i - 1) as isize) @@ -122,7 +153,7 @@ impl DistributedIterator for GlobalLockDistIter { } impl IndexedDistributedIterator for GlobalLockDistIter { fn iterator_index(&self, index: usize) -> Option { - let g_index = self.array_guard.array.subarray_index_from_local(index, 1); + let g_index = self.data.subarray_index_from_local(index, 1); g_index } } @@ -130,25 +161,25 @@ impl IndexedDistributedIterator for GlobalLockDistIter { impl LocalIterator for GlobalLockLocalIter { type Item = &'static T; type Array = GlobalLockArray; - fn init(&self, start_i: usize, cnt: usize) -> Self { - let max_i = self.array_guard.array.num_elems_local(); + fn init(&self, start_i: usize, cnt: usize, _s: Sealed) -> Self { + let max_i = self.data.num_elems_local(); // println!("init dist iter start_i: {:?} cnt {:?} end_i: {:?} max_i: {:?}",start_i,cnt, start_i+cnt,max_i); GlobalLockLocalIter { - array_guard: self.array_guard.clone(), + data: self.data.clone(), + lock: self.lock.clone(), cur_i: std::cmp::min(start_i, max_i), end_i: std::cmp::min(start_i + cnt, max_i), _marker: PhantomData, } } fn array(&self) -> Self::Array { - self.array_guard.array.clone() + self.data.clone() } fn next(&mut self) -> Option { if self.cur_i < self.end_i { self.cur_i += 1; unsafe { - self.array_guard - .array + self.data .array .local_as_ptr() .offset((self.cur_i - 1) as isize) @@ -169,7 +200,7 @@ impl LocalIterator for GlobalLockLocalIter { impl IndexedLocalIterator for GlobalLockLocalIter { fn iterator_index(&self, index: usize) -> Option { - if index < self.array_guard.array.len() { + if index < self.data.len() { Some(index) //everyone at this point as calculated the actual index (cause we are local only) so just return it } else { None @@ -179,14 +210,26 @@ impl IndexedLocalIterator for GlobalLockLocalIter { pub struct GlobalLockDistIterMut { data: GlobalLockArray, - lock: Arc>, + lock: Arc>>>, cur_i: usize, end_i: usize, _marker: PhantomData<&'static T>, } -impl IterClone for 
GlobalLockDistIterMut { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for GlobalLockDistIterMut { + fn lock_if_needed(&self, _s: Sealed) -> Option { + if self.lock.lock().is_none() { + let lock_handle = self.data.lock.collective_write(); + let lock = self.lock.clone(); + + Some(Box::pin(async move { + *lock.lock() = Some(lock_handle.await); + })) + } else { + None + } + } + fn iter_clone(&self, _s: Sealed) -> Self { GlobalLockDistIterMut { data: self.data.clone(), lock: self.lock.clone(), @@ -211,14 +254,26 @@ impl std::fmt::Debug for GlobalLockDistIterMut { pub struct GlobalLockLocalIterMut { data: GlobalLockArray, - lock: Arc>, + lock: Arc>>>, cur_i: usize, end_i: usize, _marker: PhantomData<&'static T>, } -impl IterClone for GlobalLockLocalIterMut { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for GlobalLockLocalIterMut { + fn lock_if_needed(&self, _s: Sealed) -> Option { + if self.lock.lock().is_none() { + let lock_handle = self.data.lock.write(); + let lock = self.lock.clone(); + + Some(Box::pin(async move { + *lock.lock() = Some(lock_handle.await); + })) + } else { + None + } + } + fn iter_clone(&self, _s: Sealed) -> Self { GlobalLockLocalIterMut { data: self.data.clone(), lock: self.lock.clone(), @@ -244,7 +299,7 @@ impl std::fmt::Debug for GlobalLockLocalIterMut { impl DistributedIterator for GlobalLockDistIterMut { type Item = &'static mut T; type Array = GlobalLockArray; - fn init(&self, start_i: usize, cnt: usize) -> Self { + fn init(&self, start_i: usize, cnt: usize, _s: Sealed) -> Self { let max_i = self.data.num_elems_local(); // println!("init dist iter start_i: {:?} cnt {:?} end_i: {:?} max_i: {:?}",start_i,cnt, start_i+cnt,max_i); GlobalLockDistIterMut { @@ -293,7 +348,7 @@ impl IndexedDistributedIterator for GlobalLockDistIterMut impl LocalIterator for GlobalLockLocalIterMut { type Item = &'static mut T; type Array = GlobalLockArray; - fn init(&self, start_i: usize, cnt: usize) -> Self { + fn init(&self, start_i: usize, cnt: usize, _s: Sealed) -> Self { let max_i = self.data.num_elems_local(); // println!("init dist iter start_i: {:?} cnt {:?} end_i: {:?} max_i: {:?}",start_i,cnt, start_i+cnt,max_i); GlobalLockLocalIterMut { @@ -350,7 +405,46 @@ impl LamellarArrayIterators for GlobalLockReadGuard { fn dist_iter(&self) -> Self::DistIter { GlobalLockDistIter { - array_guard: self.clone(), + data: self.array.clone(), + lock: Arc::new(Mutex::new(Some(self.lock_guard.clone()))), + cur_i: 0, + end_i: 0, + _marker: PhantomData, + } + } + + fn local_iter(&self) -> Self::LocalIter { + GlobalLockLocalIter { + data: self.array.clone(), + lock: Arc::new(Mutex::new(Some(self.lock_guard.clone()))), + cur_i: 0, + end_i: 0, + _marker: PhantomData, + } + } + + fn onesided_iter(&self) -> Self::OnesidedIter { + OneSidedIter::new(self.array.clone().into(), self.array.team_rt().clone(), 1) + } + + fn buffered_onesided_iter(&self, buf_size: usize) -> Self::OnesidedIter { + OneSidedIter::new( + self.array.clone().into(), + self.array.team_rt().clone(), + std::cmp::min(buf_size, self.array.len()), + ) + } +} + +impl LamellarArrayIterators for GlobalLockArray { + type DistIter = GlobalLockDistIter; + type LocalIter = GlobalLockLocalIter; + type OnesidedIter = OneSidedIter<'static, T, GlobalLockArray>; + + fn dist_iter(&self) -> Self::DistIter { + GlobalLockDistIter { + data: self.clone(), + lock: Arc::new(Mutex::new(None)), cur_i: 0, end_i: 0, _marker: PhantomData, @@ -359,7 +453,8 @@ impl LamellarArrayIterators for GlobalLockReadGuard { fn local_iter(&self) -> 
Self::LocalIter { GlobalLockLocalIter { - array_guard: self.clone(), + data: self.clone(), + lock: Arc::new(Mutex::new(None)), cur_i: 0, end_i: 0, _marker: PhantomData, @@ -384,16 +479,9 @@ impl LamellarArrayMutIterators for GlobalLockArray { type LocalIter = GlobalLockLocalIterMut; fn dist_iter_mut(&self) -> Self::DistIter { - let lock: GlobalRwDarc<()> = self.lock.clone(); - let lock = Arc::new( - self.array - .block_on(async move { lock.collective_write().await }), - ); - // self.barrier(); - // println!("dist_iter thread {:?} got lock",std::thread::current().id()); GlobalLockDistIterMut { data: self.clone(), - lock: lock, + lock: Arc::new(Mutex::new(None)), cur_i: 0, end_i: 0, _marker: PhantomData, @@ -401,11 +489,9 @@ impl LamellarArrayMutIterators for GlobalLockArray { } fn local_iter_mut(&self) -> Self::LocalIter { - let lock: GlobalRwDarc<()> = self.lock.clone(); - let lock = Arc::new(self.array.block_on(async move { lock.write().await })); GlobalLockLocalIterMut { data: self.clone(), - lock: lock, + lock: Arc::new(Mutex::new(None)), cur_i: 0, end_i: 0, _marker: PhantomData, diff --git a/src/array/global_lock_atomic/rdma.rs b/src/array/global_lock_atomic/rdma.rs index c38adfa3..dddbefcc 100644 --- a/src/array/global_lock_atomic/rdma.rs +++ b/src/array/global_lock_atomic/rdma.rs @@ -132,7 +132,7 @@ impl LamellarAm for InitGetAm { start_index: self.index, len: self.buf.len(), }; - reqs.push(self.array.exec_am_pe_tg(pe, remote_am)); + reqs.push(self.array.spawn_am_pe_tg(pe, remote_am)); } unsafe { match self.array.array.inner.distribution { @@ -237,7 +237,7 @@ impl LamellarAm for InitPutAm { .into(), pe: self.array.my_pe(), }; - reqs.push(self.array.exec_am_pe_tg(pe, remote_am)); + reqs.push(self.array.spawn_am_pe_tg(pe, remote_am)); } else { let remote_am = GlobalLockRemoteSmallPutAm { array: self.array.clone().into(), //inner of the indices we need to place data into @@ -247,7 +247,7 @@ impl LamellarAm for InitPutAm { [cur_index..(cur_index + u8_buf_len)] .to_vec(), }; - reqs.push(self.array.exec_am_pe_tg(pe, remote_am)); + reqs.push(self.array.spawn_am_pe_tg(pe, remote_am)); } cur_index += u8_buf_len; } else { @@ -302,7 +302,7 @@ impl LamellarAm for InitPutAm { len: self.buf.len(), data: vec, }; - reqs.push(self.array.exec_am_pe_tg(pe, remote_am)); + reqs.push(self.array.spawn_am_pe_tg(pe, remote_am)); } } } diff --git a/src/array/handle.rs b/src/array/handle.rs index c80eb0af..cdfc9e74 100644 --- a/src/array/handle.rs +++ b/src/array/handle.rs @@ -40,7 +40,11 @@ impl ArrayRdmaHandle { /// /// This function returns a handle that can be used to wait for the operation to complete #[must_use = "this function returns a future used to poll for completion. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] - pub fn spawn(self) -> LamellarTask<()> { + pub fn spawn(mut self) -> LamellarTask<()> { + for req in self.reqs.iter_mut() { + req.launch(); + } + self.spawned = true; self.array.team().spawn(self) } @@ -56,6 +60,12 @@ impl ArrayRdmaHandle { } impl LamellarRequest for ArrayRdmaHandle { + fn launch(&mut self) -> Self::Output { + for req in self.reqs.iter_mut() { + req.launch(); + } + self.spawned = true; + } fn blocking_wait(mut self) -> Self::Output { self.spawned = true; for req in self.reqs.drain(0..) 
{ @@ -124,6 +134,9 @@ impl ArrayRdmaAtHandle { /// This function returns a handle that can be used to wait for the operation to complete #[must_use = "this function returns a future used to poll for completion and retrieve the result. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] pub fn spawn(mut self) -> LamellarTask { + if let Some(req) = &mut self.req { + req.launch(); + } self.spawned = true; self.array.team().spawn(self) } @@ -141,6 +154,12 @@ impl ArrayRdmaAtHandle { } impl LamellarRequest for ArrayRdmaAtHandle { + fn launch(&mut self) { + if let Some(req) = &mut self.req { + req.launch(); + } + self.spawned = true; + } fn blocking_wait(mut self) -> Self::Output { self.spawned = true; if let Some(req) = self.req.take() { @@ -167,7 +186,8 @@ impl LamellarRequest for ArrayRdmaAtHandle { impl Future for ArrayRdmaAtHandle { type Output = T; - fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + self.spawned = true; let mut this = self.project(); match &mut this.req { Some(req) => { diff --git a/src/array/iterator/consumer.rs b/src/array/iterator/consumer.rs index 83150174..9826e737 100644 --- a/src/array/iterator/consumer.rs +++ b/src/array/iterator/consumer.rs @@ -14,9 +14,11 @@ use std::pin::Pin; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; +use super::private::Sealed; + // trait Consumer{ // type Item; -// fn init(&self, start: usize, cnt: usize) -> Self; +// fn init(&self, start: usize, cnt: usize, _s: Sealed) -> Self; // fn monotonic(&self) -> Self; // fn next(&self) -> Self::Item; // } diff --git a/src/array/iterator/distributed_iterator.rs b/src/array/iterator/distributed_iterator.rs index fe7ee6d4..a515a195 100644 --- a/src/array/iterator/distributed_iterator.rs +++ b/src/array/iterator/distributed_iterator.rs @@ -48,6 +48,8 @@ use std::marker::PhantomData; use std::pin::Pin; use std::sync::Arc; +use super::IterLockFuture; + macro_rules! consumer_impl { ($name:ident<$($generics:ident),*>($($arg:ident : $arg_ty:ty),*); [$($return_type: tt)*]; [$($bounds:tt)+] ) => { fn $name<$($generics),*>(&self, $($arg : $arg_ty),*) -> $($return_type)* @@ -148,7 +150,7 @@ pub trait DistIteratorLauncher: InnerArray { /// The functions in this trait are available on all distributed iterators. /// Additonaly functionality can be found in the [IndexedDistributedIterator] trait: /// these methods are only available for distributed iterators where the number of elements is known in advance (e.g. after invoking `filter` these methods would be unavailable) -pub trait DistributedIterator: SyncSend + IterClone + 'static { +pub trait DistributedIterator: SyncSend + InnerIter + 'static { /// The type of item this distributed iterator produces type Item: Send; @@ -156,7 +158,8 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { type Array: DistIteratorLauncher; /// Internal method used to initalize this distributed iterator to the correct element and correct length. 
- fn init(&self, start_i: usize, cnt: usize) -> Self; + #[doc(hidden)] + fn init(&self, start_i: usize, cnt: usize, sealed: Sealed) -> Self; /// Return the original array this distributed iterator belongs too fn array(&self) -> Self::Array; @@ -769,7 +772,7 @@ pub trait DistributedIterator: SyncSend + IterClone + 'static { } /// An interface for dealing with distributed iterators which are indexable, meaning it returns an iterator of known length -pub trait IndexedDistributedIterator: DistributedIterator + SyncSend + IterClone + 'static { +pub trait IndexedDistributedIterator: DistributedIterator + SyncSend + InnerIter + 'static { /// yields the global array index along with each element /// /// # Examples @@ -894,8 +897,11 @@ pub struct DistIter<'a, T: Dist + 'static, A: LamellarArray> { _marker: PhantomData<&'a T>, } -impl<'a, T: Dist, A: LamellarArray> IterClone for DistIter<'a, T, A> { - fn iter_clone(&self, _: Sealed) -> Self { +impl<'a, T: Dist, A: LamellarArray> InnerIter for DistIter<'a, T, A> { + fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } + fn iter_clone(&self, _s: Sealed) -> Self { DistIter { data: self.data.clone(), cur_i: self.cur_i, @@ -936,7 +942,7 @@ impl< { type Item = &'static T; type Array = A; - fn init(&self, start_i: usize, cnt: usize) -> Self { + fn init(&self, start_i: usize, cnt: usize, _s: Sealed) -> Self { let max_i = self.data.num_elems_local(); // println!("{:?} DistIter init {start_i} {cnt} {} {}",std::thread::current().id(), start_i+cnt,max_i); DistIter { @@ -1006,8 +1012,11 @@ pub struct DistIterMut<'a, T: Dist, A: LamellarArray> { _marker: PhantomData<&'a T>, } -impl<'a, T: Dist, A: LamellarArray> IterClone for DistIterMut<'a, T, A> { - fn iter_clone(&self, _: Sealed) -> Self { +impl<'a, T: Dist, A: LamellarArray> InnerIter for DistIterMut<'a, T, A> { + fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } + fn iter_clone(&self, _s: Sealed) -> Self { DistIterMut { data: self.data.clone(), cur_i: self.cur_i, @@ -1047,7 +1056,7 @@ impl< { type Item = &'static mut T; type Array = A; - fn init(&self, start_i: usize, cnt: usize) -> Self { + fn init(&self, start_i: usize, cnt: usize, _s: Sealed) -> Self { let max_i = self.data.num_elems_local(); // println!("dist iter init {:?} {:?} {:?}",start_i,cnt,max_i); DistIterMut { diff --git a/src/array/iterator/distributed_iterator/chunks.rs b/src/array/iterator/distributed_iterator/chunks.rs index c8d9dc9f..a1e7de9a 100644 --- a/src/array/iterator/distributed_iterator/chunks.rs +++ b/src/array/iterator/distributed_iterator/chunks.rs @@ -29,11 +29,14 @@ where { type Item = Chunk; type Array = ::Array; - fn init(&self, start_i: usize, cnt: usize) -> Chunks { + fn init(&self, start_i: usize, cnt: usize, _s: Sealed) -> Chunks { // println!("init chunks start_i: {:?} cnt {:?} end_i: {:?} chunk_size: {:?} chunk_size(): {:?}",start_i,cnt, start_i+cnt,self.chunk_size,self.chunk_size()); Chunks::new( - self.iter - .init(start_i * self.chunk_size, (start_i + cnt) * self.chunk_size), + self.iter.init( + start_i * self.chunk_size, + (start_i + cnt) * self.chunk_size, + _s, + ), start_i, cnt, self.chunk_size, @@ -47,7 +50,10 @@ where if self.cur_i < self.end_i { // let size = std::cmp::min(self.chunk_size, self.end_i-self.cur_i); let start_i = self.cur_i * self.chunk_size; - let iter = self.iter.iter_clone(Sealed).init(start_i, self.chunk_size); + let iter = self + .iter + .iter_clone(Sealed) + .init(start_i, self.chunk_size, _s); // println!("new Chunk {:?} {:?} {:?} {:?}",self.cur_i, self.end_i, 
start_i,start_i+self.chunk_size); let chunk = Chunk { iter: iter }; self.cur_i += 1; diff --git a/src/array/iterator/distributed_iterator/consumer/collect.rs b/src/array/iterator/distributed_iterator/consumer/collect.rs index 1a6abdf2..a1766780 100644 --- a/src/array/iterator/distributed_iterator/consumer/collect.rs +++ b/src/array/iterator/distributed_iterator/consumer/collect.rs @@ -1,7 +1,7 @@ use crate::active_messaging::{LamellarArcLocalAm, SyncSend}; -use crate::array::iterator::consumer::*; use crate::array::iterator::distributed_iterator::{DistributedIterator, Monotonic}; use crate::array::iterator::private::*; +use crate::array::iterator::{consumer::*, IterLockFuture}; use crate::array::operations::ArrayOps; use crate::array::r#unsafe::private::UnsafeArrayInner; use crate::array::{AsyncTeamFrom, AsyncTeamInto, Distribution, TeamInto}; @@ -28,8 +28,11 @@ pub(crate) struct Collect { pub(crate) _phantom: PhantomData, } -impl IterClone for Collect { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for Collect { + fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } + fn iter_clone(&self, _s: Sealed) -> Self { Collect { iter: self.iter.iter_clone(Sealed), distribution: self.distribution.clone(), @@ -50,7 +53,7 @@ where type Handle = InnerDistIterCollectHandle; fn init(&self, start: usize, cnt: usize) -> Self { Collect { - iter: self.iter.init(start, cnt), + iter: self.iter.init(start, cnt, Sealed), distribution: self.distribution.clone(), _phantom: self._phantom.clone(), } @@ -89,8 +92,11 @@ pub(crate) struct CollectAsync { pub(crate) _phantom: PhantomData<(A, B)>, } -impl IterClone for CollectAsync { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for CollectAsync { + fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } + fn iter_clone(&self, _s: Sealed) -> Self { CollectAsync { iter: self.iter.iter_clone(Sealed), distribution: self.distribution.clone(), @@ -112,7 +118,7 @@ where type Handle = InnerDistIterCollectHandle; fn init(&self, start: usize, cnt: usize) -> Self { CollectAsync { - iter: self.iter.init(start, cnt), + iter: self.iter.init(start, cnt, Sealed), distribution: self.distribution.clone(), _phantom: self._phantom.clone(), } @@ -270,14 +276,18 @@ where A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + 'static, { pub(crate) fn new( - barrier_handle: BarrierHandle, + lock: Option, inner: Pin> + Send>>, array: &UnsafeArrayInner, ) -> Self { + let state = match lock { + Some(inner_lock) => State::Lock(inner_lock, Some(inner)), + None => State::Barrier(array.barrier_handle(), inner), + }; Self { array: array.clone(), launched: false, - state: State::Barrier(barrier_handle, inner), + state, } } @@ -305,6 +315,10 @@ where #[pin_project(project = StateProj)] enum State { + Lock( + #[pin] IterLockFuture, + Option> + Send>>>, + ), Barrier( #[pin] BarrierHandle, Pin> + Send>>, @@ -322,6 +336,16 @@ where self.launched = true; let mut this = self.project(); match this.state.as_mut().project() { + StateProj::Lock(lock, inner) => { + ready!(lock.poll(cx)); + let barrier = this.array.barrier_handle(); + *this.state = State::Barrier( + barrier, + inner.take().expect("reqs should still be in this state"), + ); + cx.waker().wake_by_ref(); + Poll::Pending + } StateProj::Barrier(barrier, inner) => { ready!(barrier.poll(cx)); let mut inner = ready!(Future::poll(inner.as_mut(), cx)); diff --git a/src/array/iterator/distributed_iterator/consumer/count.rs b/src/array/iterator/distributed_iterator/consumer/count.rs index ca826655..d3a7a7b6 100644 --- 
a/src/array/iterator/distributed_iterator/consumer/count.rs +++ b/src/array/iterator/distributed_iterator/consumer/count.rs @@ -1,7 +1,7 @@ use crate::active_messaging::LamellarArcLocalAm; -use crate::array::iterator::consumer::*; use crate::array::iterator::distributed_iterator::DistributedIterator; use crate::array::iterator::private::*; +use crate::array::iterator::{consumer::*, IterLockFuture}; use crate::array::r#unsafe::private::UnsafeArrayInner; use crate::barrier::BarrierHandle; @@ -28,8 +28,11 @@ pub(crate) struct Count { pub(crate) iter: I, } -impl IterClone for Count { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for Count { + fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } + fn iter_clone(&self, _s: Sealed) -> Self { Count { iter: self.iter.iter_clone(Sealed), } @@ -46,7 +49,7 @@ where type Handle = InnerDistIterCountHandle; fn init(&self, start: usize, cnt: usize) -> Self { Count { - iter: self.iter.init(start, cnt), + iter: self.iter.init(start, cnt, Sealed), } } fn next(&mut self) -> Option { @@ -192,14 +195,18 @@ impl PinnedDrop for DistIterCountHandle { impl DistIterCountHandle { pub(crate) fn new( - barrier_handle: BarrierHandle, + lock: Option, inner: Pin + Send>>, array: &UnsafeArrayInner, ) -> Self { + let state = match lock { + Some(inner_lock) => State::Lock(inner_lock, Some(inner)), + None => State::Barrier(array.barrier_handle(), inner), + }; Self { array: array.clone(), launched: false, - state: State::Barrier(barrier_handle, inner), + state, } } @@ -227,6 +234,10 @@ impl DistIterCountHandle { #[pin_project(project = StateProj)] enum State { + Lock( + #[pin] IterLockFuture, + Option + Send>>>, + ), Barrier( #[pin] BarrierHandle, Pin + Send>>, @@ -240,6 +251,16 @@ impl Future for DistIterCountHandle { self.launched = true; let mut this = self.project(); match this.state.as_mut().project() { + StateProj::Lock(lock, inner) => { + ready!(lock.poll(cx)); + let mut barrier = this.array.barrier_handle(); + *this.state = State::Barrier( + barrier, + inner.take().expect("reqs should still be in this state"), + ); + cx.waker().wake_by_ref(); + Poll::Pending + } StateProj::Barrier(barrier, inner) => { ready!(barrier.poll(cx)); let mut inner = ready!(Future::poll(inner.as_mut(), cx)); @@ -266,11 +287,14 @@ pub(crate) struct CountAm { pub(crate) schedule: IterSchedule, } -impl IterClone for CountAm +impl InnerIter for CountAm where - I: IterClone, + I: InnerIter, { - fn iter_clone(&self, _: Sealed) -> Self { + fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } + fn iter_clone(&self, _s: Sealed) -> Self { CountAm { iter: self.iter.iter_clone(Sealed), schedule: self.schedule.clone(), diff --git a/src/array/iterator/distributed_iterator/consumer/for_each.rs b/src/array/iterator/distributed_iterator/consumer/for_each.rs index 750100ee..c83d781c 100644 --- a/src/array/iterator/distributed_iterator/consumer/for_each.rs +++ b/src/array/iterator/distributed_iterator/consumer/for_each.rs @@ -1,7 +1,7 @@ use crate::active_messaging::{LamellarArcLocalAm, SyncSend}; -use crate::array::iterator::consumer::*; use crate::array::iterator::distributed_iterator::DistributedIterator; use crate::array::iterator::private::*; +use crate::array::iterator::{consumer::*, IterLockFuture}; use crate::array::r#unsafe::private::UnsafeArrayInner; use crate::barrier::BarrierHandle; use crate::lamellar_request::LamellarRequest; @@ -27,12 +27,15 @@ where pub(crate) op: F, } -impl IterClone for ForEach +impl InnerIter for ForEach where I: DistributedIterator + 'static, F: 
Fn(I::Item) + SyncSend + Clone + 'static, { - fn iter_clone(&self, _: Sealed) -> Self { + fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } + fn iter_clone(&self, _s: Sealed) -> Self { ForEach { iter: self.iter.iter_clone(Sealed), op: self.op.clone(), @@ -51,7 +54,7 @@ where type Handle = InnerDistIterForEachHandle; fn init(&self, start: usize, cnt: usize) -> Self { ForEach { - iter: self.iter.init(start, cnt), + iter: self.iter.init(start, cnt, Sealed), op: self.op.clone(), } } @@ -92,13 +95,16 @@ where // pub(crate) _phantom: PhantomData, } -impl IterClone for ForEachAsync +impl InnerIter for ForEachAsync where I: DistributedIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, Fut: Future + Send + 'static, { - fn iter_clone(&self, _: Sealed) -> Self { + fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } + fn iter_clone(&self, _s: Sealed) -> Self { ForEachAsync { iter: self.iter.iter_clone(Sealed), op: self.op.clone(), @@ -118,7 +124,7 @@ where type Handle = InnerDistIterForEachHandle; fn init(&self, start: usize, cnt: usize) -> Self { ForEachAsync { - iter: self.iter.init(start, cnt), + iter: self.iter.init(start, cnt, Sealed), op: self.op.clone(), } } @@ -210,14 +216,18 @@ impl PinnedDrop for DistIterForEachHandle { impl DistIterForEachHandle { pub(crate) fn new( - barrier: BarrierHandle, + lock: Option, reqs: Pin + Send>>, array: &UnsafeArrayInner, ) -> Self { + let state = match lock { + Some(inner_lock) => State::Lock(inner_lock, Some(reqs)), + None => State::Barrier(array.barrier_handle(), reqs), + }; DistIterForEachHandle { array: array.clone(), launched: false, - state: State::Barrier(barrier, reqs), + state, } } @@ -244,6 +254,10 @@ impl DistIterForEachHandle { #[pin_project(project = StateProj)] enum State { + Lock( + #[pin] IterLockFuture, + Option + Send>>>, + ), Barrier( #[pin] BarrierHandle, Pin + Send>>, @@ -258,6 +272,16 @@ impl Future for DistIterForEachHandle { self.launched = true; let mut this = self.project(); match this.state.as_mut().project() { + StateProj::Lock(lock, inner) => { + ready!(lock.poll(cx)); + let barrier = this.array.barrier_handle(); + *this.state = State::Barrier( + barrier, + inner.take().expect("reqs should still be in this state"), + ); + cx.waker().wake_by_ref(); + Poll::Pending + } StateProj::Barrier(barrier, inner) => { let barrier_id = barrier.barrier_id; // println!("in task barrier {:?}", barrier_id); @@ -314,12 +338,15 @@ where pub(crate) schedule: IterSchedule, } -impl IterClone for ForEachAm +impl InnerIter for ForEachAm where I: DistributedIterator + 'static, F: Fn(I::Item) + SyncSend + Clone + 'static, { - fn iter_clone(&self, _: Sealed) -> Self { + fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } + fn iter_clone(&self, _s: Sealed) -> Self { ForEachAm { op: self.op.clone(), iter: self.iter.iter_clone(Sealed), @@ -356,13 +383,16 @@ where // pub(crate) _phantom: PhantomData } -impl IterClone for ForEachAsyncAm +impl InnerIter for ForEachAsyncAm where I: DistributedIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, Fut: Future + Send + 'static, { - fn iter_clone(&self, _: Sealed) -> Self { + fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } + fn iter_clone(&self, _s: Sealed) -> Self { ForEachAsyncAm { op: self.op.clone(), iter: self.iter.iter_clone(Sealed), diff --git a/src/array/iterator/distributed_iterator/consumer/reduce.rs b/src/array/iterator/distributed_iterator/consumer/reduce.rs index 5cbac26b..bb68a949 100644 --- 
a/src/array/iterator/distributed_iterator/consumer/reduce.rs +++ b/src/array/iterator/distributed_iterator/consumer/reduce.rs @@ -1,8 +1,8 @@ use crate::active_messaging::{LamellarArcLocalAm, SyncSend}; -use crate::array::iterator::consumer::*; use crate::array::iterator::distributed_iterator::DistributedIterator; use crate::array::iterator::one_sided_iterator::OneSidedIterator; use crate::array::iterator::private::*; +use crate::array::iterator::{consumer::*, IterLockFuture}; use crate::array::r#unsafe::private::UnsafeArrayInner; use crate::array::{ArrayOps, Distribution, UnsafeArray}; use crate::barrier::BarrierHandle; @@ -26,8 +26,11 @@ pub(crate) struct Reduce { pub(crate) op: F, } -impl IterClone for Reduce { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for Reduce { + fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } + fn iter_clone(&self, _s: Sealed) -> Self { Reduce { iter: self.iter.iter_clone(Sealed), op: self.op.clone(), @@ -47,7 +50,7 @@ where type Handle = InnerDistIterReduceHandle; fn init(&self, start: usize, cnt: usize) -> Self { Reduce { - iter: self.iter.init(start, cnt), + iter: self.iter.init(start, cnt, Sealed), op: self.op.clone(), } } @@ -220,14 +223,18 @@ where F: Fn(T, T) -> T + SyncSend + Clone + 'static, { pub(crate) fn new( - barrier: BarrierHandle, + lock: Option, reqs: Pin> + Send>>, array: &UnsafeArrayInner, ) -> Self { + let state = match lock { + Some(inner_lock) => State::Lock(inner_lock, Some(reqs)), + None => State::Barrier(array.barrier_handle(), reqs), + }; Self { array: array.clone(), launched: false, - state: State::Barrier(barrier, reqs), + state, } } @@ -255,6 +262,10 @@ where #[pin_project(project = StateProj)] enum State { + Lock( + #[pin] IterLockFuture, + Option> + Send>>>, + ), Barrier( #[pin] BarrierHandle, Pin> + Send>>, @@ -272,6 +283,16 @@ where self.launched = true; let mut this = self.project(); match this.state.as_mut().project() { + StateProj::Lock(lock, inner) => { + ready!(lock.poll(cx)); + let barrier = this.array.barrier_handle(); + *this.state = State::Barrier( + barrier, + inner.take().expect("reqs should still be in this state"), + ); + cx.waker().wake_by_ref(); + Poll::Pending + } StateProj::Barrier(barrier, inner) => { ready!(barrier.poll(cx)); let mut inner = ready!(Future::poll(inner.as_mut(), cx)); @@ -299,8 +320,11 @@ pub(crate) struct ReduceAm { pub(crate) schedule: IterSchedule, } -impl IterClone for ReduceAm { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for ReduceAm { + fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } + fn iter_clone(&self, _s: Sealed) -> Self { ReduceAm { op: self.op.clone(), iter: self.iter.iter_clone(Sealed), diff --git a/src/array/iterator/distributed_iterator/consumer/sum.rs b/src/array/iterator/distributed_iterator/consumer/sum.rs index 89bcb812..a575739e 100644 --- a/src/array/iterator/distributed_iterator/consumer/sum.rs +++ b/src/array/iterator/distributed_iterator/consumer/sum.rs @@ -1,7 +1,7 @@ use crate::active_messaging::LamellarArcLocalAm; -use crate::array::iterator::consumer::*; use crate::array::iterator::distributed_iterator::DistributedIterator; use crate::array::iterator::private::*; +use crate::array::iterator::{consumer::*, IterLockFuture}; use crate::array::r#unsafe::private::UnsafeArrayInner; use crate::array::{ArrayOps, Distribution, UnsafeArray}; use crate::barrier::BarrierHandle; @@ -23,8 +23,11 @@ pub(crate) struct Sum { pub(crate) iter: I, } -impl IterClone for Sum { - fn iter_clone(&self, _: Sealed) -> Self { +impl 
InnerIter for Sum { + fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } + fn iter_clone(&self, _s: Sealed) -> Self { Sum { iter: self.iter.iter_clone(Sealed), } @@ -42,7 +45,7 @@ where type Handle = InnerDistIterSumHandle; fn init(&self, start: usize, cnt: usize) -> Self { Sum { - iter: self.iter.init(start, cnt), + iter: self.iter.init(start, cnt, Sealed), } } fn next(&mut self) -> Option { @@ -170,38 +173,6 @@ where } } } -//#[doc(hidden)] -// impl LamellarRequest for InnerDistIterSumHandle -// where -// T: Dist + ArrayOps + std::iter::Sum, -// { -// fn blocking_wait(mut self) -> Self::Output { -// let local_sums = UnsafeArray::::new(&self.team, self.team.num_pes, Distribution::Block); -// let local_sum = self -// .reqs -// .drain(..) -// .map(|req| req.blocking_wait()) -// .into_iter() -// .sum(); -// self.reduce_remote_vals(local_sum, local_sums) -// } - -// fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { -// for req in self.reqs.iter_mut() { -// if !req.ready_or_set_waker(waker) { -// //only need to wait on the next unready req -// return false; -// } -// } -// true -// } - -// fn val(&self) -> Self::Output { -// let local_sums = UnsafeArray::::new(&self.team, self.team.num_pes, Distribution::Block); -// let local_sum = self.reqs.iter().map(|req| req.val()).into_iter().sum(); -// self.reduce_remote_vals(local_sum, local_sums) -// } -// } #[pin_project(PinnedDrop)] pub struct DistIterSumHandle { @@ -229,14 +200,18 @@ where T: Dist + ArrayOps + std::iter::Sum, { pub(crate) fn new( - barrier_handle: BarrierHandle, + lock: Option, inner: Pin> + Send>>, array: &UnsafeArrayInner, ) -> Self { + let state = match lock { + Some(inner_lock) => State::Lock(inner_lock, Some(inner)), + None => State::Barrier(array.barrier_handle(), inner), + }; Self { array: array.clone(), launched: false, - state: State::Barrier(barrier_handle, inner), + state, } } @@ -264,6 +239,10 @@ where #[pin_project(project = StateProj)] enum State { + Lock( + #[pin] IterLockFuture, + Option> + Send>>>, + ), Barrier( #[pin] BarrierHandle, Pin> + Send>>, @@ -279,7 +258,18 @@ where fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { self.launched = true; let mut this = self.project(); + match this.state.as_mut().project() { + StateProj::Lock(lock, inner) => { + ready!(lock.poll(cx)); + let barrier = this.array.barrier_handle(); + *this.state = State::Barrier( + barrier, + inner.take().expect("reqs should still be in this state"), + ); + cx.waker().wake_by_ref(); + Poll::Pending + } StateProj::Barrier(barrier, inner) => { ready!(barrier.poll(cx)); let mut inner = ready!(Future::poll(inner.as_mut(), cx)); @@ -300,56 +290,17 @@ where } } -//#[doc(hidden)] -// impl LamellarRequest for DistIterSumHandle -// where -// T: Dist + ArrayOps + std::iter::Sum, -// { -// fn blocking_wait(mut self) -> Self::Output { -// self.launched = true; -// let state = std::mem::replace(&mut self.state, State::Dropped); -// match state { -// State::Barrier(barrier, reqs) => { -// barrier.blocking_wait(); -// self.team.block_on(reqs).blocking_wait() -// } -// State::Reqs(inner) => inner.blocking_wait(), -// State::Dropped => panic!("called `blocking_wait` on a future that was dropped"), -// } -// } -// fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { -// self.launched = true; -// match &mut self.state { -// State::Barrier(barrier, _) => { -// if !barrier.ready_or_set_waker(waker) { -// return false; -// } -// waker.wake_by_ref(); -// false -// } -// State::Reqs(inner) => inner.ready_or_set_waker(waker), -// 
State::Dropped => panic!("called `ready_or_set_waker` on a future that was dropped"), -// } -// } -// fn val(&self) -> Self::Output { -// match &self.state { -// State::Barrier(_barrier, _reqs) => { -// unreachable!("should never be in barrier state when val is called"); -// } -// State::Reqs(inner) => inner.val(), -// State::Dropped => panic!("called `val` on a future that was dropped"), -// } -// } -// } - #[lamellar_impl::AmLocalDataRT(Clone)] pub(crate) struct SumAm { pub(crate) iter: Sum, pub(crate) schedule: IterSchedule, } -impl IterClone for SumAm { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for SumAm { + fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } + fn iter_clone(&self, _s: Sealed) -> Self { SumAm { iter: self.iter.iter_clone(Sealed), schedule: self.schedule.clone(), diff --git a/src/array/iterator/distributed_iterator/enumerate.rs b/src/array/iterator/distributed_iterator/enumerate.rs index b4ece0d6..14946b11 100644 --- a/src/array/iterator/distributed_iterator/enumerate.rs +++ b/src/array/iterator/distributed_iterator/enumerate.rs @@ -1,4 +1,4 @@ -use crate::array::iterator::distributed_iterator::*; +use crate::array::iterator::{distributed_iterator::*, IterLockFuture}; #[derive(Clone, Debug)] pub struct Enumerate { @@ -6,8 +6,11 @@ pub struct Enumerate { cur_index: usize, } -impl IterClone for Enumerate { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for Enumerate { + fn lock_if_needed(&self, _s: Sealed) -> Option { + self.iter.lock_if_needed(_s) + } + fn iter_clone(&self, _s: Sealed) -> Self { Enumerate { iter: self.iter.iter_clone(Sealed), cur_index: self.cur_index, @@ -31,8 +34,8 @@ where { type Item = (usize, ::Item); type Array = ::Array; - fn init(&self, start_i: usize, cnt: usize) -> Enumerate { - let iter = self.iter.init(start_i, cnt); + fn init(&self, start_i: usize, cnt: usize, _s: Sealed) -> Enumerate { + let iter = self.iter.init(start_i, cnt, _s); let val = Enumerate::new(iter, start_i); // println!("{:?} Enumerate init {start_i} {cnt} {start_i}",std::thread::current().id()); val diff --git a/src/array/iterator/distributed_iterator/filter.rs b/src/array/iterator/distributed_iterator/filter.rs index 417a6f7d..8b8330b9 100644 --- a/src/array/iterator/distributed_iterator/filter.rs +++ b/src/array/iterator/distributed_iterator/filter.rs @@ -1,4 +1,4 @@ -use crate::array::iterator::distributed_iterator::*; +use crate::array::iterator::{distributed_iterator::*, IterLockFuture}; #[derive(Clone, Debug)] pub struct Filter { @@ -6,8 +6,11 @@ pub struct Filter { f: F, } -impl IterClone for Filter { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for Filter { + fn lock_if_needed(&self, _s: Sealed) -> Option { + self.iter.lock_if_needed(_s) + } + fn iter_clone(&self, _s: Sealed) -> Self { Filter { iter: self.iter.iter_clone(Sealed), f: self.f.clone(), @@ -32,8 +35,8 @@ where { type Item = I::Item; type Array = ::Array; - fn init(&self, start_i: usize, cnt: usize) -> Filter { - let val = Filter::new(self.iter.init(start_i, cnt), self.f.clone()); + fn init(&self, start_i: usize, cnt: usize, _s: Sealed) -> Filter { + let val = Filter::new(self.iter.init(start_i, cnt, _s), self.f.clone()); // println!("{:?} Filter init {start_i} {cnt}",std::thread::current().id()); val } diff --git a/src/array/iterator/distributed_iterator/filter_map.rs b/src/array/iterator/distributed_iterator/filter_map.rs index 9c168799..f44b1062 100644 --- a/src/array/iterator/distributed_iterator/filter_map.rs +++ 
b/src/array/iterator/distributed_iterator/filter_map.rs @@ -1,4 +1,4 @@ -use crate::array::iterator::distributed_iterator::*; +use crate::array::iterator::{distributed_iterator::*, IterLockFuture}; #[derive(Clone, Debug)] pub struct FilterMap { @@ -6,8 +6,11 @@ pub struct FilterMap { f: F, } -impl IterClone for FilterMap { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for FilterMap { + fn lock_if_needed(&self, _s: Sealed) -> Option { + self.iter.lock_if_needed(_s) + } + fn iter_clone(&self, _s: Sealed) -> Self { FilterMap { iter: self.iter.iter_clone(Sealed), f: self.f.clone(), @@ -54,9 +57,9 @@ where { type Item = B; type Array = ::Array; - fn init(&self, start_i: usize, cnt: usize) -> FilterMap { + fn init(&self, start_i: usize, cnt: usize, _s: Sealed) -> FilterMap { // println!("init enumerate start_i: {:?} cnt {:?} end_i {:?}",start_i, cnt, start_i+cnt ); - FilterMap::new(self.iter.init(start_i, cnt), self.f.clone()) + FilterMap::new(self.iter.init(start_i, cnt, _s), self.f.clone()) } fn array(&self) -> Self::Array { self.iter.array() diff --git a/src/array/iterator/distributed_iterator/map.rs b/src/array/iterator/distributed_iterator/map.rs index c2293d89..a664b959 100644 --- a/src/array/iterator/distributed_iterator/map.rs +++ b/src/array/iterator/distributed_iterator/map.rs @@ -1,4 +1,4 @@ -use crate::array::iterator::distributed_iterator::*; +use crate::array::iterator::{distributed_iterator::*, IterLockFuture}; #[derive(Clone, Debug)] pub struct Map { @@ -6,8 +6,11 @@ pub struct Map { f: F, } -impl IterClone for Map { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for Map { + fn lock_if_needed(&self, _s: Sealed) -> Option { + self.iter.lock_if_needed(_s) + } + fn iter_clone(&self, _s: Sealed) -> Self { Map { iter: self.iter.iter_clone(Sealed), f: self.f.clone(), @@ -33,9 +36,9 @@ where { type Item = B; type Array = ::Array; - fn init(&self, start_i: usize, cnt: usize) -> Map { + fn init(&self, start_i: usize, cnt: usize, _s: Sealed) -> Map { // println!("init enumerate start_i: {:?} cnt {:?} end_i {:?}",start_i, cnt, start_i+cnt ); - Map::new(self.iter.init(start_i, cnt), self.f.clone()) + Map::new(self.iter.init(start_i, cnt, _s), self.f.clone()) } fn array(&self) -> Self::Array { self.iter.array() diff --git a/src/array/iterator/distributed_iterator/monotonic.rs b/src/array/iterator/distributed_iterator/monotonic.rs index fad72be4..d250fb37 100644 --- a/src/array/iterator/distributed_iterator/monotonic.rs +++ b/src/array/iterator/distributed_iterator/monotonic.rs @@ -1,4 +1,4 @@ -use crate::array::iterator::distributed_iterator::*; +use crate::array::iterator::{distributed_iterator::*, IterLockFuture}; #[derive(Clone, Debug)] pub struct Monotonic { @@ -6,8 +6,11 @@ pub struct Monotonic { cur_index: usize, } -impl IterClone for Monotonic { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for Monotonic { + fn lock_if_needed(&self, _s: Sealed) -> Option { + self.iter.lock_if_needed(_s) + } + fn iter_clone(&self, _s: Sealed) -> Self { Monotonic { iter: self.iter.iter_clone(Sealed), cur_index: self.cur_index, @@ -31,8 +34,8 @@ where { type Item = (usize, ::Item); type Array = ::Array; - fn init(&self, start_i: usize, cnt: usize) -> Monotonic { - let val = Monotonic::new(self.iter.init(start_i, cnt), start_i); + fn init(&self, start_i: usize, cnt: usize, _s: Sealed) -> Monotonic { + let val = Monotonic::new(self.iter.init(start_i, cnt, _s), start_i); // println!("{:?} Monotonic init {start_i} {cnt} {start_i}",std::thread::current().id()); val } 
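The hunks above and below all apply the same refactor: the internal IterClone trait becomes InnerIter, whose lock_if_needed method lets an iterator defer acquiring its array's lock until the consuming handle is actually polled. Adapters (Enumerate, Filter, FilterMap, Map, Monotonic, Take) forward the call to the iterator they wrap, leaf iterators over lock-free data return None, and the GlobalLockArray iterators cache the eventual guard in an Arc<Mutex<Option<..>>> shared by every clone. The sketch below illustrates that shape under simplified assumptions; InnerIterSketch, LockFuture, Leaf, GuardedLeaf, and Enumerate2 are names invented here for illustration, not the crate's actual types or signatures.

use std::future::Future;
use std::pin::Pin;
use std::sync::{Arc, Mutex};

// Stand-in for the crate's IterLockFuture alias: a boxed future that
// resolves once the required lock guard has been acquired.
type LockFuture = Pin<Box<dyn Future<Output = ()> + Send>>;

trait InnerIterSketch {
    // Returns Some(future) only if a lock still needs to be taken.
    fn lock_if_needed(&self) -> Option<LockFuture>;
    fn iter_clone(&self) -> Self
    where
        Self: Sized;
}

// A leaf over lock-free data (think of an AtomicArray iterator): no lock needed.
struct Leaf;

impl InnerIterSketch for Leaf {
    fn lock_if_needed(&self) -> Option<LockFuture> {
        None
    }
    fn iter_clone(&self) -> Self {
        Leaf
    }
}

// A leaf over lock-protected data: the guard is acquired at most once and
// shared between clones, echoing the GlobalLockDistIter changes above
// (the real code stores the actual read/collective-write guard, not ()).
struct GuardedLeaf {
    guard: Arc<Mutex<Option<()>>>,
}

impl InnerIterSketch for GuardedLeaf {
    fn lock_if_needed(&self) -> Option<LockFuture> {
        if self.guard.lock().unwrap().is_none() {
            let guard = self.guard.clone();
            Some(Box::pin(async move {
                // In the crate this awaits lock.read() / lock.collective_write();
                // here we just record that the lock is now held.
                *guard.lock().unwrap() = Some(());
            }))
        } else {
            None
        }
    }
    fn iter_clone(&self) -> Self {
        GuardedLeaf {
            guard: self.guard.clone(),
        }
    }
}

// An adapter simply delegates to whatever it wraps, mirroring how
// Enumerate/Filter/Map forward lock_if_needed in the hunks above.
struct Enumerate2<I> {
    iter: I,
    cur_index: usize,
}

impl<I: InnerIterSketch> InnerIterSketch for Enumerate2<I> {
    fn lock_if_needed(&self) -> Option<LockFuture> {
        self.iter.lock_if_needed()
    }
    fn iter_clone(&self) -> Self {
        Enumerate2 {
            iter: self.iter.iter_clone(),
            cur_index: self.cur_index,
        }
    }
}

A consumer handle built over such an iterator would poll the returned lock future first (the new State::Lock arm added to DistIterForEachHandle, DistIterCountHandle, DistIterSumHandle, and the reduce/collect handles in this patch) and only afterwards move into its barrier and request states.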
diff --git a/src/array/iterator/distributed_iterator/skip.rs b/src/array/iterator/distributed_iterator/skip.rs index 281e323e..ca3aff9c 100644 --- a/src/array/iterator/distributed_iterator/skip.rs +++ b/src/array/iterator/distributed_iterator/skip.rs @@ -1,4 +1,4 @@ -use crate::array::iterator::distributed_iterator::*; +use crate::array::iterator::{distributed_iterator::*, IterLockFuture}; //skips the first n elements of iterator I per pe (this implys that n * num_pes elements are skipd in total) #[derive(Clone, Debug)] @@ -8,8 +8,11 @@ pub struct Skip { skip_index: usize, } -impl IterClone for Skip { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for Skip { +fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } + fn iter_clone(&self, _s: Sealed) -> Self { Skip { iter: self.iter.iter_clone(Sealed), count: self.count, @@ -38,8 +41,8 @@ where { type Item = ::Item; type Array = ::Array; - fn init(&self, in_start_i: usize, len: usize) -> Skip { - let mut iter = self.iter.init(in_start_i, len); + fn init(&self, in_start_i: usize, len: usize, _s: Sealed) -> Skip { + let mut iter = self.iter.init(in_start_i, len, _s); let mut skip_index = in_start_i; //now we need to see how many elements to skip diff --git a/src/array/iterator/distributed_iterator/step_by.rs b/src/array/iterator/distributed_iterator/step_by.rs index 1102b0e2..4692bb47 100644 --- a/src/array/iterator/distributed_iterator/step_by.rs +++ b/src/array/iterator/distributed_iterator/step_by.rs @@ -1,4 +1,4 @@ -use crate::array::iterator::distributed_iterator::*; +use crate::array::iterator::{distributed_iterator::*, IterLockFuture}; //skips the first n elements of iterator I per pe (this implys that n * num_pes elements are skipd in total) #[derive(Clone, Debug)] @@ -8,8 +8,11 @@ pub struct StepBy { add_one: usize, //if we dont align perfectly we will need to add 1 to our iteration index calculation } -impl IterClone for StepBy { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for StepBy { +fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } + fn iter_clone(&self, _s: Sealed) -> Self { StepBy { iter: self.iter.iter_clone(Sealed), step_size: self.step_size, @@ -38,10 +41,10 @@ where { type Item = ::Item; type Array = ::Array; - fn init(&self, in_start_i: usize, cnt: usize) -> StepBy { + fn init(&self, in_start_i: usize, cnt: usize, _s: Sealed) -> StepBy { let mut iter = self .iter - .init(in_start_i * self.step_size, cnt * self.step_size); + .init(in_start_i * self.step_size, cnt * self.step_size, _s); let mut offset_index = 0; // make sure we start from a valid step interval element diff --git a/src/array/iterator/distributed_iterator/take.rs b/src/array/iterator/distributed_iterator/take.rs index b4054a30..279a6d74 100644 --- a/src/array/iterator/distributed_iterator/take.rs +++ b/src/array/iterator/distributed_iterator/take.rs @@ -1,4 +1,4 @@ -use crate::array::iterator::distributed_iterator::*; +use crate::array::iterator::{distributed_iterator::*, IterLockFuture}; //skips the first n elements of iterator I per pe (this implys that n * num_pes elements are skipd in total) #[derive(Clone, Debug)] @@ -8,8 +8,11 @@ pub struct Take { cur_index: usize, } -impl IterClone for Take { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for Take { + fn lock_if_needed(&self, _s: Sealed) -> Option { + self.iter.lock_if_needed(_s) + } + fn iter_clone(&self, _s: Sealed) -> Self { Take { iter: self.iter.iter_clone(Sealed), count: self.count, @@ -57,9 +60,9 @@ where { type Item = ::Item; type Array = 
::Array; - fn init(&self, start_i: usize, len: usize) -> Take { + fn init(&self, start_i: usize, len: usize, _s: Sealed) -> Take { // println!("init take start_i: {:?} cnt: {:?} count: {:?}",start_i, cnt,self.count); - let val = Take::new(self.iter.init(start_i, len), self.count, start_i); + let val = Take::new(self.iter.init(start_i, len, _s), self.count, start_i); // println!("{:?} Take init {start_i} {len} {:?} {start_i}",std::thread::current().id(),self.count); val } diff --git a/src/array/iterator/distributed_iterator/zip.rs b/src/array/iterator/distributed_iterator/zip.rs index 47f2c1aa..1bb3bfc7 100644 --- a/src/array/iterator/distributed_iterator/zip.rs +++ b/src/array/iterator/distributed_iterator/zip.rs @@ -6,8 +6,11 @@ pub struct Zip { b: B, } -impl IterClone for Zip { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for Zip { +fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } + fn iter_clone(&self, _s: Sealed) -> Self { Zip { a: self.a.clone(), b: self.b.clone(), @@ -72,9 +75,9 @@ where ::Item, ); type Array = ::Array; - fn init(&self, start_i: usize, cnt: usize) -> Zip { + fn init(&self, start_i: usize, cnt: usize, _s: Sealed) -> Zip { // println!("init zip start_i: {:?} cnt {:?} end_i {:?}",start_i, cnt, start_i+cnt ); - Zip::new(self.a.init(start_i, cnt), self.b.init(start_i, cnt)) + Zip::new(self.a.init(start_i, cnt,_s), self.b.init(start_i, cnt,_s)) } fn array(&self) -> Self::Array { self.a.array() diff --git a/src/array/iterator/local_iterator.rs b/src/array/iterator/local_iterator.rs index cd719d13..50fff246 100644 --- a/src/array/iterator/local_iterator.rs +++ b/src/array/iterator/local_iterator.rs @@ -48,6 +48,8 @@ use std::marker::PhantomData; use std::pin::Pin; use std::sync::Arc; +use super::IterLockFuture; + macro_rules! consumer_impl { ($name:ident<$($generics:ident),*>($($arg:ident : $arg_ty:ty),*); [$($return_type: tt)*]; [$($bounds:tt)+] ) => { fn $name<$($generics),*>(&self, $($arg : $arg_ty),*) -> $($return_type)* @@ -147,7 +149,7 @@ pub trait LocalIteratorLauncher: InnerArray { /// The functions in this trait are available on all local iterators. /// Additonaly functionality can be found in the [IndexedLocalIterator] trait: /// these methods are only available for local iterators where the number of elements is known in advance (e.g. after invoking `filter` these methods would be unavailable) -pub trait LocalIterator: SyncSend + IterClone + 'static { +pub trait LocalIterator: SyncSend + InnerIter + 'static { /// The type of item this local iterator produces type Item: Send; @@ -157,7 +159,8 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { /// Internal method used to initalize this local iterator to the correct element and correct length. /// /// Because we know the number of elements of the array on each PE we can specify the index to start from. 
- fn init(&self, start_i: usize, cnt: usize) -> Self; + #[doc(hidden)] + fn init(&self, start_i: usize, cnt: usize, _s: Sealed) -> Self; /// Return the original array this local iterator belongs too fn array(&self) -> Self::Array; @@ -732,7 +735,7 @@ pub trait LocalIterator: SyncSend + IterClone + 'static { } /// An interface for dealing with local iterators which are indexable, meaning it returns an iterator of known length -pub trait IndexedLocalIterator: LocalIterator + SyncSend + IterClone + 'static { +pub trait IndexedLocalIterator: LocalIterator + SyncSend + InnerIter + 'static { /// yields the local (to the calling PE) index along with each element /// /// # Examples @@ -979,8 +982,11 @@ pub struct LocalIter<'a, T: Dist + 'static, A: LamellarArray> { _marker: PhantomData<&'a T>, } -impl<'a, T: Dist, A: LamellarArray> IterClone for LocalIter<'a, T, A> { - fn iter_clone(&self, _: Sealed) -> Self { +impl<'a, T: Dist, A: LamellarArray> InnerIter for LocalIter<'a, T, A> { + fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } + fn iter_clone(&self, _s: Sealed) -> Self { LocalIter { data: self.data.clone(), cur_i: self.cur_i, @@ -1021,7 +1027,7 @@ impl< { type Item = &'static T; type Array = A; - fn init(&self, start_i: usize, cnt: usize) -> Self { + fn init(&self, start_i: usize, cnt: usize, _s: Sealed) -> Self { let max_i = self.data.num_elems_local(); // println!("init local_iterator start_i: {:?} cnt {:?} end_i: {:?} max_i: {:?} {:?}",start_i,cnt, start_i+cnt,max_i,std::thread::current().id()); @@ -1095,8 +1101,11 @@ pub struct LocalIterMut<'a, T: Dist, A: LamellarArray> { _marker: PhantomData<&'a T>, } -impl<'a, T: Dist, A: LamellarArray> IterClone for LocalIterMut<'a, T, A> { - fn iter_clone(&self, _: Sealed) -> Self { +impl<'a, T: Dist, A: LamellarArray> InnerIter for LocalIterMut<'a, T, A> { + fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } + fn iter_clone(&self, _s: Sealed) -> Self { LocalIterMut { data: self.data.clone(), cur_i: self.cur_i, @@ -1136,7 +1145,7 @@ impl< { type Item = &'static mut T; type Array = A; - fn init(&self, start_i: usize, cnt: usize) -> Self { + fn init(&self, start_i: usize, cnt: usize, _s: Sealed) -> Self { let max_i = self.data.num_elems_local(); // println!("{:?} LocalIter init {start_i} {cnt} {} {}",std::thread::current().id(), start_i+cnt,max_i); LocalIterMut { diff --git a/src/array/iterator/local_iterator/chunks.rs b/src/array/iterator/local_iterator/chunks.rs index 0d5a07c3..7fcf0b27 100644 --- a/src/array/iterator/local_iterator/chunks.rs +++ b/src/array/iterator/local_iterator/chunks.rs @@ -1,4 +1,4 @@ -use crate::array::iterator::local_iterator::*; +use crate::array::iterator::{local_iterator::*, IterLockFuture}; #[derive(Clone, Debug)] pub struct Chunks { @@ -8,8 +8,11 @@ pub struct Chunks { chunk_size: usize, } -impl IterClone for Chunks { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for Chunks { + fn lock_if_needed(&self, _s: Sealed) -> Option { + self.iter.lock_if_needed(_s) + } + fn iter_clone(&self, _s: Sealed) -> Self { Chunks { iter: self.iter.iter_clone(Sealed), cur_i: self.cur_i, @@ -39,10 +42,13 @@ where { type Item = Chunk; type Array = ::Array; - fn init(&self, start_i: usize, cnt: usize) -> Chunks { + fn init(&self, start_i: usize, cnt: usize, _s: Sealed) -> Chunks { Chunks::new( - self.iter - .init(start_i * self.chunk_size, (start_i + cnt) * self.chunk_size), + self.iter.init( + start_i * self.chunk_size, + (start_i + cnt) * self.chunk_size, + _s, + ), start_i, cnt, self.chunk_size, @@ -54,7 
+60,10 @@ where fn next(&mut self) -> Option { if self.cur_i < self.end_i { let start_i = self.cur_i * self.chunk_size; - let iter = self.iter.iter_clone(Sealed).init(start_i, self.chunk_size); + let iter = self + .iter + .iter_clone(Sealed) + .init(start_i, self.chunk_size, Sealed); let chunk = Chunk { iter: iter }; self.cur_i += 1; Some(chunk) diff --git a/src/array/iterator/local_iterator/consumer.rs b/src/array/iterator/local_iterator/consumer.rs index 57bcec62..406017f1 100644 --- a/src/array/iterator/local_iterator/consumer.rs +++ b/src/array/iterator/local_iterator/consumer.rs @@ -9,268 +9,3 @@ pub(crate) use count::*; pub(crate) use for_each::*; pub(crate) use reduce::*; pub(crate) use sum::*; - -// use crate::active_messaging::LamellarArcLocalAm; -// use crate::lamellar_request::LamellarRequest; -// use crate::lamellar_team::LamellarTeamRT; -// use crate::array::iterator::local_iterator::{LocalIterator,IterRequest,Monotonic}; - -// use std::sync::Arc; -// use std::sync::atomic::{AtomicUsize,Ordering}; -// use std::pin::Pin; -// use parking_lot::Mutex; -// use rand::thread_rng; -// use rand::prelude::SliceRandom; - -// #[derive(Clone, Debug)] -// pub(crate) struct IterWorkStealer { -// pub(crate) range: Arc>, //start, end -// } - -// impl IterWorkStealer { -// fn set_range(&self, start: usize, end: usize) { -// let mut range = self.range.lock(); -// range.0 = start; -// range.1 = end; -// } - -// fn next(&self) -> Option { -// let mut range = self.range.lock(); -// let index = range.0; -// range.0 += 1; -// if range.0 <= range.1 { -// Some(index) -// } else { -// None -// } -// } -// fn set_done(&self) { -// let mut range = self.range.lock(); -// range.0 = range.1; -// } - -// fn steal(&self) -> Option<(usize, usize)> { -// let mut range = self.range.lock(); -// let start = range.0; -// let end = range.1; -// if end > start && end - start > 2 { -// let new_end = (start + end) / 2; -// range.1 = new_end; -// Some((new_end, end)) -// } else { -// None -// } -// } -// } - -// #[derive(Clone, Debug)] -// pub(crate) enum IterSchedule{ -// Static(usize,usize), -// Dynamic(Arc,usize), -// Chunk(Vec<(usize, usize)>, Arc,), -// WorkStealing(IterWorkStealer, Vec) -// } - -// impl IterSchedule { -// fn init_iter(&self, iter: I) -> IterScheduleIter { -// match self { -// IterSchedule::Static( start, end) => { -// IterScheduleIter::Static(iter.init(*start,end-start)) -// } -// IterSchedule::Dynamic(cur_i, max_i) => { -// IterScheduleIter::Dynamic(iter, cur_i.clone(), *max_i) -// } -// IterSchedule::Chunk(ranges, range_i) => { -// IterScheduleIter::Chunk(iter.init(0,0), ranges.clone(),range_i.clone()) -// } -// IterSchedule::WorkStealing( range, siblings) => { -// let (start, end) = *range.range.lock(); -// IterScheduleIter::WorkStealing(iter.init(start, end-start), range.clone(), siblings.clone()) -// } -// } -// } -// fn monotonic_iter(&self, iter: I) -> IterScheduleIter> { -// match self { -// IterSchedule::Static(start, end) => { -// IterScheduleIter::Static(iter.monotonic().init(*start,end-start)) -// } -// IterSchedule::Dynamic(cur_i, max_i) => { -// IterScheduleIter::Dynamic(iter.monotonic(), cur_i.clone(), *max_i) -// } -// IterSchedule::Chunk(ranges, range_i) => { -// IterScheduleIter::Chunk(iter.monotonic().init(0,0), ranges.clone(),range_i.clone()) -// } -// IterSchedule::WorkStealing(range, siblings) => { -// let (start, end) = *range.range.lock(); -// IterScheduleIter::WorkStealing(iter.monotonic().init(start, end-start), range.clone(), siblings.clone()) } -// } -// } -// } - -// 
pub(crate) enum IterScheduleIter{ -// Static(I), -// Dynamic(I,Arc,usize), -// Chunk(I,Vec<(usize, usize)>, Arc), -// WorkStealing(I,IterWorkStealer, Vec) -// } - -// impl Iterator for IterScheduleIter { -// type Item = I::Item; -// fn next(&mut self) -> Option { -// match self { -// IterScheduleIter::Static(iter) => { -// iter.next() -// } -// IterScheduleIter::Dynamic(iter, cur_i, max_i) => { -// let mut ci = cur_i.fetch_add(1, Ordering::Relaxed); -// while ci < *max_i { -// // println!("ci {:?} maxi {:?} {:?}", ci, *max_i, std::thread::current().id()); -// *iter = iter.init(ci,1); -// if let Some(elem) = iter.next() { -// return Some(elem); -// } -// ci = cur_i.fetch_add(1, Ordering::Relaxed); -// } -// None -// } -// IterScheduleIter::Chunk(iter, ranges, range_i) => { -// let mut next = iter.next(); -// // println!("next {:?} {:?}", next.is_none(), std::thread::current().id()); -// if next.is_none(){ -// let ri = range_i.fetch_add(1, Ordering::Relaxed); -// // println!("range {:?} {:?}", ri, std::thread::current().id()); -// if ri < ranges.len() { -// *iter = iter.init(ranges[ri].0, ranges[ri].1-ranges[ri].0); -// next = iter.next(); -// } -// } -// next -// } -// IterScheduleIter::WorkStealing(iter, range, siblings) => { -// let mut inner_next = |iter: &mut I| { -// while let Some(ri) = range.next(){ -// *iter = iter.init(ri,1); -// if let Some(elem) = iter.next() { -// return Some(elem); -// } -// // else{ -// // range.set_done(); -// // } -// } -// None -// }; -// let mut next = inner_next(iter); -// if next.is_none() { -// let mut rng = thread_rng(); -// let mut workers = (0..siblings.len()).collect::>(); -// workers.shuffle(&mut rng); -// if let Some(worker) = workers.pop() { -// if let Some((start, end)) = siblings[worker].steal() { -// *iter = iter.init(start, end - start); -// range.set_range(start, end); -// next = inner_next(iter); -// } -// } -// } -// next -// } -// } -// } -// } - -// pub(crate) trait IterConsumer{ -// type AmOutput; -// type Output; -// fn into_am(&self, schedule: IterSchedule) -> LamellarArcLocalAm; -// fn create_handle(self, team: Pin>, reqs: Vec>>) -> Box>; -// fn max_elems(&self, in_elems: usize) -> usize; -// } - -// // #[derive(Clone, Debug)] -// // pub(crate) enum IterConsumer{ -// // Collect(Distribution,PhantomData), -// // Count, -// // ForEach(F), -// // Reduce(R), -// // } - -// // impl IterConsumer where -// // I: LocalIterator + 'static, -// // I::Item: SyncSend, -// // A: From> + SyncSend, -// // T: Dist + ArrayOps -// // F: Fn(I::Item) + SyncSend + Clone + 'static, -// // R: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static,{ - -// // fn into_am(self, schedule: IterSchedule) -> Am -// // where -// // A: LamellarActiveMessage + LocalAM + 'static,{ -// // match self { -// // IterConsumer::Collect(_) => { -// // CollectAm{ -// // schedule -// // } -// // } -// // IterConsumer::Count => { -// // CountAm{ -// // schedule -// // } -// // } -// // IterConsumer::ForEach(op) => { -// // ForEachAm{ -// // op, -// // schedule, -// // } -// // } -// // IterConsumer::Reduce(op) => { -// // ReduceAm{ -// // op, -// // schedule, -// // } -// // } -// // } -// // } - -// // fn create_handle(self, team: Pin>, reqs: Vec>) -> IterConsumerHandle{ -// // match self { -// // IterConsumer::Collect(dist,phantom) => { -// // IterConsumerHandle::Collect(LocalIterCollectHandle{ -// // reqs: reqs, -// // distribution: dist, -// // team: team, -// // _phantom: phantom, -// // }) -// // } -// // IterConsumer::Count => { -// // 
IterConsumerHandle::Count(LocalIterCountHandle{ -// // reqs: reqs, -// // }) -// // } -// // IterConsumer::ForEach(_) => { -// // IterConsumerHandle::ForEach(LocalIterForEachHandle{ -// // reqs: reqs, -// // }) -// // } -// // IterConsumer::Reduce(op) => { -// // IterConsumerHandle::Reduce(LocalIterReduceHandle::{ -// // reqs:reqs, -// // op: op -// // }) -// // } -// // } -// // } -// // } - -// // pub(crate) enum IterConsumerHandle{ -// // Collect(LocalIterCollectHandle), -// // Count(LocalIterCountHandle), -// // ForEach(LocalIterForEachHandle), -// // Reduce(LocalIterReduceHandle) -// // } - -// // #[async_trait] -// // impl IterConsumerHandle where -// // A: From> + SyncSend, -// // T: Dist + ArrayOps -// // F: Fn(I::Item) + SyncSend + Clone + 'static, -// // R: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static,{ diff --git a/src/array/iterator/local_iterator/consumer/collect.rs b/src/array/iterator/local_iterator/consumer/collect.rs index 761f325d..8bdb385b 100644 --- a/src/array/iterator/local_iterator/consumer/collect.rs +++ b/src/array/iterator/local_iterator/consumer/collect.rs @@ -1,7 +1,7 @@ use crate::active_messaging::{LamellarArcLocalAm, SyncSend}; -use crate::array::iterator::consumer::*; use crate::array::iterator::local_iterator::{LocalIterator, Monotonic}; use crate::array::iterator::private::*; +use crate::array::iterator::{consumer::*, IterLockFuture}; use crate::array::operations::ArrayOps; use crate::array::r#unsafe::private::UnsafeArrayInner; use crate::array::{AsyncTeamFrom, AsyncTeamInto, Distribution, TeamInto}; @@ -27,8 +27,11 @@ pub(crate) struct Collect { pub(crate) _phantom: PhantomData, } -impl IterClone for Collect { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for Collect { + fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } + fn iter_clone(&self, _s: Sealed) -> Self { Collect { iter: self.iter.iter_clone(Sealed), distribution: self.distribution.clone(), @@ -49,7 +52,7 @@ where type Handle = InnerLocalIterCollectHandle; fn init(&self, start: usize, cnt: usize) -> Self { Collect { - iter: self.iter.init(start, cnt), + iter: self.iter.init(start, cnt, Sealed), distribution: self.distribution.clone(), _phantom: self._phantom.clone(), } @@ -89,8 +92,11 @@ pub(crate) struct CollectAsync { pub(crate) _phantom: PhantomData<(A, B)>, } -impl IterClone for CollectAsync { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for CollectAsync { + fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } + fn iter_clone(&self, _s: Sealed) -> Self { CollectAsync { iter: self.iter.iter_clone(Sealed), distribution: self.distribution.clone(), @@ -112,7 +118,7 @@ where type Handle = InnerLocalIterCollectHandle; fn init(&self, start: usize, cnt: usize) -> Self { CollectAsync { - iter: self.iter.init(start, cnt), + iter: self.iter.init(start, cnt, Sealed), distribution: self.distribution.clone(), _phantom: self._phantom.clone(), } @@ -266,13 +272,14 @@ where A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + 'static, { pub(crate) fn new( + lock: Option, inner: Pin> + Send>>, array: &UnsafeArrayInner, ) -> Self { Self { array: array.clone(), launched: false, - state: State::Init(inner), + state: State::Init(lock, inner), } } @@ -300,7 +307,10 @@ where #[pin_project(project = StateProj)] enum State { - Init(Pin> + Send>>), + Init( + Option, + Pin> + Send>>, + ), Reqs(#[pin] InnerLocalIterCollectHandle), Dropped, } @@ -314,7 +324,10 @@ where self.launched = true; let mut this = self.project(); match this.state.as_mut().project() { - 
StateProj::Init(inner) => { + StateProj::Init(lock, inner) => { + if let Some(lock) = lock { + ready!(lock.as_mut().poll(cx)); + } let mut inner = ready!(Future::poll(inner.as_mut(), cx)); match Pin::new(&mut inner).poll(cx) { Poll::Ready(val) => Poll::Ready(val), @@ -339,8 +352,11 @@ pub(crate) struct CollectAm { pub(crate) schedule: IterSchedule, } -impl IterClone for CollectAm { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for CollectAm { + fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } + fn iter_clone(&self, _s: Sealed) -> Self { CollectAm { iter: self.iter.iter_clone(Sealed), schedule: self.schedule.clone(), diff --git a/src/array/iterator/local_iterator/consumer/count.rs b/src/array/iterator/local_iterator/consumer/count.rs index 6fdc1275..5cee6bd2 100644 --- a/src/array/iterator/local_iterator/consumer/count.rs +++ b/src/array/iterator/local_iterator/consumer/count.rs @@ -1,6 +1,6 @@ use crate::active_messaging::LamellarArcLocalAm; use crate::array::iterator::local_iterator::LocalIterator; -use crate::array::iterator::{consumer::*, private::*}; +use crate::array::iterator::{consumer::*, private::*, IterLockFuture}; use crate::array::r#unsafe::private::UnsafeArrayInner; use crate::lamellar_request::LamellarRequest; use crate::lamellar_task_group::TaskGroupLocalAmHandle; @@ -20,8 +20,11 @@ pub(crate) struct Count { pub(crate) iter: I, } -impl IterClone for Count { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for Count { + fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } + fn iter_clone(&self, _s: Sealed) -> Self { Count { iter: self.iter.iter_clone(Sealed), } @@ -38,7 +41,7 @@ where type Handle = InnerLocalIterCountHandle; fn init(&self, start: usize, cnt: usize) -> Self { Count { - iter: self.iter.init(start, cnt), + iter: self.iter.init(start, cnt, Sealed), } } fn next(&mut self) -> Option { @@ -126,13 +129,14 @@ impl PinnedDrop for LocalIterCountHandle { impl LocalIterCountHandle { pub(crate) fn new( + lock: Option, inner: Pin + Send>>, array: &UnsafeArrayInner, ) -> Self { Self { array: array.clone(), launched: false, - state: State::Init(inner), + state: State::Init(lock, inner), } } @@ -160,7 +164,10 @@ impl LocalIterCountHandle { #[pin_project(project = StateProj)] enum State { - Init(Pin + Send>>), + Init( + Option, + Pin + Send>>, + ), Reqs(#[pin] InnerLocalIterCountHandle), Dropped, } @@ -170,7 +177,10 @@ impl Future for LocalIterCountHandle { self.launched = true; let mut this = self.project(); match this.state.as_mut().project() { - StateProj::Init(inner) => { + StateProj::Init(lock, inner) => { + if let Some(lock) = lock { + ready!(lock.as_mut().poll(cx)); + } let mut inner = ready!(Future::poll(inner.as_mut(), cx)); match Pin::new(&mut inner).poll(cx) { Poll::Ready(val) => Poll::Ready(val), @@ -195,11 +205,14 @@ pub(crate) struct CountAm { pub(crate) schedule: IterSchedule, } -impl IterClone for CountAm +impl InnerIter for CountAm where - I: IterClone, + I: InnerIter, { - fn iter_clone(&self, _: Sealed) -> Self { + fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } + fn iter_clone(&self, _s: Sealed) -> Self { CountAm { iter: self.iter.iter_clone(Sealed), schedule: self.schedule.clone(), diff --git a/src/array/iterator/local_iterator/consumer/for_each.rs b/src/array/iterator/local_iterator/consumer/for_each.rs index d0eae122..8900b2a0 100644 --- a/src/array/iterator/local_iterator/consumer/for_each.rs +++ b/src/array/iterator/local_iterator/consumer/for_each.rs @@ -1,7 +1,7 @@ use 
crate::active_messaging::{LamellarArcLocalAm, SyncSend}; -use crate::array::iterator::consumer::*; use crate::array::iterator::local_iterator::LocalIterator; use crate::array::iterator::private::*; +use crate::array::iterator::{consumer::*, IterLockFuture}; use crate::array::r#unsafe::private::UnsafeArrayInner; use crate::lamellar_request::LamellarRequest; use crate::lamellar_task_group::TaskGroupLocalAmHandle; @@ -26,12 +26,15 @@ where pub(crate) op: F, } -impl IterClone for ForEach +impl InnerIter for ForEach where I: LocalIterator + 'static, F: Fn(I::Item) + SyncSend + Clone + 'static, { - fn iter_clone(&self, _: Sealed) -> Self { + fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } + fn iter_clone(&self, _s: Sealed) -> Self { ForEach { iter: self.iter.iter_clone(Sealed), op: self.op.clone(), @@ -51,7 +54,7 @@ where fn init(&self, start: usize, cnt: usize) -> Self { // println!("ForEach before init start {:?} cnt {:?}", start,cnt); let iter = ForEach { - iter: self.iter.init(start, cnt), + iter: self.iter.init(start, cnt, Sealed), op: self.op.clone(), }; // println!("ForEach after init start {:?} cnt {:?}", start,cnt); @@ -94,13 +97,16 @@ where // pub(crate) _phantom: PhantomData, } -impl IterClone for ForEachAsync +impl InnerIter for ForEachAsync where I: LocalIterator + 'static, F: Fn(I::Item) -> Fut + SyncSend + Clone + 'static, Fut: Future + Send + 'static, { - fn iter_clone(&self, _: Sealed) -> Self { + fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } + fn iter_clone(&self, _s: Sealed) -> Self { ForEachAsync { iter: self.iter.iter_clone(Sealed), op: self.op.clone(), @@ -120,7 +126,7 @@ where type Handle = InnerLocalIterForEachHandle; fn init(&self, start: usize, cnt: usize) -> Self { ForEachAsync { - iter: self.iter.init(start, cnt), + iter: self.iter.init(start, cnt, Sealed), op: self.op.clone(), } } @@ -213,13 +219,14 @@ impl PinnedDrop for LocalIterForEachHandle { impl LocalIterForEachHandle { pub(crate) fn new( + lock: Option, reqs: Pin + Send>>, array: &UnsafeArrayInner, ) -> Self { LocalIterForEachHandle { array: array.clone(), launched: false, - state: State::Init(reqs), + state: State::Init(lock, reqs), } } @@ -247,7 +254,10 @@ impl LocalIterForEachHandle { #[pin_project(project = StateProj)] enum State { - Init(Pin + Send>>), + Init( + Option, + Pin + Send>>, + ), Reqs(#[pin] InnerLocalIterForEachHandle), Dropped, } @@ -257,7 +267,10 @@ impl Future for LocalIterForEachHandle { self.launched = true; let mut this = self.project(); match this.state.as_mut().project() { - StateProj::Init(inner) => { + StateProj::Init(lock, inner) => { + if let Some(lock) = lock { + ready!(lock.as_mut().poll(cx)); + } let mut inner = ready!(Future::poll(inner.as_mut(), cx)); match Pin::new(&mut inner).poll(cx) { Poll::Ready(()) => Poll::Ready(()), diff --git a/src/array/iterator/local_iterator/consumer/reduce.rs b/src/array/iterator/local_iterator/consumer/reduce.rs index 8f41f5ff..f812dd18 100644 --- a/src/array/iterator/local_iterator/consumer/reduce.rs +++ b/src/array/iterator/local_iterator/consumer/reduce.rs @@ -1,7 +1,7 @@ use crate::active_messaging::{LamellarArcLocalAm, SyncSend}; -use crate::array::iterator::consumer::*; use crate::array::iterator::local_iterator::LocalIterator; use crate::array::iterator::private::*; +use crate::array::iterator::{consumer::*, IterLockFuture}; use crate::array::r#unsafe::private::UnsafeArrayInner; use crate::lamellar_request::LamellarRequest; use crate::lamellar_task_group::TaskGroupLocalAmHandle; @@ -22,8 +22,11 @@ pub(crate) 
struct Reduce { pub(crate) op: F, } -impl IterClone for Reduce { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for Reduce { + fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } + fn iter_clone(&self, _s: Sealed) -> Self { Reduce { iter: self.iter.iter_clone(Sealed), op: self.op.clone(), @@ -43,7 +46,7 @@ where type Handle = InnerLocalIterReduceHandle; fn init(&self, start: usize, cnt: usize) -> Self { Reduce { - iter: self.iter.init(start, cnt), + iter: self.iter.init(start, cnt, Sealed), op: self.op.clone(), } } @@ -150,13 +153,14 @@ where F: Fn(T, T) -> T + SyncSend + Clone + 'static, { pub(crate) fn new( + lock: Option, reqs: Pin> + Send>>, array: &UnsafeArrayInner, ) -> Self { Self { array: array.clone(), launched: false, - state: State::Init(reqs), + state: State::Init(lock, reqs), } } @@ -186,7 +190,10 @@ where #[pin_project(project = StateProj)] enum State { - Init(Pin> + Send>>), + Init( + Option, + Pin> + Send>>, + ), Reqs(#[pin] InnerLocalIterReduceHandle), Dropped, } @@ -200,7 +207,10 @@ where self.launched = true; let mut this = self.project(); match this.state.as_mut().project() { - StateProj::Init(inner) => { + StateProj::Init(lock, inner) => { + if let Some(lock) = lock { + ready!(lock.as_mut().poll(cx)); + } let mut inner = ready!(Future::poll(inner.as_mut(), cx)); match Pin::new(&mut inner).poll(cx) { Poll::Ready(val) => Poll::Ready(val), @@ -226,8 +236,11 @@ pub(crate) struct ReduceAm { pub(crate) schedule: IterSchedule, } -impl IterClone for ReduceAm { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for ReduceAm { + fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } + fn iter_clone(&self, _s: Sealed) -> Self { ReduceAm { op: self.op.clone(), iter: self.iter.iter_clone(Sealed), diff --git a/src/array/iterator/local_iterator/consumer/sum.rs b/src/array/iterator/local_iterator/consumer/sum.rs index 6f5ed5a6..9857c525 100644 --- a/src/array/iterator/local_iterator/consumer/sum.rs +++ b/src/array/iterator/local_iterator/consumer/sum.rs @@ -1,7 +1,7 @@ use crate::active_messaging::{LamellarArcLocalAm, SyncSend}; -use crate::array::iterator::consumer::*; use crate::array::iterator::local_iterator::LocalIterator; use crate::array::iterator::private::*; +use crate::array::iterator::{consumer::*, IterLockFuture}; use crate::array::r#unsafe::private::UnsafeArrayInner; use crate::lamellar_request::LamellarRequest; use crate::lamellar_task_group::TaskGroupLocalAmHandle; @@ -21,8 +21,11 @@ pub(crate) struct Sum { pub(crate) iter: I, } -impl IterClone for Sum { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for Sum { + fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } + fn iter_clone(&self, _s: Sealed) -> Self { Sum { iter: self.iter.iter_clone(Sealed), } @@ -40,7 +43,7 @@ where type Handle = InnerLocalIterSumHandle; fn init(&self, start: usize, cnt: usize) -> Self { Sum { - iter: self.iter.init(start, cnt), + iter: self.iter.init(start, cnt, Sealed), } } fn next(&mut self) -> Option { @@ -115,34 +118,6 @@ where } } } - -//#[doc(hidden)] -// impl LamellarRequest for InnerLocalIterSumHandle -// where -// T: SyncSend + for<'a> std::iter::Sum<&'a T> + std::iter::Sum + 'static, -// { -// fn blocking_wait(mut self) -> Self::Output { -// self.reqs -// .drain(..) 
-// .map(|req| req.blocking_wait()) -// .sum::() -// } - -// fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { -// for req in self.reqs.iter_mut() { -// if !req.ready_or_set_waker(waker) { -// //only need to wait on the next unready req -// return false; -// } -// } -// true -// } - -// fn val(&self) -> Self::Output { -// self.reqs.iter().map(|req| req.val()).sum::() -// } -// } - #[pin_project(PinnedDrop)] pub struct LocalIterSumHandle { array: UnsafeArrayInner, @@ -169,13 +144,14 @@ where T: SyncSend + for<'a> std::iter::Sum<&'a T> + std::iter::Sum + 'static, { pub(crate) fn new( + lock: Option, inner: Pin> + Send>>, array: &UnsafeArrayInner, ) -> Self { Self { array: array.clone(), launched: false, - state: State::Init(inner), + state: State::Init(lock, inner), } } @@ -203,7 +179,10 @@ where #[pin_project(project = StateProj)] enum State { - Init(Pin> + Send>>), + Init( + Option, + Pin> + Send>>, + ), Reqs(#[pin] InnerLocalIterSumHandle), Dropped, } @@ -216,7 +195,10 @@ where self.launched = true; let mut this = self.project(); match this.state.as_mut().project() { - StateProj::Init(inner) => { + StateProj::Init(lock, inner) => { + if let Some(lock) = lock { + ready!(lock.as_mut().poll(cx)); + } let mut inner = ready!(Future::poll(inner.as_mut(), cx)); match Pin::new(&mut inner).poll(cx) { Poll::Ready(val) => Poll::Ready(val), @@ -235,50 +217,17 @@ where } } -//#[doc(hidden)] -// impl LamellarRequest for LocalIterSumHandle -// where -// T: SyncSend + for<'a> std::iter::Sum<&'a T> + std::iter::Sum + 'static, -// { -// fn blocking_wait(mut self) -> Self::Output { -// self.launched = true; -// let state = std::mem::replace(&mut self.state, State::Dropped); -// match state { -// State::Init(reqs) => self.team.block_on(reqs).blocking_wait(), -// State::Reqs(inner) => inner.blocking_wait(), -// State::Dropped => panic!("called `blocking_wait` on a future that was dropped"), -// } -// } -// fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { -// self.launched = true; -// match &mut self.state { -// State::Init(_) => { -// waker.wake_by_ref(); -// false -// } -// State::Reqs(inner) => inner.ready_or_set_waker(waker), -// State::Dropped => panic!("called `ready_or_set_waker` on a future that was dropped"), -// } -// } -// fn val(&self) -> Self::Output { -// match &self.state { -// State::Init(_reqs) => { -// unreachable!("should never be in init state when val is called"); -// } -// State::Reqs(inner) => inner.val(), -// State::Dropped => panic!("called `val` on a future that was dropped"), -// } -// } -// } - #[lamellar_impl::AmLocalDataRT(Clone)] pub(crate) struct SumAm { pub(crate) iter: Sum, pub(crate) schedule: IterSchedule, } -impl IterClone for SumAm { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for SumAm { + fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } + fn iter_clone(&self, _s: Sealed) -> Self { SumAm { iter: self.iter.iter_clone(Sealed), schedule: self.schedule.clone(), diff --git a/src/array/iterator/local_iterator/enumerate.rs b/src/array/iterator/local_iterator/enumerate.rs index ef100b84..5a5a29d0 100644 --- a/src/array/iterator/local_iterator/enumerate.rs +++ b/src/array/iterator/local_iterator/enumerate.rs @@ -1,4 +1,4 @@ -use crate::array::iterator::local_iterator::*; +use crate::array::iterator::{local_iterator::*, IterLockFuture}; #[derive(Clone, Debug)] pub struct Enumerate { @@ -6,8 +6,11 @@ pub struct Enumerate { cur_index: usize, } -impl IterClone for Enumerate { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for 
Enumerate { + fn lock_if_needed(&self, _s: Sealed) -> Option { + self.iter.lock_if_needed(_s) + } + fn iter_clone(&self, _s: Sealed) -> Self { Enumerate { iter: self.iter.iter_clone(Sealed), cur_index: self.cur_index, @@ -31,8 +34,8 @@ where { type Item = (usize, ::Item); type Array = ::Array; - fn init(&self, start_i: usize, cnt: usize) -> Enumerate { - let val = Enumerate::new(self.iter.init(start_i, cnt), start_i); + fn init(&self, start_i: usize, cnt: usize, _s: Sealed) -> Enumerate { + let val = Enumerate::new(self.iter.init(start_i, cnt, _s), start_i); // println!("{:?} Enumerate init {start_i} {cnt} {start_i}",std::thread::current().id()); val } diff --git a/src/array/iterator/local_iterator/filter.rs b/src/array/iterator/local_iterator/filter.rs index b13ae8f2..fa07d379 100644 --- a/src/array/iterator/local_iterator/filter.rs +++ b/src/array/iterator/local_iterator/filter.rs @@ -1,4 +1,4 @@ -use crate::array::iterator::local_iterator::*; +use crate::array::iterator::{local_iterator::*, IterLockFuture}; #[derive(Clone, Debug)] pub struct Filter { @@ -6,8 +6,11 @@ pub struct Filter { f: F, } -impl IterClone for Filter { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for Filter { + fn lock_if_needed(&self, _s: Sealed) -> Option { + self.iter.lock_if_needed(_s) + } + fn iter_clone(&self, _s: Sealed) -> Self { Filter { iter: self.iter.iter_clone(Sealed), f: self.f.clone(), @@ -32,9 +35,9 @@ where { type Item = I::Item; type Array = ::Array; - fn init(&self, start_i: usize, cnt: usize) -> Filter { + fn init(&self, start_i: usize, cnt: usize, _s: Sealed) -> Filter { // println!("{:?} Filter init before {start_i} {cnt}",std::thread::current().id()); - let val = Filter::new(self.iter.init(start_i, cnt), self.f.clone()); + let val = Filter::new(self.iter.init(start_i, cnt, _s), self.f.clone()); // println!("{:?} Filter init after {start_i} {cnt}",std::thread::current().id()); val diff --git a/src/array/iterator/local_iterator/filter_map.rs b/src/array/iterator/local_iterator/filter_map.rs index d8e8fb5f..1cbf04e4 100644 --- a/src/array/iterator/local_iterator/filter_map.rs +++ b/src/array/iterator/local_iterator/filter_map.rs @@ -1,4 +1,4 @@ -use crate::array::iterator::local_iterator::*; +use crate::array::iterator::{local_iterator::*, IterLockFuture}; #[derive(Clone, Debug)] pub struct FilterMap { @@ -6,8 +6,11 @@ pub struct FilterMap { f: F, } -impl IterClone for FilterMap { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for FilterMap { + fn lock_if_needed(&self, _s: Sealed) -> Option { + self.iter.lock_if_needed(_s) + } + fn iter_clone(&self, _s: Sealed) -> Self { FilterMap { iter: self.iter.iter_clone(Sealed), f: self.f.clone(), @@ -54,9 +57,9 @@ where { type Item = B; type Array = ::Array; - fn init(&self, start_i: usize, cnt: usize) -> FilterMap { + fn init(&self, start_i: usize, cnt: usize, _s: Sealed) -> FilterMap { // println!("init enumerate start_i: {:?} cnt {:?} end_i {:?}",start_i, cnt, start_i+cnt ); - FilterMap::new(self.iter.init(start_i, cnt), self.f.clone()) + FilterMap::new(self.iter.init(start_i, cnt, _s), self.f.clone()) } fn array(&self) -> Self::Array { self.iter.array() diff --git a/src/array/iterator/local_iterator/map.rs b/src/array/iterator/local_iterator/map.rs index 4f7e4b21..079ba86b 100644 --- a/src/array/iterator/local_iterator/map.rs +++ b/src/array/iterator/local_iterator/map.rs @@ -1,4 +1,4 @@ -use crate::array::iterator::local_iterator::*; +use crate::array::iterator::{local_iterator::*, IterLockFuture}; #[derive(Clone, Debug)] 
pub struct Map { @@ -6,8 +6,11 @@ pub struct Map { f: F, } -impl IterClone for Map { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for Map { +fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } + fn iter_clone(&self, _s: Sealed) -> Self { Map { iter: self.iter.iter_clone(Sealed), f: self.f.clone(), @@ -32,8 +35,8 @@ where { type Item = B; type Array = ::Array; - fn init(&self, start_i: usize, cnt: usize) -> Map { - Map::new(self.iter.init(start_i, cnt), self.f.clone()) + fn init(&self, start_i: usize, cnt: usize, _s: Sealed) -> Map { + Map::new(self.iter.init(start_i, cnt,_s), self.f.clone()) } fn array(&self) -> Self::Array { self.iter.array() @@ -74,8 +77,8 @@ where // { // type Item = B; // type Array = I::Array; -// fn init(&self, start_i: usize, cnt: usize) -> MapIndexed { -// MapIndexed::new(self.iter.init(start_i, cnt), self.f.clone()) +// fn init(&self, start_i: usize, cnt: usize, _s: Sealed) -> MapIndexed { +// MapIndexed::new(self.iter.init(start_i, cnt,_s), self.f.clone()) // } // fn array(&self) -> Self::Array { // self.iter.array() diff --git a/src/array/iterator/local_iterator/monotonic.rs b/src/array/iterator/local_iterator/monotonic.rs index 5415bd94..9f2ae917 100644 --- a/src/array/iterator/local_iterator/monotonic.rs +++ b/src/array/iterator/local_iterator/monotonic.rs @@ -1,4 +1,4 @@ -use crate::array::iterator::local_iterator::*; +use crate::array::iterator::{local_iterator::*, IterLockFuture}; #[derive(Clone, Debug)] pub struct Monotonic { @@ -6,8 +6,11 @@ pub struct Monotonic { cur_index: usize, } -impl IterClone for Monotonic { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for Monotonic { + fn lock_if_needed(&self, _s: Sealed) -> Option { + self.iter.lock_if_needed(_s) + } + fn iter_clone(&self, _s: Sealed) -> Self { Monotonic { iter: self.iter.iter_clone(Sealed), cur_index: self.cur_index, @@ -30,8 +33,8 @@ where { type Item = (usize, ::Item); type Array = ::Array; - fn init(&self, start_i: usize, cnt: usize) -> Monotonic { - let val = Monotonic::new(self.iter.init(start_i, cnt), start_i); + fn init(&self, start_i: usize, cnt: usize, _s: Sealed) -> Monotonic { + let val = Monotonic::new(self.iter.init(start_i, cnt, _s), start_i); // println!("{:?} Monotonic init {start_i} {cnt} {start_i}",std::thread::current().id()); val } diff --git a/src/array/iterator/local_iterator/skip.rs b/src/array/iterator/local_iterator/skip.rs index f1f32094..80d6aedc 100644 --- a/src/array/iterator/local_iterator/skip.rs +++ b/src/array/iterator/local_iterator/skip.rs @@ -1,4 +1,4 @@ -use crate::array::iterator::local_iterator::*; +use crate::array::iterator::{local_iterator::*, IterLockFuture}; //skips the first n elements of iterator I per pe (this implys that n * num_pes elements are skipd in total) #[derive(Clone, Debug)] @@ -8,8 +8,11 @@ pub struct Skip { skip_offset: usize, } -impl IterClone for Skip { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for Skip { + fn lock_if_needed(&self, _s: Sealed) -> Option { + self.iter.lock_if_needed(_s) + } + fn iter_clone(&self, _s: Sealed) -> Self { Skip { iter: self.iter.iter_clone(Sealed), skip_count: self.skip_count, @@ -38,8 +41,8 @@ where { type Item = ::Item; type Array = ::Array; - fn init(&self, in_start_i: usize, in_cnt: usize) -> Skip { - let mut iter = self.iter.init(in_start_i, in_cnt); + fn init(&self, in_start_i: usize, in_cnt: usize, _s: Sealed) -> Skip { + let mut iter = self.iter.init(in_start_i, in_cnt, _s); let start_i = std::cmp::max(in_start_i, self.skip_count); let advance 
= std::cmp::min(start_i - in_start_i, in_cnt); diff --git a/src/array/iterator/local_iterator/step_by.rs b/src/array/iterator/local_iterator/step_by.rs index 6d080dd7..c94867b8 100644 --- a/src/array/iterator/local_iterator/step_by.rs +++ b/src/array/iterator/local_iterator/step_by.rs @@ -1,4 +1,4 @@ -use crate::array::iterator::local_iterator::*; +use crate::array::iterator::{local_iterator::*, IterLockFuture}; //skips the first n elements of iterator I per pe (this implys that n * num_pes elements are skipd in total) #[derive(Clone, Debug)] @@ -8,8 +8,11 @@ pub struct StepBy { add_one: usize, //if we dont align perfectly we will need to add 1 to our iteration index calculation } -impl IterClone for StepBy { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for StepBy { + fn lock_if_needed(&self, _s: Sealed) -> Option { + self.iter.lock_if_needed(_s) + } + fn iter_clone(&self, _s: Sealed) -> Self { StepBy { iter: self.iter.iter_clone(Sealed), step_size: self.step_size, @@ -37,10 +40,10 @@ where { type Item = ::Item; type Array = ::Array; - fn init(&self, in_start_i: usize, cnt: usize) -> StepBy { + fn init(&self, in_start_i: usize, cnt: usize, _s: Sealed) -> StepBy { let mut iter = self .iter - .init(in_start_i * self.step_size, cnt * self.step_size); + .init(in_start_i * self.step_size, cnt * self.step_size, _s); let mut offset_index = 0; // make sure we start from a valid step interval element diff --git a/src/array/iterator/local_iterator/take.rs b/src/array/iterator/local_iterator/take.rs index 97de9e9e..8540c32d 100644 --- a/src/array/iterator/local_iterator/take.rs +++ b/src/array/iterator/local_iterator/take.rs @@ -1,4 +1,4 @@ -use crate::array::iterator::local_iterator::*; +use crate::array::iterator::{local_iterator::*, IterLockFuture}; //skips the first n elements of iterator I per pe (this implys that n * num_pes elements are skipd in total) #[derive(Clone, Debug)] @@ -7,8 +7,11 @@ pub struct Take { take_count: usize, } -impl IterClone for Take { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for Take { + fn lock_if_needed(&self, _s: Sealed) -> Option { + self.iter.lock_if_needed(_s) + } + fn iter_clone(&self, _s: Sealed) -> Self { Take { iter: self.iter.iter_clone(Sealed), take_count: self.take_count, @@ -31,11 +34,11 @@ where { type Item = ::Item; type Array = ::Array; - fn init(&self, start_i: usize, cnt: usize) -> Take { + fn init(&self, start_i: usize, cnt: usize, _s: Sealed) -> Take { let start_i = std::cmp::min(start_i, self.take_count); let end_i = std::cmp::min(start_i + cnt, self.take_count); let len = end_i - start_i; - Take::new(self.iter.init(start_i, len), self.take_count) + Take::new(self.iter.init(start_i, len, _s), self.take_count) } fn array(&self) -> Self::Array { self.iter.array() diff --git a/src/array/iterator/local_iterator/zip.rs b/src/array/iterator/local_iterator/zip.rs index 61cad7f3..e95d415b 100644 --- a/src/array/iterator/local_iterator/zip.rs +++ b/src/array/iterator/local_iterator/zip.rs @@ -1,4 +1,4 @@ -use crate::array::iterator::local_iterator::*; +use crate::array::iterator::{local_iterator::*, IterLockFuture}; #[derive(Clone, Debug)] pub struct Zip { @@ -6,8 +6,20 @@ pub struct Zip { b: B, } -impl IterClone for Zip { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for Zip { + fn lock_if_needed(&self, _s: Sealed) -> Option { + let futa = self.a.lock_if_needed(_s); + let futb = self.b.lock_if_needed(_s); + match (futa, futb) { + (None, None) => None, + (Some(futa), None) => Some(futa), + (None, Some(futb)) => 
Some(futb), + (Some(futa), Some(futb)) => Some(Box::pin(async move { + let _ = futures_util::future::join(futa, futb).await; + })), + } + } + fn iter_clone(&self, _s: Sealed) -> Self { Zip { a: self.a.iter_clone(Sealed), b: self.b.iter_clone(Sealed), @@ -32,9 +44,9 @@ where { type Item = (::Item, ::Item); type Array = ::Array; - fn init(&self, start_i: usize, cnt: usize) -> Zip { + fn init(&self, start_i: usize, cnt: usize, _s: Sealed) -> Zip { // println!("init zip start_i: {:?} cnt {:?} end_i {:?}",start_i, cnt, start_i+cnt ); - Zip::new(self.a.init(start_i, cnt), self.b.init(start_i, cnt)) + Zip::new(self.a.init(start_i, cnt, _s), self.b.init(start_i, cnt, _s)) } fn array(&self) -> Self::Array { self.a.array() diff --git a/src/array/iterator/mod.rs b/src/array/iterator/mod.rs index 57d59234..6ad0e92e 100644 --- a/src/array/iterator/mod.rs +++ b/src/array/iterator/mod.rs @@ -1,5 +1,7 @@ //! Provides various iterator types for LamellarArrays pub mod distributed_iterator; +use std::pin::Pin; + use distributed_iterator::DistributedIterator; pub mod local_iterator; use local_iterator::LocalIterator; @@ -17,10 +19,16 @@ use crate::memregion::Dist; // fn wait(self: Box) -> Self::Output; // } +pub(crate) type IterLockFuture = Pin + Send>>; pub(crate) mod private { + use super::IterLockFuture; + + #[derive(Debug, Clone, Copy)] pub struct Sealed; - pub trait IterClone: Sized { - fn iter_clone(&self, _: Sealed) -> Self; + + pub trait InnerIter: Sized { + fn lock_if_needed(&self, _s: Sealed) -> Option; + fn iter_clone(&self, _s: Sealed) -> Self; } } diff --git a/src/array/iterator/one_sided_iterator.rs b/src/array/iterator/one_sided_iterator.rs index daacc4bc..1a1e0642 100644 --- a/src/array/iterator/one_sided_iterator.rs +++ b/src/array/iterator/one_sided_iterator.rs @@ -474,7 +474,8 @@ impl<'a, T: Dist + 'static, A: LamellarArrayInternalGet + Clone + Send> // self.buf_0.len(), // self.array.len() // ); - let req = unsafe { self.array.internal_get(self.index, &self.buf_0) }; + let mut req = unsafe { self.array.internal_get(self.index, &self.buf_0) }; + req.launch(); self.state = State::Pending(req); } @@ -761,14 +762,16 @@ impl<'a, T: Dist + 'static, A: LamellarArrayInternalGet + Clone + Send> if self.index + self.buf_0.len() < self.array.len() { // potentially unsafe depending on the array type (i.e. UnsafeArray - which requries unsafe to construct an iterator), // but safe with respect to the buf_0 as we have consumed all its content and self is the only reference - let req = unsafe { self.array.internal_get(self.index, &self.buf_0) }; + let mut req = unsafe { self.array.internal_get(self.index, &self.buf_0) }; + req.launch(); self.state = State::Pending(req); } else { let sub_region = self.buf_0.sub_region(0..(self.array.len() - self.index)); // potentially unsafe depending on the array type (i.e. 
UnsafeArray - which requries unsafe to construct an iterator), // but safe with respect to the buf_0 as we have consumed all its content and self is the only reference // sub_region is set to the remaining size of the array so we will not have an out of bounds issue - let req = unsafe { self.array.internal_get(self.index, sub_region) }; + let mut req = unsafe { self.array.internal_get(self.index, sub_region) }; + req.launch(); self.state = State::Pending(req); } } diff --git a/src/array/iterator/one_sided_iterator/chunks.rs b/src/array/iterator/one_sided_iterator/chunks.rs index 0dac6bb2..40f53660 100644 --- a/src/array/iterator/one_sided_iterator/chunks.rs +++ b/src/array/iterator/one_sided_iterator/chunks.rs @@ -53,7 +53,8 @@ where array.team_rt().alloc_one_sided_mem_region(size); // potentially unsafe depending on the array type (i.e. UnsafeArray - which requries unsafe to construct an iterator), // but safe with respect to the mem_region as this is the only reference - let req = unsafe { array.internal_get(index, &mem_region) }; + let mut req = unsafe { array.internal_get(index, &mem_region) }; + req.launch(); (mem_region, req) } } diff --git a/src/array/local_lock_atomic.rs b/src/array/local_lock_atomic.rs index 891ee6e8..11a30585 100644 --- a/src/array/local_lock_atomic.rs +++ b/src/array/local_lock_atomic.rs @@ -980,7 +980,8 @@ impl LocalLockArrayReduceHandle { /// /// This function returns a handle that can be used to wait for the operation to complete #[must_use = "this function returns a future used to poll for completion and retrieve the result. Call '.await' on the future otherwise, if it is ignored (via ' let _ = *.spawn()') or dropped the only way to ensure completion is calling 'wait_all()' on the world or array. Alternatively it may be acceptable to call '.block()' instead of 'spawn()'"] - pub fn spawn(self) -> LamellarTask> { + pub fn spawn(mut self) -> LamellarTask> { + self.req.launch(); self.lock_guard.array.clone().spawn(self) } @@ -996,6 +997,9 @@ impl LocalLockArrayReduceHandle { } impl LamellarRequest for LocalLockArrayReduceHandle { + fn launch(&mut self) { + self.req.launch(); + } fn blocking_wait(self) -> Self::Output { self.req.blocking_wait() } diff --git a/src/array/local_lock_atomic/iteration.rs b/src/array/local_lock_atomic/iteration.rs index 4ac62d9d..d4942409 100644 --- a/src/array/local_lock_atomic/iteration.rs +++ b/src/array/local_lock_atomic/iteration.rs @@ -1,3 +1,5 @@ +use parking_lot::Mutex; + use crate::array::iterator::distributed_iterator::*; use crate::array::iterator::local_iterator::*; use crate::array::iterator::one_sided_iterator::OneSidedIter; @@ -6,9 +8,11 @@ use crate::array::local_lock_atomic::*; use crate::array::private::LamellarArrayPrivate; use crate::array::r#unsafe::private::UnsafeArrayInner; use crate::array::*; -use crate::darc::local_rw_darc::LocalRwDarcWriteGuard; +use crate::darc::local_rw_darc::{LocalRwDarcReadHandle, LocalRwDarcWriteGuard}; use crate::memregion::Dist; +use self::iterator::IterLockFuture; + impl InnerArray for LocalLockArray { fn as_inner(&self) -> &UnsafeArrayInner { &self.array.inner @@ -19,15 +23,33 @@ impl InnerArray for LocalLockArray { #[derive(Clone)] pub struct LocalLockDistIter<'a, T: Dist> { data: LocalLockArray, - // lock: Arc>, - lock: Arc>, + lock: Arc>>>, cur_i: usize, end_i: usize, _marker: PhantomData<&'a T>, } -impl<'a, T: Dist> IterClone for LocalLockDistIter<'a, T> { - fn iter_clone(&self, _: Sealed) -> Self { +impl<'a, T: Dist> InnerIter for LocalLockDistIter<'a, T> { + fn 
lock_if_needed(&self, _s: Sealed) -> Option { + // println!( + // " LocalLockDistIter lock_if_needed: {:?}", + // std::thread::current().id() + // ); + if self.lock.lock().is_none() { + // println!("LocalLockDistIter need to get read handle"); + let lock_handle = self.data.lock.read(); + let lock = self.lock.clone(); + + Some(Box::pin(async move { + // println!("LocalLockDistIter trying to get read handle"); + *lock.lock() = Some(lock_handle.await); + // println!("LocalLockDistIter got the read lock"); + })) + } else { + None + } + } + fn iter_clone(&self, _s: Sealed) -> Self { LocalLockDistIter { data: self.data.clone(), lock: self.lock.clone(), @@ -54,14 +76,33 @@ impl<'a, T: Dist> std::fmt::Debug for LocalLockDistIter<'a, T> { #[derive(Clone)] pub struct LocalLockLocalIter<'a, T: Dist> { data: LocalLockArray, - lock: Arc>, + lock: Arc>>>, cur_i: usize, end_i: usize, _marker: PhantomData<&'a T>, } -impl<'a, T: Dist> IterClone for LocalLockLocalIter<'a, T> { - fn iter_clone(&self, _: Sealed) -> Self { +impl<'a, T: Dist> InnerIter for LocalLockLocalIter<'a, T> { + fn lock_if_needed(&self, _s: Sealed) -> Option { + // println!( + // " LocalLockLocalIter lock_if_needed: {:?}", + // std::thread::current().id() + // ); + if self.lock.lock().is_none() { + // println!("LocalLockLocalIter need to get read handle"); + let lock_handle = self.data.lock.read(); + let lock = self.lock.clone(); + + Some(Box::pin(async move { + // println!("LocalLockLocalIter trying to get read handle"); + *lock.lock() = Some(lock_handle.await); + // println!("LocalLockLocalIter got the read lock"); + })) + } else { + None + } + } + fn iter_clone(&self, _s: Sealed) -> Self { LocalLockLocalIter { data: self.data.clone(), lock: self.lock.clone(), @@ -87,7 +128,7 @@ impl<'a, T: Dist> std::fmt::Debug for LocalLockLocalIter<'a, T> { impl DistributedIterator for LocalLockDistIter<'static, T> { type Item = &'static T; type Array = LocalLockArray; - fn init(&self, start_i: usize, cnt: usize) -> Self { + fn init(&self, start_i: usize, cnt: usize, _s: Sealed) -> Self { let max_i = self.data.num_elems_local(); // println!("init dist iter start_i: {:?} cnt {:?} end_i: {:?} max_i: {:?}",start_i,cnt, start_i+cnt,max_i); LocalLockDistIter { @@ -132,7 +173,7 @@ impl IndexedDistributedIterator for LocalLockDistIter<'static impl LocalIterator for LocalLockLocalIter<'static, T> { type Item = &'static T; type Array = LocalLockArray; - fn init(&self, start_i: usize, cnt: usize) -> Self { + fn init(&self, start_i: usize, cnt: usize, _s: Sealed) -> Self { let max_i = self.data.num_elems_local(); // println!("init dist iter start_i: {:?} cnt {:?} end_i: {:?} max_i: {:?}",start_i,cnt, start_i+cnt,max_i); LocalLockLocalIter { @@ -181,14 +222,33 @@ impl IndexedLocalIterator for LocalLockLocalIter<'static, T> pub struct LocalLockDistIterMut<'a, T: Dist> { data: LocalLockArray, - lock: Arc>, + lock: Arc>>>, cur_i: usize, end_i: usize, _marker: PhantomData<&'a T>, } -impl<'a, T: Dist> IterClone for LocalLockDistIterMut<'a, T> { - fn iter_clone(&self, _: Sealed) -> Self { +impl<'a, T: Dist> InnerIter for LocalLockDistIterMut<'a, T> { + fn lock_if_needed(&self, _s: Sealed) -> Option { + // println!( + // " LocalLockDistIterMut lock_if_needed: {:?}", + // std::thread::current().id() + // ); + if self.lock.lock().is_none() { + // println!("LocalLockDistIterMut need to get write handle"); + let lock_handle = self.data.lock.write(); + let lock = self.lock.clone(); + + Some(Box::pin(async move { + // println!("LocalLockDistIterMut trying to get write 
handle"); + *lock.lock() = Some(lock_handle.await); + // println!("LocalLockDistIterMut got the write lock"); + })) + } else { + None + } + } + fn iter_clone(&self, _s: Sealed) -> Self { LocalLockDistIterMut { data: self.data.clone(), lock: self.lock.clone(), @@ -213,14 +273,33 @@ impl<'a, T: Dist> std::fmt::Debug for LocalLockDistIterMut<'a, T> { pub struct LocalLockLocalIterMut<'a, T: Dist> { data: LocalLockArray, - lock: Arc>, + lock: Arc>>>, cur_i: usize, end_i: usize, _marker: PhantomData<&'a T>, } -impl<'a, T: Dist> IterClone for LocalLockLocalIterMut<'a, T> { - fn iter_clone(&self, _: Sealed) -> Self { +impl<'a, T: Dist> InnerIter for LocalLockLocalIterMut<'a, T> { + fn lock_if_needed(&self, _s: Sealed) -> Option { + // println!( + // " LocalLockLocalIterMut lock_if_needed: {:?}", + // std::thread::current().id() + // ); + if self.lock.lock().is_none() { + // println!("LocalLockLocalIterMut need to get write handle"); + let lock_handle = self.data.lock.write(); + let lock = self.lock.clone(); + + Some(Box::pin(async move { + // println!("LocalLockLocalIterMut trying to get write handle"); + *lock.lock() = Some(lock_handle.await); + // println!("LocalLockLocalIterMut got the write lock"); + })) + } else { + None + } + } + fn iter_clone(&self, _s: Sealed) -> Self { LocalLockLocalIterMut { data: self.data.clone(), lock: self.lock.clone(), @@ -246,7 +325,7 @@ impl<'a, T: Dist> std::fmt::Debug for LocalLockLocalIterMut<'a, T> { impl DistributedIterator for LocalLockDistIterMut<'static, T> { type Item = &'static mut T; type Array = LocalLockArray; - fn init(&self, start_i: usize, cnt: usize) -> Self { + fn init(&self, start_i: usize, cnt: usize, _s: Sealed) -> Self { let max_i = self.data.num_elems_local(); // println!("init dist iter start_i: {:?} cnt {:?} end_i: {:?} max_i: {:?}",start_i,cnt, start_i+cnt,max_i); LocalLockDistIterMut { @@ -295,7 +374,7 @@ impl IndexedDistributedIterator for LocalLockDistIterMut<'sta impl LocalIterator for LocalLockLocalIterMut<'static, T> { type Item = &'static mut T; type Array = LocalLockArray; - fn init(&self, start_i: usize, cnt: usize) -> Self { + fn init(&self, start_i: usize, cnt: usize, _s: Sealed) -> Self { let max_i = self.data.num_elems_local(); // println!("init dist iter start_i: {:?} cnt {:?} end_i: {:?} max_i: {:?}",start_i,cnt, start_i+cnt,max_i); LocalLockLocalIterMut { @@ -354,11 +433,11 @@ impl LamellarArrayIterators for LocalLockArray { // let the_array: LocalLockArray = self.clone(); // let lock: LocalRwDarc<()> = self.lock.clone(); // let lock = Arc::new(self.array.block_on(async move { lock.read().await })); - let lock = Arc::new(self.lock.read().block()); - self.barrier(); + // let lock = Arc::new(self.lock.read().block()); + // self.barrier(); LocalLockDistIter { data: self.clone(), - lock: lock, + lock: Arc::new(Mutex::new(None)), cur_i: 0, end_i: 0, _marker: PhantomData, @@ -368,10 +447,10 @@ impl LamellarArrayIterators for LocalLockArray { fn local_iter(&self) -> Self::LocalIter { // let lock: LocalRwDarc<()> = self.lock.clone(); // let lock = Arc::new(self.array.block_on(async move { lock.read().await })); - let lock = Arc::new(self.lock.read().block()); + // let lock = Arc::new(self.lock.read().block()); LocalLockLocalIter { data: self.clone(), - lock: lock, + lock: Arc::new(Mutex::new(None)), cur_i: 0, end_i: 0, _marker: PhantomData, @@ -398,12 +477,12 @@ impl LamellarArrayMutIterators for LocalLockArray { fn dist_iter_mut(&self) -> Self::DistIter { // let lock: LocalRwDarc<()> = self.lock.clone(); // let lock = 
Arc::new(self.array.block_on(async move { lock.write().await })); - let lock = Arc::new(self.lock.write().block()); + // let lock = Arc::new(self.lock.write().block()); // self.barrier(); // println!("dist_iter thread {:?} got lock",std::thread::current().id()); LocalLockDistIterMut { data: self.clone(), - lock: lock, + lock: Arc::new(Mutex::new(None)), cur_i: 0, end_i: 0, _marker: PhantomData, @@ -413,12 +492,13 @@ impl LamellarArrayMutIterators for LocalLockArray { fn local_iter_mut(&self) -> Self::LocalIter { // println!("trying to get write lock for iter"); // let lock: LocalRwDarc<()> = self.lock.clone(); - // let lock = Arc::new(self.array.block_on(async move { lock.write().await })); - let lock = Arc::new(self.lock.write().block()); + // // let lock = Arc::new(self.array.block_on(async move { lock.write().await })); + // let lock = Arc::new(self.lock.write().block()); + // println!("got write lock for iter"); LocalLockLocalIterMut { data: self.clone(), - lock: lock, + lock: Arc::new(Mutex::new(None)), cur_i: 0, end_i: 0, _marker: PhantomData, diff --git a/src/array/local_lock_atomic/local_chunks.rs b/src/array/local_lock_atomic/local_chunks.rs index 15830a95..fc7aafa5 100644 --- a/src/array/local_lock_atomic/local_chunks.rs +++ b/src/array/local_lock_atomic/local_chunks.rs @@ -7,6 +7,8 @@ use crate::memregion::Dist; use std::sync::Arc; +use self::iterator::IterLockFuture; + /// An iterator over immutable (nonoverlapping) local chunks (of size chunk_size) of a [LocalLockArray] /// This struct is created by awaiting or blocking on the handle returned by [LocalLockArray::read_local_chunks] #[derive(Clone)] @@ -18,8 +20,11 @@ pub struct LocalLockLocalChunks { pub(crate) lock_guard: Arc>, } -impl IterClone for LocalLockLocalChunks { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for LocalLockLocalChunks { + fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } + fn iter_clone(&self, _s: Sealed) -> Self { LocalLockLocalChunks { chunk_size: self.chunk_size, index: self.index, @@ -43,8 +48,11 @@ pub struct LocalLockLocalChunksMut { pub(crate) lock_guard: Arc>, } -impl IterClone for LocalLockLocalChunksMut { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for LocalLockLocalChunksMut { + fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } + fn iter_clone(&self, _s: Sealed) -> Self { LocalLockLocalChunksMut { chunk_size: self.chunk_size, index: self.index, @@ -84,7 +92,7 @@ impl DerefMut for LocalLockMutChunkLocalData<'_, T> { impl LocalIterator for LocalLockLocalChunks { type Item = LocalLockLocalData; type Array = LocalLockArray; - fn init(&self, start_i: usize, cnt: usize) -> Self { + fn init(&self, start_i: usize, cnt: usize, _s: Sealed) -> Self { //these are with respect to the single elements, not chunk indexing and cnt let end_i = std::cmp::min( (start_i + cnt) * self.chunk_size, @@ -150,7 +158,7 @@ impl IndexedLocalIterator for LocalLockLocalChunks { impl LocalIterator for LocalLockLocalChunksMut { type Item = LocalLockMutChunkLocalData<'static, T>; type Array = LocalLockArray; - fn init(&self, start_i: usize, cnt: usize) -> Self { + fn init(&self, start_i: usize, cnt: usize, _s: Sealed) -> Self { let end_i = std::cmp::min( (start_i + cnt) * self.chunk_size, self.array.num_elems_local(), diff --git a/src/array/local_lock_atomic/rdma.rs b/src/array/local_lock_atomic/rdma.rs index 796cb170..3b3cf9d1 100644 --- a/src/array/local_lock_atomic/rdma.rs +++ b/src/array/local_lock_atomic/rdma.rs @@ -125,7 +125,7 @@ impl LamellarAm for InitGetAm { 
start_index: self.index, len: self.buf.len(), }; - reqs.push(self.array.exec_am_pe_tg(pe, remote_am)); + reqs.push(self.array.spawn_am_pe_tg(pe, remote_am)); } unsafe { match self.array.array.inner.distribution { @@ -235,7 +235,7 @@ impl LamellarAm for InitPutAm { [cur_index..(cur_index + u8_buf_len)] .to_vec(), }; - reqs.push(self.array.exec_am_pe_tg(pe, remote_am)); + reqs.push(self.array.spawn_am_pe_tg(pe, remote_am)); cur_index += u8_buf_len; } else { panic!("this should not be possible"); @@ -288,7 +288,7 @@ impl LamellarAm for InitPutAm { len: self.buf.len(), data: vec, }; - reqs.push(self.array.exec_am_pe_tg(pe, remote_am)); + reqs.push(self.array.spawn_am_pe_tg(pe, remote_am)); } } } diff --git a/src/array/native_atomic/iteration.rs b/src/array/native_atomic/iteration.rs index 26306db8..85228558 100644 --- a/src/array/native_atomic/iteration.rs +++ b/src/array/native_atomic/iteration.rs @@ -2,13 +2,15 @@ use crate::array::iterator::distributed_iterator::*; use crate::array::iterator::local_iterator::*; use crate::array::iterator::one_sided_iterator::OneSidedIter; use crate::array::iterator::{ - private::{IterClone, Sealed}, + private::{InnerIter, Sealed}, LamellarArrayIterators, LamellarArrayMutIterators, }; use crate::array::native_atomic::*; use crate::array::r#unsafe::private::UnsafeArrayInner; use crate::array::*; use crate::memregion::Dist; + +use self::iterator::IterLockFuture; // use parking_lot::{ // lock_api::{RwLockReadGuardArc, RwLockWriteGuardArc}, // RawRwLock, @@ -28,8 +30,11 @@ pub struct NativeAtomicDistIter { end_i: usize, } -impl IterClone for NativeAtomicDistIter { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for NativeAtomicDistIter { + fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } + fn iter_clone(&self, _s: Sealed) -> Self { NativeAtomicDistIter { data: self.data.clone(), cur_i: self.cur_i, @@ -58,8 +63,11 @@ pub struct NativeAtomicLocalIter { end_i: usize, } -impl IterClone for NativeAtomicLocalIter { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for NativeAtomicLocalIter { + fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } + fn iter_clone(&self, _s: Sealed) -> Self { NativeAtomicLocalIter { data: self.data.clone(), cur_i: self.cur_i, @@ -83,7 +91,7 @@ impl std::fmt::Debug for NativeAtomicLocalIter { impl DistributedIterator for NativeAtomicDistIter { type Item = NativeAtomicElement; type Array = NativeAtomicArray; - fn init(&self, start_i: usize, cnt: usize) -> Self { + fn init(&self, start_i: usize, cnt: usize, _s: Sealed) -> Self { let max_i = self.data.num_elems_local(); // println!("init dist iter start_i: {:?} cnt {:?} end_i: {:?} max_i: {:?}",start_i,cnt, start_i+cnt,max_i); // println!("num_elems_local: {:?}",self.data.num_elems_local()); @@ -132,7 +140,7 @@ impl IndexedDistributedIterator for NativeAtomicDistIter { impl LocalIterator for NativeAtomicLocalIter { type Item = NativeAtomicElement; type Array = NativeAtomicArray; - fn init(&self, start_i: usize, cnt: usize) -> Self { + fn init(&self, start_i: usize, cnt: usize, _s: Sealed) -> Self { let max_i = self.data.num_elems_local(); // println!("init native_atomic start_i: {:?} cnt {:?} end_i: {:?} max_i: {:?} {:?}",start_i,cnt, start_i+cnt,max_i,std::thread::current().id()); NativeAtomicLocalIter { diff --git a/src/array/native_atomic/rdma.rs b/src/array/native_atomic/rdma.rs index e2dc2a98..84553794 100644 --- a/src/array/native_atomic/rdma.rs +++ b/src/array/native_atomic/rdma.rs @@ -121,7 +121,7 @@ impl LamellarAm for InitGetAm { start_index: 
self.index, len: self.buf.len(), }; - reqs.push(self.array.exec_am_pe_tg(pe, remote_am)); + reqs.push(self.array.spawn_am_pe_tg(pe, remote_am)); } unsafe { match self.array.array.inner.distribution { @@ -234,7 +234,7 @@ impl LamellarAm for InitPutAm { [cur_index..(cur_index + u8_buf_len)] .to_vec(), }; - reqs.push(self.array.exec_am_pe_tg(pe, remote_am)); + reqs.push(self.array.spawn_am_pe_tg(pe, remote_am)); cur_index += u8_buf_len; } else { panic!("this should not be possible"); @@ -287,7 +287,7 @@ impl LamellarAm for InitPutAm { len: self.buf.len(), data: vec, }; - reqs.push(self.array.exec_am_pe_tg(pe, remote_am)); + reqs.push(self.array.spawn_am_pe_tg(pe, remote_am)); } } } diff --git a/src/array/read_only/local_chunks.rs b/src/array/read_only/local_chunks.rs index e8f34bf3..0a881af9 100644 --- a/src/array/read_only/local_chunks.rs +++ b/src/array/read_only/local_chunks.rs @@ -4,6 +4,8 @@ use crate::array::read_only::*; use crate::array::LamellarArray; use crate::memregion::Dist; +use self::iterator::IterLockFuture; + /// An iterator over immutable (nonoverlapping) local chunks (of size chunk_size) of an [ReadOnlyArray] /// This struct is created by calling [ReadOnlyArray::local_chunks] #[derive(Clone)] @@ -14,8 +16,11 @@ pub struct ReadOnlyLocalChunks { array: ReadOnlyArray, } -impl IterClone for ReadOnlyLocalChunks { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for ReadOnlyLocalChunks { + fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } + fn iter_clone(&self, _s: Sealed) -> Self { ReadOnlyLocalChunks { chunk_size: self.chunk_size, index: self.index, @@ -28,7 +33,7 @@ impl IterClone for ReadOnlyLocalChunks { impl LocalIterator for ReadOnlyLocalChunks { type Item = &'static [T]; type Array = ReadOnlyArray; - fn init(&self, start_i: usize, cnt: usize) -> Self { + fn init(&self, start_i: usize, cnt: usize, _s: Sealed) -> Self { //these are with respect to the single elements, not chunk indexing and cnt let end_i = std::cmp::min( (start_i + cnt) * self.chunk_size, diff --git a/src/array/unsafe.rs b/src/array/unsafe.rs index 4c550946..67eff7cf 100644 --- a/src/array/unsafe.rs +++ b/src/array/unsafe.rs @@ -1441,7 +1441,7 @@ impl UnsafeArray { /// let req = array.local_iter().for_each(move |_| { /// let index = rand::thread_rng().gen_range(0..array_clone.len()); /// let _ = array_clone.add(index,1).spawn(); //randomly at one to an element in the array. - /// }); + /// }).spawn(); /// } /// array.wait_all(); /// array.barrier(); @@ -1479,7 +1479,7 @@ impl UnsafeArray { /// let req = array.local_iter().for_each(move |_| { /// let index = rand::thread_rng().gen_range(0..array_clone.len()); /// let _ = array_clone.add(index,1).spawn(); //randomly at one to an element in the array. - /// }); + /// }).spawn(); /// } /// array.wait_all(); /// array.barrier(); @@ -2146,7 +2146,7 @@ impl UnsafeArrayInner { } } - fn barrier_handle(&self) -> BarrierHandle { + pub(crate) fn barrier_handle(&self) -> BarrierHandle { self.data.team.barrier.barrier_handle() } } diff --git a/src/array/unsafe/iteration/distributed.rs b/src/array/unsafe/iteration/distributed.rs index 0f8c054b..d09ecd19 100644 --- a/src/array/unsafe/iteration/distributed.rs +++ b/src/array/unsafe/iteration/distributed.rs @@ -28,7 +28,7 @@ impl InnerArray for UnsafeArrayInner { impl DistIteratorLauncher for UnsafeArray {} macro_rules! 
consumer_impl { - ($name:ident<$($generics:ident),*>($($arg:ident : $arg_ty:ty),*); [$return_type:ident$(<$($ret_gen:ty),*>)?]; [$($bounds:tt)+]; [$($am:tt)*] ) => { + ($name:ident<$($generics:ident),*>($($arg:ident : $arg_ty:ty),*); [$return_type:ident$(<$($ret_gen:ty),*>)?]; [$($bounds:tt)+]; [$($am:tt)*]; [$($lock:tt)*] ) => { paste! { fn $name<$($generics),*>(&self, $($arg : $arg_ty),*) -> $return_type$(<$($ret_gen),*>)? where @@ -53,10 +53,8 @@ macro_rules! consumer_impl { self.data.team.world_counters.inc_send_req(1); self.data.task_group.counters.inc_send_req(1); - // self.data.team.scheduler.print_status(); - let barrier = self.barrier_handle(); - // let barrier_id = barrier.barrier_id; - // println!("barrier_id {:?} creating dist iter handle",barrier_id); + + let lock = $($lock)*; let inner = self.clone(); let reqs_future = Box::pin(async move{ @@ -68,10 +66,6 @@ macro_rules! consumer_impl { Schedule::Guided => inner.sched_guided(am), Schedule::WorkStealing => inner.sched_work_stealing(am), }; - // remove req counters after individual ams have been launched. - // inner.data.team.team_counters.outstanding_reqs.fetch_sub(1,Ordering::SeqCst); - // inner.data.team.world_counters.outstanding_reqs.fetch_sub(1,Ordering::SeqCst); - // inner.data.task_group.counters.outstanding_reqs.fetch_sub(1,Ordering::SeqCst); // increment launch counters to match req countersk inner.data.team.team_counters.inc_launched(1); @@ -80,7 +74,7 @@ macro_rules! consumer_impl { // println!("barrier id {:?} done with dist iter sched {:?} {:?} {:?}",barrier_id,inner.data.team.team_counters.outstanding_reqs.load(Ordering::SeqCst), inner.data.team.world_counters.outstanding_reqs.load(Ordering::SeqCst), inner.data.task_group.counters.outstanding_reqs.load(Ordering::SeqCst)); reqs }); - $return_type::new(barrier,reqs_future,self) + $return_type::new(lock,reqs_future,self) } } @@ -107,15 +101,17 @@ impl DistIteratorLauncher for UnsafeArrayInner { } consumer_impl!( - for_each(iter: &I, op: F); - [DistIterForEachHandle]; - [I: DistributedIterator + 'static, F: Fn(I::Item) + SyncSend + Clone + 'static]; - [ - ForEach { - iter: iter.iter_clone(Sealed), - op, - } - ]); + for_each(iter: &I, op: F); + [DistIterForEachHandle]; + [I: DistributedIterator + 'static, F: Fn(I::Item) + SyncSend + Clone + 'static]; + [ + ForEach { + iter: iter.iter_clone(Sealed), + op, + } + ]; + [iter.lock_if_needed(Sealed)] + ); consumer_impl!( for_each_async(iter: &I, op: F); @@ -126,62 +122,73 @@ impl DistIteratorLauncher for UnsafeArrayInner { iter: iter.iter_clone(Sealed), op, } - ] + ]; + [iter.lock_if_needed(Sealed)] ); consumer_impl!( - reduce( iter: &I, op: F); - [DistIterReduceHandle]; - [I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static]; - [ - Reduce { - iter: iter.iter_clone(Sealed), - op, - } - ]); + reduce( iter: &I, op: F); + [DistIterReduceHandle]; + [I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, F: Fn(I::Item, I::Item) -> I::Item + SyncSend + Clone + 'static]; + [ + Reduce { + iter: iter.iter_clone(Sealed), + op, + } + ]; + [iter.lock_if_needed(Sealed)] + ); consumer_impl!( - collect( iter: &I, d: Distribution); - [DistIterCollectHandle]; - [I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static,]; - [ - Collect { - iter: iter.iter_clone(Sealed).monotonic(), - distribution: d, - _phantom: PhantomData, - } - ]); + collect( iter: &I, d: Distribution); + 
[DistIterCollectHandle]; + [I: DistributedIterator + 'static, I::Item: Dist + ArrayOps, A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static,]; + [ + Collect { + iter: iter.iter_clone(Sealed).monotonic(), + distribution: d, + _phantom: PhantomData, + } + ]; + [iter.lock_if_needed(Sealed)] + ); consumer_impl!( - collect_async( iter: &I, d: Distribution); - [DistIterCollectHandle]; - [I: DistributedIterator + 'static, I::Item: Future + Send + 'static,B: Dist + ArrayOps,A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static,]; - [ - CollectAsync { - iter: iter.iter_clone(Sealed).monotonic(), - distribution: d, - _phantom: PhantomData, - } - ]); + collect_async( iter: &I, d: Distribution); + [DistIterCollectHandle]; + [I: DistributedIterator + 'static, I::Item: Future + Send + 'static,B: Dist + ArrayOps,A: AsyncTeamFrom<(Vec, Distribution)> + SyncSend + Clone + 'static,]; + [ + CollectAsync { + iter: iter.iter_clone(Sealed).monotonic(), + distribution: d, + _phantom: PhantomData, + } + ]; + [iter.lock_if_needed(Sealed)] + ); consumer_impl!( - count( iter: &I); - [DistIterCountHandle]; - [I: DistributedIterator + 'static ]; - [ - Count { - iter: iter.iter_clone(Sealed), - } - ]); + count( iter: &I); + [DistIterCountHandle]; + [I: DistributedIterator + 'static ]; + [ + Count { + iter: iter.iter_clone(Sealed), + } + ]; + [iter.lock_if_needed(Sealed)] + ); consumer_impl!( - sum(iter: &I); - [DistIterSumHandle]; - [I: DistributedIterator + 'static, I::Item: Dist + ArrayOps + std::iter::Sum, ]; - [ - Sum { - iter: iter.iter_clone(Sealed), - } - ]); + sum(iter: &I); + [DistIterSumHandle]; + [I: DistributedIterator + 'static, I::Item: Dist + ArrayOps + std::iter::Sum, ]; + [ + Sum { + iter: iter.iter_clone(Sealed), + } + ]; + [iter.lock_if_needed(Sealed)] + ); fn team(&self) -> Pin> { self.data.team.clone() diff --git a/src/array/unsafe/iteration/local.rs b/src/array/unsafe/iteration/local.rs index 73022949..fd722813 100644 --- a/src/array/unsafe/iteration/local.rs +++ b/src/array/unsafe/iteration/local.rs @@ -17,7 +17,7 @@ use std::sync::Arc; impl LocalIteratorLauncher for UnsafeArray {} macro_rules! consumer_impl { - ($name:ident<$($generics:ident),*>($($arg:ident : $arg_ty:ty),*); [$return_type:ident$(<$($ret_gen:ty),*>)?]; [$($bounds:tt)+]; [$($am:tt)*] ) => { + ($name:ident<$($generics:ident),*>($($arg:ident : $arg_ty:ty),*); [$return_type:ident$(<$($ret_gen:ty),*>)?]; [$($bounds:tt)+]; [$($am:tt)*]; [$($lock:tt)*] ) => { paste! { fn $name<$($generics),*>(&self, $($arg : $arg_ty),*) -> $return_type$(<$($ret_gen),*>)? where @@ -37,17 +37,25 @@ macro_rules! 
consumer_impl { $($bounds)+ { let am = $($am)*; + self.data.team.team_counters.inc_send_req(1); + self.data.team.world_counters.inc_send_req(1); + self.data.task_group.counters.inc_send_req(1); + let lock = $($lock)*; let inner = self.clone(); let reqs_future = Box::pin(async move{ - match sched { - Schedule::Static => inner.sched_static(am), - Schedule::Dynamic => inner.sched_dynamic(am), - Schedule::Chunk(size) => inner.sched_chunk(am,size), - Schedule::Guided => inner.sched_guided(am), - Schedule::WorkStealing => inner.sched_work_stealing(am), - } - }); - $return_type::new(reqs_future,self) + let reqs = match sched { + Schedule::Static => inner.sched_static(am), + Schedule::Dynamic => inner.sched_dynamic(am), + Schedule::Chunk(size) => inner.sched_chunk(am,size), + Schedule::Guided => inner.sched_guided(am), + Schedule::WorkStealing => inner.sched_work_stealing(am), + }; + inner.data.team.team_counters.inc_launched(1); + inner.data.team.world_counters.inc_launched(1); + inner.data.task_group.counters.inc_launched(1); + reqs + }); + $return_type::new(lock,reqs_future,self) } } }; @@ -80,7 +88,8 @@ impl LocalIteratorLauncher for UnsafeArrayInner { iter: iter.iter_clone(Sealed), op, } - ] + ]; + [iter.lock_if_needed(Sealed)] ); consumer_impl!( @@ -92,7 +101,8 @@ impl LocalIteratorLauncher for UnsafeArrayInner { iter: iter.iter_clone(Sealed), op, } - ] + ]; + [iter.lock_if_needed(Sealed)] ); consumer_impl!( @@ -104,7 +114,8 @@ impl LocalIteratorLauncher for UnsafeArrayInner { iter: iter.iter_clone(Sealed), op, } - ] + ]; + [iter.lock_if_needed(Sealed)] ); consumer_impl!( @@ -117,7 +128,8 @@ impl LocalIteratorLauncher for UnsafeArrayInner { distribution: d, _phantom: PhantomData, } - ] + ]; + [iter.lock_if_needed(Sealed)] ); consumer_impl!( @@ -130,7 +142,8 @@ impl LocalIteratorLauncher for UnsafeArrayInner { distribution: d, _phantom: PhantomData, } - ] + ]; + [iter.lock_if_needed(Sealed)] ); consumer_impl!( @@ -141,18 +154,21 @@ impl LocalIteratorLauncher for UnsafeArrayInner { Count { iter: iter.iter_clone(Sealed), } - ] + ]; + [iter.lock_if_needed(Sealed)] ); consumer_impl!( - sum(iter: &I); - [LocalIterSumHandle]; - [I: LocalIterator + 'static, I::Item: SyncSend + for<'a> std::iter::Sum<&'a I::Item> + std::iter::Sum , ]; - [ - Sum { - iter: iter.iter_clone(Sealed), - } - ]); + sum(iter: &I); + [LocalIterSumHandle]; + [I: LocalIterator + 'static, I::Item: SyncSend + for<'a> std::iter::Sum<&'a I::Item> + std::iter::Sum , ]; + [ + Sum { + iter: iter.iter_clone(Sealed), + } + ]; + [iter.lock_if_needed(Sealed)] + ); fn team(&self) -> Pin> { self.data.team.clone() diff --git a/src/array/unsafe/local_chunks.rs b/src/array/unsafe/local_chunks.rs index bcb4b8d4..eb2760aa 100644 --- a/src/array/unsafe/local_chunks.rs +++ b/src/array/unsafe/local_chunks.rs @@ -4,6 +4,8 @@ use crate::array::r#unsafe::*; use crate::array::LamellarArray; use crate::memregion::Dist; +use self::iterator::IterLockFuture; + /// An iterator over immutable (nonoverlapping) local chunks (of size chunk_size) of an [UnsafeArray] /// This struct is created by calling [UnsafeArray::local_chunks] #[derive(Clone)] @@ -14,8 +16,11 @@ pub struct UnsafeLocalChunks { array: UnsafeArray, } -impl IterClone for UnsafeLocalChunks { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for UnsafeLocalChunks { + fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } + fn iter_clone(&self, _s: Sealed) -> Self { UnsafeLocalChunks { chunk_size: self.chunk_size, index: self.index, @@ -35,8 +40,11 @@ pub struct UnsafeLocalChunksMut { 
array: UnsafeArray, } -impl IterClone for UnsafeLocalChunksMut { - fn iter_clone(&self, _: Sealed) -> Self { +impl InnerIter for UnsafeLocalChunksMut { + fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } + fn iter_clone(&self, _s: Sealed) -> Self { UnsafeLocalChunksMut { chunk_size: self.chunk_size, index: self.index, @@ -49,7 +57,7 @@ impl IterClone for UnsafeLocalChunksMut { impl LocalIterator for UnsafeLocalChunks { type Item = &'static [T]; type Array = UnsafeArray; - fn init(&self, start_i: usize, cnt: usize) -> Self { + fn init(&self, start_i: usize, cnt: usize, _s: Sealed) -> Self { //these are with respect to the single elements, not chunk indexing and cnt let end_i = std::cmp::min( (start_i + cnt) * self.chunk_size, @@ -112,7 +120,7 @@ impl IndexedLocalIterator for UnsafeLocalChunks { impl LocalIterator for UnsafeLocalChunksMut { type Item = &'static mut [T]; type Array = UnsafeArray; - fn init(&self, start_i: usize, cnt: usize) -> Self { + fn init(&self, start_i: usize, cnt: usize, _s: Sealed) -> Self { //these are with respect to the single elements, not chunk indexing and cnt let end_i = std::cmp::min( (start_i + cnt) * self.chunk_size, diff --git a/src/array/unsafe/rdma.rs b/src/array/unsafe/rdma.rs index b67aa44b..fcaf8dea 100644 --- a/src/array/unsafe/rdma.rs +++ b/src/array/unsafe/rdma.rs @@ -99,7 +99,7 @@ impl UnsafeArray { }, pe: self.inner.data.my_pe, }; - reqs.push_back(self.exec_am_pe_tg(pe, am)); + reqs.push_back(self.spawn_am_pe_tg(pe, am)); } else { let am = UnsafeSmallPutAm { array: self.clone().into(), @@ -113,7 +113,7 @@ impl UnsafeArray { .to_vec() }, }; - reqs.push_back(self.exec_am_pe_tg(pe, am)); + reqs.push_back(self.spawn_am_pe_tg(pe, am)); } } ArrayRdmaCmd::GetAm => { @@ -219,7 +219,7 @@ impl UnsafeArray { data: unsafe { temp_memreg.to_base::().into() }, pe: self.inner.data.my_pe, }; - reqs.push_back(self.exec_am_pe_tg(pe, am)); + reqs.push_back(self.spawn_am_pe_tg(pe, am)); } else { let am = UnsafeSmallPutAm { array: self.clone().into(), @@ -234,7 +234,7 @@ impl UnsafeArray { .to_vec() }, }; - reqs.push_back(self.exec_am_pe_tg(pe, am)); + reqs.push_back(self.spawn_am_pe_tg(pe, am)); } if pe + 1 == num_pes { overflow += 1; @@ -1088,7 +1088,7 @@ impl LamellarAm for InitSmallGetAm { start_index: self.index, len: self.buf.len(), }; - reqs.push(self.array.exec_am_pe_tg(pe, remote_am)); + reqs.push(self.array.spawn_am_pe_tg(pe, remote_am)); } unsafe { match self.array.inner.distribution { diff --git a/src/barrier.rs b/src/barrier.rs index f9dcdf81..b418c42f 100644 --- a/src/barrier.rs +++ b/src/barrier.rs @@ -498,6 +498,9 @@ impl Future for BarrierHandle { } impl LamellarRequest for BarrierHandle { + fn launch(&mut self) { + self.launched = true; + } fn blocking_wait(mut self) -> Self::Output { self.launched = true; match self.state { diff --git a/src/darc.rs b/src/darc.rs index 1848ec07..8ce00a11 100644 --- a/src/darc.rs +++ b/src/darc.rs @@ -54,6 +54,7 @@ use std::pin::Pin; use std::ptr::NonNull; use std::sync::atomic::{AtomicBool, AtomicU8, AtomicUsize, Ordering}; use std::sync::Arc; +use std::time::Instant; // use std::time::Instant; // //use tracing::*; @@ -62,10 +63,12 @@ use crate::active_messaging::{AMCounters, RemotePtr}; use crate::barrier::Barrier; use crate::env_var::config; use crate::lamellae::{AllocationType, Backend, LamellaeComm, LamellaeRDMA}; +use crate::lamellar_request::LamellarRequest; use crate::lamellar_team::{IntoLamellarTeam, LamellarTeamRT}; use crate::lamellar_world::LAMELLAES; use crate::scheduler::LamellarTask; -use 
crate::{IdError, LamellarEnv, LamellarTeam}; +use crate::warnings::RuntimeWarning; +use crate::{IdError, LamellarEnv, LamellarTeam, TypedAmGroupResult}; /// prelude for the darc module pub mod prelude; @@ -139,7 +142,7 @@ pub struct DarcInner { am_counters: *const AMCounters, team: *const LamellarTeamRT, item: *const T, - drop: Option, + drop: Option bool>, valid: AtomicBool, } unsafe impl Send for DarcInner {} //we cant create DarcInners without going through the Darc interface which enforces Sync+Send @@ -429,7 +432,7 @@ impl DarcInner { // ); // println!("[{:?}] {:?}", std::thread::current().id(), self); reqs.push( - team.exec_am_pe_tg( + team.spawn_am_pe_tg( pe, FinishedAm { cnt: cnt, @@ -545,6 +548,7 @@ impl DarcInner { let mode_refs = unsafe { std::slice::from_raw_parts_mut(inner.mode_addr as *mut u8, inner.num_pes) }; let orig_state = mode_refs[inner.my_pe]; + inner.await_all().await; if team.num_pes() == 1 { while inner.local_cnt.load(Ordering::SeqCst) > 1 + extra_cnt { async_std::task::yield_now().await; @@ -884,30 +888,101 @@ impl DarcInner { // self.debug_print(); } - // fn wait_all(&self) { - // let mut temp_now = Instant::now(); - // // let mut first = true; - // let team = self.team(); - // // team.flush(); - // let am_counters = self.am_counters(); - // while am_counters.outstanding_reqs.load(Ordering::SeqCst) > 0 { - // // std::thread::yield_now(); - // team.scheduler.exec_task(); //mmight as well do useful work while we wait - // if temp_now.elapsed().as_secs_f64() > config().deadlock_timeout { - // //|| first{ - // // println!( - // // "[{:?}] in darc wait_all mype: {:?} cnt: {:?} {:?}", - // // std::thread::current().id(), - // // team.world_pe, - // // am_counters.send_req_cnt.load(Ordering::SeqCst), - // // am_counters.outstanding_reqs.load(Ordering::SeqCst), - // // ); - // temp_now = Instant::now(); - // // first = false; - // } - // } - // // println!("done in wait all {:?}",std::time::SystemTime::now()); - // } + pub(crate) fn wait_all(&self) { + // println!("wait_all called on pe: {}", self.world_pe); + + RuntimeWarning::BlockingCall("wait_all", "await_all().await").print(); + let am_counters = self.am_counters(); + + let mut temp_now = Instant::now(); + let mut orig_reqs = am_counters.send_req_cnt.load(Ordering::SeqCst); + let mut orig_launched = am_counters.launched_req_cnt.load(Ordering::SeqCst); + + // println!( + // "in team wait_all mype: {:?} cnt: {:?} {:?}", + // self.world_pe, + // self.am_counters.send_req_cnt.load(Ordering::SeqCst), + // self.am_counters.outstanding_reqs.load(Ordering::SeqCst), + // ); + while self.team().panic.load(Ordering::SeqCst) == 0 + && (am_counters.outstanding_reqs.load(Ordering::SeqCst) > 0 + || orig_reqs != am_counters.send_req_cnt.load(Ordering::SeqCst) + || orig_launched != am_counters.launched_req_cnt.load(Ordering::SeqCst)) + { + orig_reqs = am_counters.send_req_cnt.load(Ordering::SeqCst); + orig_launched = am_counters.launched_req_cnt.load(Ordering::SeqCst); + // std::thread::yield_now(); + // self.flush(); + if std::thread::current().id() != *crate::MAIN_THREAD { + self.team().scheduler.exec_task() + }; //mmight as well do useful work while we wait } + if temp_now.elapsed().as_secs_f64() > config().deadlock_timeout { + println!( + "in team wait_all mype: {:?} cnt: {:?} {:?}", + self.team().world_pe, + am_counters.send_req_cnt.load(Ordering::SeqCst), + am_counters.outstanding_reqs.load(Ordering::SeqCst), + ); + temp_now = Instant::now(); + } + } + if am_counters.send_req_cnt.load(Ordering::SeqCst) + != 
am_counters.launched_req_cnt.load(Ordering::SeqCst) + { + println!( + "in team wait_all mype: {:?} cnt: {:?} {:?} {:?}", + self.team().world_pe, + am_counters.send_req_cnt.load(Ordering::SeqCst), + am_counters.outstanding_reqs.load(Ordering::SeqCst), + am_counters.launched_req_cnt.load(Ordering::SeqCst) + ); + RuntimeWarning::UnspawnedTask( + "`wait_all` before all tasks/active messages have been spawned", + ) + .print(); + } + // println!( + // "in team wait_all mype: {:?} cnt: {:?} {:?}", + // self.world_pe, + // self.am_counters.send_req_cnt.load(Ordering::SeqCst), + // self.am_counters.outstanding_reqs.load(Ordering::SeqCst), + // ); + } + pub(crate) async fn await_all(&self) { + let mut temp_now = Instant::now(); + let am_counters = self.am_counters(); + while self.team().panic.load(Ordering::SeqCst) == 0 + && (am_counters.outstanding_reqs.load(Ordering::SeqCst) > 0) + { + // std::thread::yield_now(); + // self.flush(); + async_std::task::yield_now().await; + if temp_now.elapsed().as_secs_f64() > config().deadlock_timeout { + println!( + "in team wait_all mype: {:?} cnt: {:?} {:?}", + self.team().world_pe, + am_counters.send_req_cnt.load(Ordering::SeqCst), + am_counters.outstanding_reqs.load(Ordering::SeqCst), + ); + temp_now = Instant::now(); + } + } + if am_counters.send_req_cnt.load(Ordering::SeqCst) + != am_counters.launched_req_cnt.load(Ordering::SeqCst) + { + println!( + "in team wait_all mype: {:?} cnt: {:?} {:?} {:?}", + self.team().world_pe, + am_counters.send_req_cnt.load(Ordering::SeqCst), + am_counters.outstanding_reqs.load(Ordering::SeqCst), + am_counters.launched_req_cnt.load(Ordering::SeqCst) + ); + RuntimeWarning::UnspawnedTask( + "`await_all` before all tasks/active messages have been spawned", + ) + .print(); + } + } } impl fmt::Debug for DarcInner { @@ -1089,7 +1164,7 @@ impl Darc { team: U, item: T, state: DarcMode, - drop: Option, + drop: Option bool>, ) -> Result, IdError> { let team_rt = team.into().team.clone(); let my_pe = team_rt.team_pe?; @@ -1231,7 +1306,7 @@ impl Darc { team: U, item: T, state: DarcMode, - drop: Option, + drop: Option bool>, ) -> Result, IdError> { let team_rt = team.into().team.clone(); let my_pe = team_rt.team_pe?; @@ -1529,16 +1604,15 @@ macro_rules! 
launch_drop { ); } // team.print_cnt(); - let _ = team - .exec_am_local(DroppedWaitAM { - inner_addr: $inner_addr as *const u8 as usize, - mode_addr: $inner.mode_addr, - my_pe: $inner.my_pe, - num_pes: $inner.num_pes, - team: team.clone(), - phantom: PhantomData::, - }) - .spawn(); + let mut am = team.exec_am_local(DroppedWaitAM { + inner_addr: $inner_addr as *const u8 as usize, + mode_addr: $inner.mode_addr, + my_pe: $inner.my_pe, + num_pes: $inner.num_pes, + team: team.clone(), + phantom: PhantomData::, + }); + am.launch(); }; } @@ -1751,15 +1825,19 @@ impl LamellarAM for DroppedWaitAM { } async_std::task::yield_now().await; } - { - let mut _item = Box::from_raw(wrapped.item as *mut T); - if let Some(my_drop) = wrapped.drop { - // println!("Dropping darc {:x}", self.inner_addr); - my_drop(&mut _item); - } else { - // println!("no drop function for item {:x}", self.inner_addr); + + // println!("going to drop object"); + + if let Some(my_drop) = wrapped.drop { + let mut dropped_done = false; + while !dropped_done { + dropped_done = my_drop(&mut *(wrapped.item as *mut T)); + async_std::task::yield_now().await; } } + let _ = Box::from_raw(wrapped.item as *mut T); + // println!("afterdrop object"); + while wrapped.weak_local_cnt.load(Ordering::SeqCst) != 0 { //we can't actually free the darc memory until all weak pointers are gone too async_std::task::yield_now().await; @@ -1771,8 +1849,9 @@ impl LamellarAM for DroppedWaitAM { let _barrier = Box::from_raw(wrapped.barrier); self.team.lamellae.free(self.inner_addr); // println!( - // "[{:?}]leaving DroppedWaitAM {:x}", + // "[{:?}]leaving DroppedWaitAM {:?} {:x}", // std::thread::current().id(), + // self, // self.inner_addr // ); } diff --git a/src/darc/global_rw_darc.rs b/src/darc/global_rw_darc.rs index a6074f4f..e39d28da 100644 --- a/src/darc/global_rw_darc.rs +++ b/src/darc/global_rw_darc.rs @@ -9,6 +9,7 @@ use std::sync::Arc; use crate::active_messaging::RemotePtr; use crate::darc::{Darc, DarcInner, DarcMode, WrappedInner, __NetworkDarc}; use crate::lamellae::LamellaeRDMA; +use crate::lamellar_request::LamellarRequest; use crate::lamellar_team::{IntoLamellarTeam, LamellarTeamRT}; use crate::{IdError, LamellarEnv, LamellarTeam}; @@ -62,8 +63,19 @@ impl DistRwLock { pub(crate) fn into_inner(self) -> T { self.data.into_inner() } -} -impl DistRwLock { + + pub(crate) fn dirty_num_locks(&self) -> usize { + let mut locks = 0; + locks += self.readers.load(Ordering::SeqCst); + if self.writer.load(Ordering::SeqCst) != self.team.num_pes { + locks += 1; + } + if self.collective_writer.load(Ordering::SeqCst) != self.team.num_pes { + locks += 1; + } + locks + } + async fn async_reader_lock(&self, _pe: usize) { loop { while self.writer.load(Ordering::SeqCst) != self.team.num_pes { @@ -119,7 +131,7 @@ impl DistRwLock { } async fn async_collective_writer_lock(&self, pe: usize, collective_cnt: usize) { - println!("{:?} collective writer lock {:?}", pe, collective_cnt); + // println!("{:?} collective writer lock {:?}", pe, collective_cnt); // first lets set the normal writer lock, but will set it to a unique id all the PEs should have (it is initialized to num_pes+1 and is incremented by one after each lock) if pe == 0 { self.async_writer_lock(collective_cnt).await; @@ -236,7 +248,9 @@ impl LamellarAM for LockAm { async fn exec() { // println!("In lock am {:?}", self); // let lock = { - let rwlock = unsafe { &*(self.rwlock_addr as *mut DarcInner>) }.item(); //we dont actually care about the "type" we wrap here, we just need access to the meta data for the 
darc + let inner = unsafe { &*(self.rwlock_addr as *mut DarcInner>) }; + // inner.deserialize_update_cnts(self.orig_pe); + let rwlock = inner.item(); //we dont actually care about the "type" we wrap here, we just need access to the meta data for the darc match self.lock_type { LockType::Read => { rwlock.async_reader_lock(self.orig_pe).await; @@ -263,7 +277,10 @@ struct UnlockAm { impl LamellarAM for UnlockAm { async fn exec() { // println!("In unlock am {:?}", self); - let rwlock = unsafe { &*(self.rwlock_addr as *mut DarcInner>) }.item(); //we dont actually care about the "type" we wrap here, we just need access to the meta data for the darc + let inner = unsafe { &*(self.rwlock_addr as *mut DarcInner>) }; //we dont actually care about the "type" we wrap here, we just need access to the meta data for the darc + // inner.deserialize_update_cnts(self.orig_pe); + let rwlock = inner.item(); + unsafe { match self.lock_type { LockType::Read => rwlock.reader_unlock(self.orig_pe), @@ -311,17 +328,17 @@ impl Drop for GlobalRwDarcReadGuard { 0, inner as *const DarcInner> as *const () as usize, ); - let _ = team - .exec_am_pe_tg( - 0, - UnlockAm { - rwlock_addr: remote_rwlock_addr, - orig_pe: team.team_pe.expect("darcs cant exist on non team members"), - lock_type: LockType::Read, - }, - Some(inner.am_counters()), - ) - .spawn(); + let mut am = team.spawn_am_pe_tg( + 0, + UnlockAm { + rwlock_addr: remote_rwlock_addr, + orig_pe: team.team_pe.expect("darcs cant exist on non team members"), + lock_type: LockType::Read, + }, + Some(inner.am_counters()), + ); + // am.launch(); + // inner.serialize_update_cnts(1); } } } @@ -362,17 +379,17 @@ impl Drop for GlobalRwDarcWriteGuard { 0, inner as *const DarcInner> as *const () as usize, ); - let _ = team - .exec_am_pe_tg( - 0, - UnlockAm { - rwlock_addr: remote_rwlock_addr, - orig_pe: team.team_pe.expect("darcs cant exist on non team members"), - lock_type: LockType::Write, - }, - Some(inner.am_counters()), - ) - .spawn(); + let mut am = team.spawn_am_pe_tg( + 0, + UnlockAm { + rwlock_addr: remote_rwlock_addr, + orig_pe: team.team_pe.expect("darcs cant exist on non team members"), + lock_type: LockType::Write, + }, + Some(inner.am_counters()), + ); + // am.launch(); + // inner.serialize_update_cnts(1); } } @@ -412,17 +429,17 @@ impl Drop for GlobalRwDarcCollectiveWriteGuard { 0, inner as *const DarcInner> as *const () as usize, ); - let _ = team - .exec_am_pe_tg( - 0, - UnlockAm { - rwlock_addr: remote_rwlock_addr, - orig_pe: team.team_pe.expect("darcs cant exist on non team members"), - lock_type: LockType::CollectiveWrite(self.collective_cnt), - }, - Some(inner.am_counters()), - ) - .spawn(); + let mut am = team.spawn_am_pe_tg( + 0, + UnlockAm { + rwlock_addr: remote_rwlock_addr, + orig_pe: team.team_pe.expect("darcs cant exist on non team members"), + lock_type: LockType::CollectiveWrite(self.collective_cnt), + }, + Some(inner.am_counters()), + ); + // am.launch(); + // inner.serialize_update_cnts(1); } } @@ -746,13 +763,17 @@ impl GlobalRwDarc { item: T, ) -> Result, IdError> { Ok(GlobalRwDarc { - darc: Darc::try_new( + darc: Darc::try_new_with_drop( team.clone(), DistRwLock::new(item, team), DarcMode::GlobalRw, + Some(GlobalRwDarc::drop), )?, }) } + pub(crate) fn drop(lock: &mut DistRwLock) -> bool { + lock.dirty_num_locks() == 0 + } // pub(crate) fn try_new>( // team: U, @@ -793,12 +814,19 @@ impl GlobalRwDarc { inner: NonNull::new(self.darc.inner as *mut DarcInner) .expect("invalid darc pointer"), }; + let wrapped_lock = WrappedInner { + inner: 
NonNull::new(self.darc.inner as *mut DarcInner>) + .expect("invalid darc pointer"), + }; let team = self.darc.inner().team().clone(); IntoDarcHandle { darc: self.into(), team, launched: false, outstanding_future: Box::pin(async move { + while wrapped_lock.item().dirty_num_locks() != 0 { + async_std::task::yield_now().await; + } DarcInner::block_on_outstanding(wrapped_inner, DarcMode::Darc, 0).await; }), } @@ -823,19 +851,26 @@ impl GlobalRwDarc { /// let world = LamellarWorldBuilder::new().build(); /// /// let five = GlobalRwDarc::new(&world,5).expect("PE in world team"); - /// let five_as_localdarc = world.block_on(async move {five.into_localrw()}); + /// let five_as_localdarc = world.block_on(async move {five.into_localrw().await}); /// ``` pub fn into_localrw(self) -> IntoLocalRwDarcHandle { let wrapped_inner = WrappedInner { inner: NonNull::new(self.darc.inner as *mut DarcInner) .expect("invalid darc pointer"), }; + let wrapped_lock = WrappedInner { + inner: NonNull::new(self.darc.inner as *mut DarcInner>) + .expect("invalid darc pointer"), + }; let team = self.darc.inner().team().clone(); IntoLocalRwDarcHandle { darc: self.into(), team, launched: false, outstanding_future: Box::pin(async move { + while wrapped_lock.item().dirty_num_locks() != 0 { + async_std::task::yield_now().await; + } DarcInner::block_on_outstanding(wrapped_inner, DarcMode::LocalRw, 0).await; }), } diff --git a/src/darc/handle.rs b/src/darc/handle.rs index c89f96dd..292a6dc2 100644 --- a/src/darc/handle.rs +++ b/src/darc/handle.rs @@ -835,6 +835,7 @@ impl Future for IntoDarcHandle { src_pe: this.darc.src_pe(), }; darc.inner_mut().update_item(Box::into_raw(Box::new(item))); + darc.inner_mut().drop = None; Poll::Ready(darc) } } @@ -938,6 +939,7 @@ impl Future for IntoLocalRwDarcHandle { }; darc.inner_mut() .update_item(Box::into_raw(Box::new(Arc::new(RwLock::new(item))))); + darc.inner_mut().drop = None; Poll::Ready(LocalRwDarc { darc }) } } @@ -1043,6 +1045,7 @@ impl Future for IntoGlobalRwDarcHandle { item, this.team.clone(), )))); + darc.inner_mut().drop = Some(GlobalRwDarc::drop); Poll::Ready(GlobalRwDarc { darc }) } } diff --git a/src/darc/local_rw_darc.rs b/src/darc/local_rw_darc.rs index e028a625..6d4689f8 100644 --- a/src/darc/local_rw_darc.rs +++ b/src/darc/local_rw_darc.rs @@ -11,7 +11,7 @@ use crate::lamellae::LamellaeRDMA; use crate::lamellar_team::IntoLamellarTeam; use crate::{IdError, LamellarEnv, LamellarTeam}; -use super::handle::{ +pub(crate) use super::handle::{ IntoDarcHandle, IntoGlobalRwDarcHandle, LocalRwDarcReadHandle, LocalRwDarcWriteHandle, }; diff --git a/src/lamellar_request.rs b/src/lamellar_request.rs index 7016abaf..35e8b12b 100755 --- a/src/lamellar_request.rs +++ b/src/lamellar_request.rs @@ -21,6 +21,7 @@ pub(crate) enum InternalResult { //#[doc(hidden)] // #[enum_dispatch] pub(crate) trait LamellarRequest: Future { + fn launch(&mut self); fn blocking_wait(self) -> Self::Output; fn ready_or_set_waker(&mut self, waker: &Waker) -> bool; fn val(&self) -> Self::Output; diff --git a/src/lamellar_task_group.rs b/src/lamellar_task_group.rs index 635048ae..0fe653c5 100644 --- a/src/lamellar_task_group.rs +++ b/src/lamellar_task_group.rs @@ -160,6 +160,9 @@ impl TaskGroupAmHandle { } impl LamellarRequest for TaskGroupAmHandle { + fn launch(&mut self) { + self.launch_am_if_needed(); + } fn blocking_wait(mut self) -> Self::Output { self.launch_am_if_needed(); let mut res = self.inner.data.lock().remove(&self.sub_id); @@ -361,6 +364,9 @@ impl TaskGroupMultiAmHandle { } impl LamellarRequest for 
TaskGroupMultiAmHandle { + fn launch(&mut self) { + self.launch_am_if_needed(); + } fn blocking_wait(mut self) -> Self::Output { self.launch_am_if_needed(); while !self.inner.data.lock().contains_key(&self.sub_id) { @@ -538,6 +544,9 @@ impl TaskGroupLocalAmHandle { } impl LamellarRequest for TaskGroupLocalAmHandle { + fn launch(&mut self) { + self.launch_am_if_needed(); + } fn blocking_wait(mut self) -> Self::Output { self.launch_am_if_needed(); let mut res = self.inner.data.lock().remove(&self.sub_id); diff --git a/src/lamellar_team.rs b/src/lamellar_team.rs index ac75aba6..9b24a24b 100644 --- a/src/lamellar_team.rs +++ b/src/lamellar_team.rs @@ -1395,7 +1395,6 @@ impl LamellarTeamRT { let mut orig_reqs = self.team_counters.send_req_cnt.load(Ordering::SeqCst); let mut orig_launched = self.team_counters.launched_req_cnt.load(Ordering::SeqCst); - // println!( // "in team wait_all mype: {:?} cnt: {:?} {:?}", // self.world_pe, @@ -1403,13 +1402,13 @@ impl LamellarTeamRT { // self.team_counters.outstanding_reqs.load(Ordering::SeqCst), // ); while self.panic.load(Ordering::SeqCst) == 0 - && ((self.team_counters.outstanding_reqs.load(Ordering::SeqCst) > 0 - || orig_reqs != self.team_counters.send_req_cnt.load(Ordering::SeqCst) - || orig_launched != self.team_counters.launched_req_cnt.load(Ordering::SeqCst)) + && ((self.team_counters.outstanding_reqs.load(Ordering::SeqCst) > 0 + || orig_reqs != self.team_counters.send_req_cnt.load(Ordering::SeqCst) + || orig_launched != self.team_counters.launched_req_cnt.load(Ordering::SeqCst)) || (self.parent.is_none() && self.world_counters.outstanding_reqs.load(Ordering::SeqCst) > 0)) { - orig_reqs = self.team_counters.send_req_cnt.load(Ordering::SeqCst); + orig_reqs = self.team_counters.send_req_cnt.load(Ordering::SeqCst); orig_launched = self.team_counters.launched_req_cnt.load(Ordering::SeqCst); // std::thread::yield_now(); // self.flush(); @@ -1425,12 +1424,12 @@ impl LamellarTeamRT { ); temp_now = Instant::now(); } - } if self.team_counters.send_req_cnt.load(Ordering::SeqCst) != self.team_counters.launched_req_cnt.load(Ordering::SeqCst) - || (self.parent.is_none() && self.world_counters.send_req_cnt.load(Ordering::SeqCst) - != self.world_counters.launched_req_cnt.load(Ordering::SeqCst)) + || (self.parent.is_none() + && self.world_counters.send_req_cnt.load(Ordering::SeqCst) + != self.world_counters.launched_req_cnt.load(Ordering::SeqCst)) { println!( "in team wait_all mype: {:?} cnt: {:?} {:?} {:?}", @@ -1452,7 +1451,6 @@ impl LamellarTeamRT { // ); } pub(crate) async fn await_all(&self) { - let mut temp_now = Instant::now(); while self.panic.load(Ordering::SeqCst) == 0 && (self.team_counters.outstanding_reqs.load(Ordering::SeqCst) > 0 @@ -1474,8 +1472,9 @@ impl LamellarTeamRT { } if self.team_counters.send_req_cnt.load(Ordering::SeqCst) != self.team_counters.launched_req_cnt.load(Ordering::SeqCst) - || (self.parent.is_none() && self.world_counters.send_req_cnt.load(Ordering::SeqCst) - != self.world_counters.launched_req_cnt.load(Ordering::SeqCst)) + || (self.parent.is_none() + && self.world_counters.send_req_cnt.load(Ordering::SeqCst) + != self.world_counters.launched_req_cnt.load(Ordering::SeqCst)) { RuntimeWarning::UnspawnedTask( "`await_all` before all tasks/active messages have been spawned", @@ -1719,15 +1718,7 @@ impl LamellarTeamRT { }; self.world_counters.inc_send_req(1); self.team_counters.inc_send_req(1); - // println!( - // "req_id: {:?} tc: {:?} wc: {:?}", - // id, - // self.team_counters.outstanding_reqs.load(Ordering::Relaxed), - // 
self.world_counters.outstanding_reqs.load(Ordering::Relaxed) - // ); - // println!("cnts: t: {} w: {} tg: {:?}",self.team_counters.outstanding_reqs.load(Ordering::Relaxed),self.world_counters.outstanding_reqs.load(Ordering::Relaxed), tg_outstanding_reqs.as_ref().map(|x| x.load(Ordering::Relaxed))); - // println!("req_id: {:?}", id); let world = if let Some(world) = &self.world { world.clone() } else { @@ -1744,13 +1735,6 @@ impl LamellarTeamRT { team_addr: self.remote_ptr_addr, }; - // println!("[{:?}] team exec am pe tg", std::thread::current().id()); - // self.scheduler.submit_am(Am::Remote(req_data, func)); - - // Box::new(LamellarRequestHandle { - // inner: req, - // _phantom: PhantomData, - // }) AmHandle { inner: req, am: Some((Am::Remote(req_data, func), 1)), @@ -1759,6 +1743,74 @@ impl LamellarTeamRT { .into() } + pub(crate) fn spawn_am_pe_tg( + self: &Pin>, + pe: usize, + am: F, + task_group_cnts: Option>, + ) -> AmHandle + where + F: RemoteActiveMessage + LamellarAM + crate::Serialize + 'static, + { + // println!("team exec am pe tg"); + if let Some(task_group_cnts) = task_group_cnts.as_ref() { + task_group_cnts.inc_outstanding(1); + task_group_cnts.inc_launched(1); + task_group_cnts.inc_send_req(1); + } + assert!(pe < self.arch.num_pes()); + + let req = Arc::new(AmHandleInner { + ready: AtomicBool::new(false), + data: Cell::new(None), + waker: Mutex::new(None), + team_counters: self.team_counters.clone(), + world_counters: self.world_counters.clone(), + tg_counters: task_group_cnts, + user_handle: AtomicU8::new(1), + scheduler: self.scheduler.clone(), + }); + let req_result = Arc::new(LamellarRequestResult::Am(req.clone())); + let req_ptr = Arc::into_raw(req_result); + // Arc::increment_strong_count(req_ptr); //we would need to do this for the exec_all command + let id = ReqId { + id: req_ptr as usize, + sub_id: 0, + }; + + self.world_counters.inc_outstanding(1); + self.world_counters.inc_launched(1); + self.world_counters.inc_send_req(1); + self.team_counters.inc_outstanding(1); + self.team_counters.inc_launched(1); + self.team_counters.inc_send_req(1); + + let world = if let Some(world) = &self.world { + world.clone() + } else { + self.clone() + }; + let func: LamellarArcAm = Arc::new(am); + let req_data = ReqMetaData { + src: self.world_pe, + dst: Some(self.arch.world_pe(pe).expect("pe not member of team")), + id: id, + lamellae: self.lamellae.clone(), + world: world, + team: self.clone(), + team_addr: self.remote_ptr_addr, + }; + + self.scheduler.submit_am(Am::Remote(req_data, func)); + + AmHandle { + inner: req, + am: None, + _phantom: PhantomData, + } + .into() + } + //#[tracing::instrument(skip_all)] pub(crate) fn am_group_exec_am_pe_tg( self: &Pin>, diff --git a/tests/array/arithmetic_ops/fetch_add_test.rs b/tests/array/arithmetic_ops/fetch_add_test.rs index c14438ff..b2048841 100644 --- a/tests/array/arithmetic_ops/fetch_add_test.rs +++ b/tests/array/arithmetic_ops/fetch_add_test.rs @@ -135,7 +135,7 @@ macro_rules! fetch_add_test{ let mut reqs = vec![]; for _i in 0..(pe_max_val as usize){ #[allow(unused_unsafe)] - reqs.push( unsafe{ array.fetch_add(idx,1 as $t)}); + reqs.push( unsafe{ array.fetch_add(idx,1 as $t).spawn()}); } #[allow(unused_mut)] let mut prevs: std::collections::HashSet = std::collections::HashSet::new(); @@ -452,14 +452,14 @@ macro_rules! 
input_test{ let mut reqs = vec![]; for i in 0..array.len(){ #[allow(unused_unsafe)] - reqs.push( unsafe{ array.batch_fetch_add(i,1)}); + reqs.push( unsafe{ array.batch_fetch_add(i,1).spawn()}); } check_results!($array,array,num_pes,reqs,"T"); //individual T------------------------------ let mut reqs = vec![]; for i in 0..array.len(){ #[allow(unused_unsafe)] - reqs.push( unsafe{ array.batch_fetch_add(&i,1)}); + reqs.push( unsafe{ array.batch_fetch_add(&i,1).spawn()}); } check_results!($array,array,num_pes,reqs,"&T"); //&[T]------------------------------ @@ -471,16 +471,16 @@ macro_rules! input_test{ let mut reqs = vec![]; #[allow(unused_unsafe)] - reqs.push( unsafe{ array.batch_fetch_add(idx_slice,1)}); + reqs.push( unsafe{ array.batch_fetch_add(idx_slice,1).spawn()}); check_results!($array,array,num_pes,reqs,"&[T]"); // single_idx multi_ val #[allow(unused_unsafe)] - reqs.push( unsafe{ array.batch_fetch_add(_my_pe,&vals)}); + reqs.push( unsafe{ array.batch_fetch_add(_my_pe,&vals).spawn()}); let real_val = array.len(); check_results!($array,array,num_pes, real_val,reqs,"&[T]"); // multi_idx multi_ val #[allow(unused_unsafe)] - reqs.push(unsafe{array.batch_fetch_add(idx_slice,vals_slice)}); + reqs.push(unsafe{array.batch_fetch_add(idx_slice,vals_slice).spawn()}); check_results!($array,array,num_pes,reqs,"&[T]"); //scoped &[T]------------------------------ @@ -489,27 +489,27 @@ macro_rules! input_test{ let vec=(0..array.len()).collect::>(); let slice = &vec[..]; #[allow(unused_unsafe)] - reqs.push( unsafe{ array.batch_fetch_add(slice,1)}); + reqs.push( unsafe{ array.batch_fetch_add(slice,1).spawn()}); } check_results!($array,array,num_pes,reqs,"scoped &[T]"); // Vec------------------------------ let vec=(0..array.len()).collect::>(); let mut reqs = vec![]; #[allow(unused_unsafe)] - reqs.push( unsafe{ array.batch_fetch_add(vec,1)}); + reqs.push( unsafe{ array.batch_fetch_add(vec,1).spawn()}); check_results!($array,array,num_pes,reqs,"Vec"); // &Vec------------------------------ let mut reqs = vec![]; let vec=(0..array.len()).collect::>(); #[allow(unused_unsafe)] - reqs.push( unsafe{ array.batch_fetch_add(&vec,1)}); + reqs.push( unsafe{ array.batch_fetch_add(&vec,1).spawn()}); check_results!($array,array,num_pes,reqs,"&Vec"); // Scoped Vec------------------------------ let mut reqs = vec![]; { let vec=(0..array.len()).collect::>(); #[allow(unused_unsafe)] - reqs.push( unsafe{ array.batch_fetch_add(vec,1)}); + reqs.push( unsafe{ array.batch_fetch_add(vec,1).spawn()}); } check_results!($array,array,num_pes,reqs,"scoped Vec"); // Scoped &Vec------------------------------ @@ -517,7 +517,7 @@ macro_rules! input_test{ { let vec=(0..array.len()).collect::>(); #[allow(unused_unsafe)] - reqs.push( unsafe{ array.batch_fetch_add(&vec,1)}); + reqs.push( unsafe{ array.batch_fetch_add(&vec,1).spawn()}); } check_results!($array,array,num_pes,reqs,"scoped &Vec"); @@ -529,7 +529,7 @@ macro_rules! input_test{ for i in 0..array.len(){ slice[i]=i; } - reqs.push(array.batch_fetch_add(slice,1)); + reqs.push(array.batch_fetch_add(slice,1).spawn()); check_results!($array,array,num_pes,reqs,"scoped &LMR"); } @@ -542,7 +542,7 @@ macro_rules! input_test{ slice[i]=i; } - reqs.push(array.batch_fetch_add(slice,1)); + reqs.push(array.batch_fetch_add(slice,1).spawn()); check_results!($array,array,num_pes,reqs,"scoped SMR"); } // UnsafeArray------------------------------ @@ -552,7 +552,7 @@ macro_rules! 
input_test{ // UnsafeArray------------------------------ let mut reqs = vec![]; #[allow(unused_unsafe)] - reqs.push(unsafe{array.batch_fetch_add(input_array.local_data(),1)}); + reqs.push(unsafe{array.batch_fetch_add(input_array.local_data(),1).spawn()}); check_results!($array,array,num_pes,reqs,"&UnsafeArray"); // ReadOnlyArray------------------------------ @@ -564,7 +564,7 @@ macro_rules! input_test{ // ReadOnlyArray------------------------------ let mut reqs = vec![]; #[allow(unused_unsafe)] - reqs.push(unsafe{array.batch_fetch_add(input_array.local_data(),1)}); + reqs.push(unsafe{array.batch_fetch_add(input_array.local_data(),1).spawn()}); check_results!($array,array,num_pes,reqs,"&ReadOnlyArray"); // AtomicArray------------------------------ @@ -576,7 +576,7 @@ macro_rules! input_test{ // AtomicArray------------------------------ let mut reqs = vec![]; #[allow(unused_unsafe)] - reqs.push(unsafe{array.batch_fetch_add(&input_array.local_data(),1)}); + reqs.push(unsafe{array.batch_fetch_add(&input_array.local_data(),1).spawn()}); check_results!($array,array,num_pes,reqs,"&AtomicArray"); // LocalLockArray------------------------------ @@ -590,7 +590,7 @@ macro_rules! input_test{ let local_data = input_array.read_local_data().block(); // println!("local lock array len: {:?}", local_data.deref()); #[allow(unused_unsafe)] - reqs.push(unsafe{array.batch_fetch_add(&local_data,1)}); + reqs.push(unsafe{array.batch_fetch_add(&local_data,1).spawn()}); drop(local_data); check_results!($array,array,num_pes,reqs,"&LocalLockArray"); @@ -603,7 +603,7 @@ macro_rules! input_test{ // GlobalLockArray------------------------------ let mut reqs = vec![]; #[allow(unused_unsafe)] - reqs.push(unsafe{array.batch_fetch_add(&input_array.read_local_data().block(),1)}); + reqs.push(unsafe{array.batch_fetch_add(&input_array.read_local_data().block(),1).spawn()}); check_results!($array,array,num_pes,reqs,"&GlobalLockArray"); } } From 59dde95981a73a65fe1c057de0849ef6ede14ea3 Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Thu, 7 Nov 2024 09:01:56 -0800 Subject: [PATCH 103/116] convert distributed object creation to utilize handles for blocking, spawning, or awaiting --- .../am_local_memregions.rs | 2 +- examples/array_examples/array_am.rs | 6 +- examples/array_examples/array_batch_add.rs | 2 +- .../array_consumer_schedules.rs | 2 +- .../array_first_last_global_indices.rs | 4 +- examples/array_examples/array_ops.rs | 10 +- examples/array_examples/array_put_get.rs | 17 +- .../array_examples/atomic_compare_exchange.rs | 4 +- examples/array_examples/dist_array_reduce.rs | 10 +- .../array_examples/distributed_iteration.rs | 4 +- examples/array_examples/generic_array.rs | 2 +- examples/array_examples/global_lock_array.rs | 2 +- examples/array_examples/histo.rs | 2 +- examples/array_examples/local_iteration.rs | 4 +- examples/array_examples/onesided_iteration.rs | 10 +- examples/bandwidths/am_bw_get.rs | 6 +- examples/bandwidths/am_group_bw_get.rs | 6 +- examples/bandwidths/atomic_array_get_bw.rs | 4 +- examples/bandwidths/atomic_array_put_bw.rs | 4 +- examples/bandwidths/get_bw.rs | 7 +- .../global_lock_atomic_array_get_bw.rs | 4 +- .../global_lock_atomic_array_put_bw.rs | 4 +- .../local_lock_atomic_array_get_bw.rs | 4 +- .../local_lock_atomic_array_put_bw.rs | 4 +- examples/bandwidths/put_bw.rs | 9 +- examples/bandwidths/readonly_array_get_bw.rs | 4 +- .../readonly_array_get_unchecked_bw.rs | 4 +- examples/bandwidths/unsafe_array_get_bw.rs | 4 +- .../unsafe_array_get_unchecked_bw.rs | 4 +- examples/bandwidths/unsafe_array_put_bw.rs | 4 +- .../unsafe_array_put_unchecked_bw.rs | 4 +- examples/bandwidths/unsafe_array_store_bw.rs | 4 +- examples/darc_examples/darc.rs | 14 +- examples/darc_examples/stress_test.rs | 2 +- examples/darc_examples/string_darc.rs | 5 +- examples/hello_world/hello_world_array.rs | 2 +- .../hello_world_array_iteration.rs | 2 +- examples/kernels/am_gemm.rs | 19 +- examples/kernels/cached_am_gemm.rs | 41 +- examples/kernels/dft_proxy.rs | 30 +- examples/kernels/parallel_array_gemm.rs | 6 +- .../kernels/parallel_blocked_array_gemm.rs | 8 +- .../safe_parallel_blocked_array_gemm.rs | 10 +- examples/kernels/serial_array_gemm.rs | 6 +- examples/misc/dist_hashmap.rs | 2 +- examples/misc/lamellar_env.rs | 8 +- examples/misc/ping_pong.rs | 26 +- examples/rdma_examples/rdma_am.rs | 13 +- examples/rdma_examples/rdma_get.rs | 7 +- examples/rdma_examples/rdma_put.rs | 4 +- impl/src/lib.rs | 2 +- src/active_messaging.rs | 2 +- src/array.rs | 82 +-- src/array/atomic.rs | 95 +-- src/array/atomic/handle.rs | 148 +++++ src/array/generic_atomic.rs | 44 +- src/array/generic_atomic/handle.rs | 92 +++ src/array/generic_atomic/rdma.rs | 2 +- src/array/global_lock_atomic.rs | 81 +-- src/array/global_lock_atomic/handle.rs | 118 +++- src/array/global_lock_atomic/rdma.rs | 2 +- src/array/iterator/consumer.rs | 1 - src/array/iterator/distributed_iterator.rs | 56 +- .../distributed_iterator/consumer/count.rs | 28 +- .../distributed_iterator/consumer/reduce.rs | 36 +- .../distributed_iterator/consumer/sum.rs | 37 +- src/array/iterator/local_iterator.rs | 64 +- .../local_iterator/consumer/collect.rs | 10 +- src/array/iterator/mod.rs | 12 +- src/array/iterator/one_sided_iterator.rs | 22 +- .../iterator/one_sided_iterator/chunks.rs | 2 +- src/array/local_lock_atomic.rs | 83 +-- src/array/local_lock_atomic/handle.rs | 191 +++++- src/array/local_lock_atomic/iteration.rs | 2 +- src/array/local_lock_atomic/local_chunks.rs | 4 +- src/array/local_lock_atomic/rdma.rs | 2 +- src/array/native_atomic.rs | 35 +- 
src/array/native_atomic/handle.rs | 96 +++ src/array/native_atomic/rdma.rs | 2 +- src/array/operations/access.rs | 28 +- src/array/operations/arithmetic.rs | 92 +-- src/array/operations/bitwise.rs | 60 +- src/array/operations/compare_exchange.rs | 40 +- src/array/operations/handle.rs | 2 +- src/array/operations/read_only.rs | 12 +- src/array/operations/shift.rs | 34 +- src/array/prelude.rs | 18 +- src/array/read_only.rs | 60 +- src/array/read_only/handle.rs | 93 +++ src/array/read_only/local_chunks.rs | 2 +- src/array/read_only/rdma.rs | 4 +- src/array/unsafe.rs | 198 ++---- src/array/unsafe/handle.rs | 93 +++ src/array/unsafe/iteration.rs | 12 +- src/array/unsafe/local_chunks.rs | 4 +- src/array/unsafe/rdma.rs | 20 +- src/barrier.rs | 38 +- src/darc.rs | 346 ++++++----- src/darc/global_rw_darc.rs | 62 +- src/darc/handle.rs | 306 +++++++-- src/darc/local_rw_darc.rs | 36 +- src/lamellar_team.rs | 80 ++- src/lamellar_world.rs | 10 +- src/lib.rs | 4 +- src/memregion.rs | 7 +- src/memregion/shared.rs | 58 +- tests/add.rs | 2 +- tests/and.rs | 2 +- tests/array/arithmetic_ops/add_test.rs | 319 +++++----- tests/array/arithmetic_ops/div_test.rs | 2 +- tests/array/arithmetic_ops/fetch_add_test.rs | 10 +- tests/array/arithmetic_ops/fetch_div_test.rs | 2 +- tests/array/arithmetic_ops/fetch_mul_test.rs | 2 +- tests/array/arithmetic_ops/fetch_rem_test.rs | 2 +- tests/array/arithmetic_ops/fetch_sub_test.rs | 2 +- tests/array/arithmetic_ops/mul_test.rs | 2 +- tests/array/arithmetic_ops/rem_test.rs | 2 +- tests/array/arithmetic_ops/sub_test.rs | 2 +- tests/array/array_into_test.rs | 2 +- .../array/atomic_ops/compare_exchange_test.rs | 588 +++++++++--------- tests/array/atomic_ops/load_store_test.rs | 186 +++--- tests/array/atomic_ops/swap_test.rs | 2 +- tests/array/bitwise_ops/and_test.rs | 2 +- tests/array/bitwise_ops/fetch_and_test.rs | 2 +- tests/array/bitwise_ops/fetch_or_test.rs | 2 +- tests/array/bitwise_ops/fetch_xor_test.rs | 2 +- tests/array/bitwise_ops/or_test.rs | 2 +- tests/array/bitwise_ops/xor_test.rs | 2 +- tests/array/local_only/clone.rs | 2 +- tests/array/local_only/immutable_borrow.rs | 2 +- tests/array/rdma/blocking_get_test.rs | 4 +- tests/array/rdma/get_test.rs | 4 +- tests/array/rdma/put_test.rs | 4 +- tests/blocking_get.rs | 2 +- tests/compare_exchange.rs | 2 +- tests/div.rs | 2 +- tests/fetch_add.rs | 4 +- tests/fetch_and.rs | 2 +- tests/fetch_div.rs | 2 +- tests/fetch_mul.rs | 2 +- tests/fetch_or.rs | 2 +- tests/fetch_rem.rs | 2 +- tests/fetch_sub.rs | 2 +- tests/fetch_xor.rs | 2 +- tests/get.rs | 2 +- tests/load_store.rs | 2 +- tests/mul.rs | 2 +- tests/or.rs | 2 +- tests/put.rs | 2 +- tests/rem.rs | 2 +- tests/sub.rs | 2 +- tests/swap.rs | 2 +- tests/xor.rs | 2 +- 153 files changed, 2912 insertions(+), 1721 deletions(-) create mode 100644 src/array/atomic/handle.rs create mode 100644 src/array/generic_atomic/handle.rs create mode 100644 src/array/native_atomic/handle.rs create mode 100644 src/array/read_only/handle.rs create mode 100644 src/array/unsafe/handle.rs diff --git a/examples/active_message_examples/am_local_memregions.rs b/examples/active_message_examples/am_local_memregions.rs index 36e1a2bf..1a2fa8cc 100644 --- a/examples/active_message_examples/am_local_memregions.rs +++ b/examples/active_message_examples/am_local_memregions.rs @@ -50,7 +50,7 @@ fn main() { let world = lamellar::LamellarWorldBuilder::new().build(); let my_pe = world.my_pe(); let num_pes = world.num_pes(); - let array = world.alloc_one_sided_mem_region::(10); + let array = 
world.alloc_one_sided_mem_region::(10).expect("Enough memory should exist"); let mut rng = rand::thread_rng(); let pes = Uniform::from(0..num_pes); diff --git a/examples/array_examples/array_am.rs b/examples/array_examples/array_am.rs index 72d29de3..a9a0ceb1 100644 --- a/examples/array_examples/array_am.rs +++ b/examples/array_examples/array_am.rs @@ -32,7 +32,7 @@ impl LamellarAM for RdmaAM { }); //get the original nodes data - let local = lamellar::world.alloc_one_sided_mem_region::(ARRAY_LEN); + let local = lamellar::world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); let local_slice = unsafe { local.as_mut_slice().unwrap() }; local_slice[ARRAY_LEN - 1] = num_pes as u8; unsafe { @@ -65,9 +65,9 @@ fn main() { let my_pe = world.my_pe(); let num_pes = world.num_pes(); println!("creating array"); - let array = UnsafeArray::::new(world.team(), ARRAY_LEN, Distribution::Block); + let array = UnsafeArray::::new(world.team(), ARRAY_LEN, Distribution::Block).block(); println!("creating memregion"); - let local_mem_region = world.alloc_one_sided_mem_region::(ARRAY_LEN); + let local_mem_region = world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); println!("about to initialize array"); array.print(); if my_pe == 0 { diff --git a/examples/array_examples/array_batch_add.rs b/examples/array_examples/array_batch_add.rs index ab054043..f4891d3f 100644 --- a/examples/array_examples/array_batch_add.rs +++ b/examples/array_examples/array_batch_add.rs @@ -28,7 +28,7 @@ fn main() { let num_pes = world.num_pes(); let my_pe = world.my_pe(); let array_size = 1000000; - let array = AtomicArray::::new(world.clone(), array_size, Distribution::Block); //non intrinsic atomic, non bitwise + let array = AtomicArray::::new(world.clone(), array_size, Distribution::Block).block(); //non intrinsic atomic, non bitwise //create vec of random indices between 0 & 1000000 let mut rng = rand::thread_rng(); let indices = (0..10_000_000) diff --git a/examples/array_examples/array_consumer_schedules.rs b/examples/array_examples/array_consumer_schedules.rs index 91ce1048..cb1f1d7a 100644 --- a/examples/array_examples/array_consumer_schedules.rs +++ b/examples/array_examples/array_consumer_schedules.rs @@ -295,7 +295,7 @@ fn main() { let _my_pe = world.my_pe(); let num_pes = world.num_pes(); let block_array = - AtomicArray::::new(world.team(), ARRAY_LEN * num_pes, Distribution::Block); + AtomicArray::::new(world.team(), ARRAY_LEN * num_pes, Distribution::Block).block(); println!("array created"); block_array.print(); let _ = block_array diff --git a/examples/array_examples/array_first_last_global_indices.rs b/examples/array_examples/array_first_last_global_indices.rs index 883d4397..e3033efc 100644 --- a/examples/array_examples/array_first_last_global_indices.rs +++ b/examples/array_examples/array_first_last_global_indices.rs @@ -5,7 +5,7 @@ fn main() { let world = lamellar::LamellarWorldBuilder::new().build(); let my_pe = world.my_pe(); - let array = ReadOnlyArray::::new(world.team(), ARRAY_LEN, Distribution::Block); + let array = ReadOnlyArray::::new(world.team(), ARRAY_LEN, Distribution::Block).block(); if my_pe == 0 { println!("Block Full Array"); for pe in 0..array.num_pes() { @@ -24,7 +24,7 @@ fn main() { } } - let array = ReadOnlyArray::::new(world.team(), ARRAY_LEN, Distribution::Cyclic); + let array = ReadOnlyArray::::new(world.team(), ARRAY_LEN, Distribution::Cyclic).block(); if my_pe == 0 { println!("Cyclic Full Array"); for pe in 0..array.num_pes() { diff 
--git a/examples/array_examples/array_ops.rs b/examples/array_examples/array_ops.rs index 61763bf5..1ce43af2 100644 --- a/examples/array_examples/array_ops.rs +++ b/examples/array_examples/array_ops.rs @@ -460,11 +460,11 @@ fn main() { let world = lamellar::LamellarWorldBuilder::new().build(); let num_pes = world.num_pes(); let my_pe = world.my_pe(); - let array_f64 = AtomicArray::::new(world.clone(), num_pes * 10, Distribution::Block); //non intrinsic atomic, non bitwise - let array_u8 = AtomicArray::::new(world.clone(), num_pes * 10, Distribution::Block); //intrinsic atomic, bitwise - let array_i128 = AtomicArray::::new(world.clone(), num_pes * 10, Distribution::Block); //non intrinsic atomic, bitwise - let array_custom = AtomicArray::::new(world.clone(), num_pes * 10, Distribution::Block); //non intrinsic atomic, non bitwise - let _array_bool = AtomicArray::::new(world.clone(), num_pes * 10, Distribution::Block); + let array_f64 = AtomicArray::::new(world.clone(), num_pes * 10, Distribution::Block).block(); //non intrinsic atomic, non bitwise + let array_u8 = AtomicArray::::new(world.clone(), num_pes * 10, Distribution::Block).block(); //intrinsic atomic, bitwise + let array_i128 = AtomicArray::::new(world.clone(), num_pes * 10, Distribution::Block).block(); //non intrinsic atomic, bitwise + let array_custom = AtomicArray::::new(world.clone(), num_pes * 10, Distribution::Block).block(); //non intrinsic atomic, non bitwise + let _array_bool = AtomicArray::::new(world.clone(), num_pes * 10, Distribution::Block).block(); println!("ADD-----------------------"); test_add(array_f64.clone(), 0.0, 1.0); diff --git a/examples/array_examples/array_put_get.rs b/examples/array_examples/array_put_get.rs index 85c38674..44c62f05 100644 --- a/examples/array_examples/array_put_get.rs +++ b/examples/array_examples/array_put_get.rs @@ -27,10 +27,19 @@ fn main() { .and_then(|s| s.parse::().ok()) .unwrap_or_else(|| 100); - let block_array = UnsafeArray::::new(world.team(), total_len, Distribution::Block); - let cyclic_array = UnsafeArray::::new(world.team(), total_len, Distribution::Cyclic); - let shared_mem_region = world.alloc_shared_mem_region(total_len).into(); //Convert into abstract LamellarMemoryRegion - let local_mem_region = world.alloc_one_sided_mem_region(total_len).into(); + let block_array = + UnsafeArray::::new(world.team(), total_len, Distribution::Block).await; + let cyclic_array = + UnsafeArray::::new(world.team(), total_len, Distribution::Cyclic).await; + let shared_mem_region = world + .alloc_shared_mem_region(total_len) + .await + .expect("Enough memory should exist") + .into(); //Convert into abstract LamellarMemoryRegion + let local_mem_region = world + .alloc_one_sided_mem_region(total_len) + .expect("Enough memory should exist") + .into(); initialize_array(&block_array).await; initialize_array(&cyclic_array).await; initialize_mem_region(&shared_mem_region); diff --git a/examples/array_examples/atomic_compare_exchange.rs b/examples/array_examples/atomic_compare_exchange.rs index 452168b2..0aab9b41 100644 --- a/examples/array_examples/atomic_compare_exchange.rs +++ b/examples/array_examples/atomic_compare_exchange.rs @@ -23,7 +23,7 @@ fn main() { let num_pes = world.num_pes(); let my_pe = world.my_pe(); - let array = AtomicArray::::new(world.team(), num_pes * 2, Distribution::Block); + let array = AtomicArray::::new(world.team(), num_pes * 2, Distribution::Block).block(); array.dist_iter_mut().for_each(|x| x.store(0)).block(); //initialize array -- use atomic store array.barrier(); @@ 
-44,7 +44,7 @@ fn main() { array.barrier(); array.print(); - let array_2 = AtomicArray::::new(world.team(), num_pes * 100000, Distribution::Cyclic); + let array_2 = AtomicArray::::new(world.team(), num_pes * 100000, Distribution::Cyclic).block(); array_2.dist_iter_mut().for_each(|x| x.store(0.0)).block(); array_2.barrier(); diff --git a/examples/array_examples/dist_array_reduce.rs b/examples/array_examples/dist_array_reduce.rs index bbcf5235..277d8ceb 100644 --- a/examples/array_examples/dist_array_reduce.rs +++ b/examples/array_examples/dist_array_reduce.rs @@ -36,9 +36,11 @@ fn main() { let len_per_pe = total_len as f32 / num_pes as f32; let my_local_size = len_per_pe.round() as usize; //((len_per_pe * (my_pe+1) as f32).round() - (len_per_pe * my_pe as f32).round()) as usize; println!("my local size {:?}", my_local_size); - let block_array = UnsafeArray::::new(world.team(), total_len, Distribution::Block); - let cyclic_array = UnsafeArray::::new(world.team(), total_len, Distribution::Cyclic); - let local_mem_region = world.alloc_one_sided_mem_region(total_len); + let block_array = + UnsafeArray::::new(world.team(), total_len, Distribution::Block).block(); + let cyclic_array = + UnsafeArray::::new(world.team(), total_len, Distribution::Cyclic).block(); + let local_mem_region = world.alloc_one_sided_mem_region(total_len).expect("Enough memory should exist"); world.barrier(); if my_pe == 0 { unsafe { @@ -152,7 +154,7 @@ fn main() { let block_array = block_array.into_read_only(); let _ = block_array.sum().block(); - let one_elem_array = UnsafeArray::::new(world.team(), 1, Distribution::Block); + let one_elem_array = UnsafeArray::::new(world.team(), 1, Distribution::Block).block(); let min = unsafe { one_elem_array.min() }; let min = one_elem_array.block_on(min); println!("one elem array min: {min:?}"); diff --git a/examples/array_examples/distributed_iteration.rs b/examples/array_examples/distributed_iteration.rs index 1209bc16..4c021976 100644 --- a/examples/array_examples/distributed_iteration.rs +++ b/examples/array_examples/distributed_iteration.rs @@ -5,8 +5,8 @@ fn main() { let world = lamellar::LamellarWorldBuilder::new().build(); let my_pe = world.my_pe(); let _num_pes = world.num_pes(); - let block_array = LocalLockArray::::new(world.team(), ARRAY_LEN, Distribution::Block); - let cyclic_array = AtomicArray::::new(world.team(), ARRAY_LEN, Distribution::Cyclic); + let block_array = LocalLockArray::::new(world.team(), ARRAY_LEN, Distribution::Block).block(); + let cyclic_array = AtomicArray::::new(world.team(), ARRAY_LEN, Distribution::Cyclic).block(); // We expose multiple ways to iterate over a lamellar array // the first approach introduces what we call a distributed iterator (inspired by Rayon's parallel iterators). 
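The hunks above change array construction so that `new(...)` returns a handle rather than the array itself: synchronous callers finish it with `.block()`, while code already inside `world.block_on(async { ... })` can `.await` it. A minimal sketch of that pattern, assuming the post-patch handle API shown in these hunks and an illustrative `usize` element type (the generic parameters are not visible in this diff):

```rust
use lamellar::array::prelude::*;

fn main() {
    let world = lamellar::LamellarWorldBuilder::new().build();
    let num_pes = world.num_pes();

    // Synchronous construction: block on the handle returned by `new`.
    let _array =
        AtomicArray::<usize>::new(&world, num_pes * 10, Distribution::Block).block();

    // Inside an async context the same construction handle can be awaited instead.
    world.clone().block_on(async move {
        let _async_array =
            UnsafeArray::<usize>::new(world.team(), num_pes * 10, Distribution::Cyclic).await;
    });
}
```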
diff --git a/examples/array_examples/generic_array.rs b/examples/array_examples/generic_array.rs index a9fae5d1..24703ac8 100644 --- a/examples/array_examples/generic_array.rs +++ b/examples/array_examples/generic_array.rs @@ -7,7 +7,7 @@ struct ArrayWrapper { impl ArrayWrapper { fn new(world: LamellarWorld, len: usize) -> Self { ArrayWrapper { - _array: AtomicArray::::new(world, len, Distribution::Block), + _array: AtomicArray::::new(world, len, Distribution::Block).block(), } } } diff --git a/examples/array_examples/global_lock_array.rs b/examples/array_examples/global_lock_array.rs index 6b86f4e4..1d628601 100644 --- a/examples/array_examples/global_lock_array.rs +++ b/examples/array_examples/global_lock_array.rs @@ -6,7 +6,7 @@ fn main() { let my_pe = world.my_pe(); let _num_pes = world.num_pes(); - let array = GlobalLockArray::::new(&world, 100, Distribution::Block); + let array = GlobalLockArray::::new(&world, 100, Distribution::Block).block(); let s = Instant::now(); let local_data = array.read_local_data().block(); diff --git a/examples/array_examples/histo.rs b/examples/array_examples/histo.rs index 53223849..0eb5da8e 100644 --- a/examples/array_examples/histo.rs +++ b/examples/array_examples/histo.rs @@ -9,7 +9,7 @@ const NUM_UPDATES_PER_PE: usize = 100000; fn main() { let world = lamellar::LamellarWorldBuilder::new().build(); - let array = AtomicArray::::new(&world, ARRAY_SIZE, lamellar::Distribution::Block); + let array = AtomicArray::::new(&world, ARRAY_SIZE, lamellar::Distribution::Block).block(); let mut rng: StdRng = SeedableRng::seed_from_u64(world.my_pe() as u64); let range = rand::distributions::Uniform::new(0, ARRAY_SIZE); diff --git a/examples/array_examples/local_iteration.rs b/examples/array_examples/local_iteration.rs index c6b8edaf..13870dc2 100644 --- a/examples/array_examples/local_iteration.rs +++ b/examples/array_examples/local_iteration.rs @@ -5,8 +5,8 @@ fn main() { let world = lamellar::LamellarWorldBuilder::new().build(); let my_pe = world.my_pe(); let _num_pes = world.num_pes(); - let block_array = AtomicArray::::new(world.team(), ARRAY_LEN, Distribution::Block); - let cyclic_array = AtomicArray::::new(world.team(), ARRAY_LEN, Distribution::Cyclic); + let block_array = AtomicArray::::new(world.team(), ARRAY_LEN, Distribution::Block).block(); + let cyclic_array = AtomicArray::::new(world.team(), ARRAY_LEN, Distribution::Cyclic).block(); // We expose multiple ways to iterate over a lamellar array // the first approach introduces what we call a distributed iterator (inspired by Rayon's parallel iterators). 
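The `GlobalLockArray` hunk above applies the same handle style to its local-data locks: `read_local_data()` returns a handle that is `.block()`ed on, and the resulting guard releases the lock when dropped. A short sketch under the same assumptions (illustrative `usize` element type; that `write_local_data()` mirrors the read-side pattern and that the guard implements `Debug` for printing are assumptions, not shown in this diff):

```rust
use lamellar::array::prelude::*;

fn main() {
    let world = lamellar::LamellarWorldBuilder::new().build();
    let my_pe = world.my_pe();

    let array = GlobalLockArray::<usize>::new(&world, 100, Distribution::Block).block();

    // Acquire the local read lock; dropping the guard releases it.
    let local_data = array.read_local_data().block();
    println!("PE{my_pe} local data: {:?}", local_data);
    drop(local_data);
    world.barrier();

    // Assumed to mirror read_local_data(): take the local write lock, then release it.
    let write_guard = array.write_local_data().block();
    drop(write_guard);
    world.barrier();

    array.print();
}
```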
diff --git a/examples/array_examples/onesided_iteration.rs b/examples/array_examples/onesided_iteration.rs index f38a6b55..36e75687 100644 --- a/examples/array_examples/onesided_iteration.rs +++ b/examples/array_examples/onesided_iteration.rs @@ -7,8 +7,10 @@ fn main() { let world = lamellar::LamellarWorldBuilder::new().build(); let my_pe = world.my_pe(); let num_pes = world.num_pes(); - let block_array = AtomicArray::::new(world.team(), ARRAY_LEN, Distribution::Block); - let cyclic_array = AtomicArray::::new(world.team(), ARRAY_LEN, Distribution::Cyclic); + let block_array = + AtomicArray::::new(world.team(), ARRAY_LEN, Distribution::Block).block(); + let cyclic_array = + AtomicArray::::new(world.team(), ARRAY_LEN, Distribution::Cyclic).block(); //we are going to initialize the data on each PE by directly accessing its local data @@ -105,7 +107,7 @@ fn main() { println!("--------------------------------------------------------"); - // let block_array = UnsafeArray::::new(world.team(), ARRAY_LEN, Distribution::Block); + // let block_array = UnsafeArray::::new(world.team(), ARRAY_LEN, Distribution::Block).block(); // for elem in block_onesided_iter!($array,array).into_iter().step_by(4) {...} // for elem in block_array.buffered_onesided_iter(10) {...} @@ -122,7 +124,7 @@ fn main() { // fn main() { // let world = LamellarWorldBuilder::new().build(); - // let array = LocalLockArray::::new(&world, 8, Distribution::Block); + // let array = LocalLockArray::::new(&world, 8, Distribution::Block).block(); // let my_pe = world.my_pe(); // let num_pes = world.num_pes(); let block_array = block_array.into_local_lock(); diff --git a/examples/bandwidths/am_bw_get.rs b/examples/bandwidths/am_bw_get.rs index aa26857e..5b3e599c 100644 --- a/examples/bandwidths/am_bw_get.rs +++ b/examples/bandwidths/am_bw_get.rs @@ -25,7 +25,7 @@ impl LamellarAM for DataAM { async fn exec(&self) { unsafe { // let local = lamellar::team.local_array::(self.length, 255u8); - let local = lamellar::team.alloc_one_sided_mem_region::(self.length); + let local = lamellar::team.alloc_one_sided_mem_region::(self.length).expect("Enough memory should exist"); let local_slice = local.as_mut_slice().unwrap(); local_slice[self.length - 1] = 255u8; self.array.get_unchecked(self.index, local.clone()); @@ -42,8 +42,8 @@ fn main() { let world = lamellar::LamellarWorldBuilder::new().build(); let my_pe = world.my_pe(); let num_pes = world.num_pes(); - let array = world.alloc_one_sided_mem_region::(ARRAY_LEN); - let data = world.alloc_one_sided_mem_region::(ARRAY_LEN); + let array = world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); + let data = world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); unsafe { for i in data.as_mut_slice().unwrap() { *i = my_pe as u8; diff --git a/examples/bandwidths/am_group_bw_get.rs b/examples/bandwidths/am_group_bw_get.rs index 2bdc358e..3bc2e768 100644 --- a/examples/bandwidths/am_group_bw_get.rs +++ b/examples/bandwidths/am_group_bw_get.rs @@ -25,7 +25,7 @@ impl LamellarAM for DataAM { async fn exec(&self) { unsafe { // let local = lamellar::team.local_array::(self.length, 255u8); - let local = lamellar::team.alloc_one_sided_mem_region::(self.length); + let local = lamellar::team.alloc_one_sided_mem_region::(self.length).expect("Enough memory should exist"); let local_slice = local.as_mut_slice().unwrap(); local_slice[self.length - 1] = 255u8; self.array.get_unchecked(self.index, local.clone()); @@ -42,8 +42,8 @@ fn main() { let world = 
lamellar::LamellarWorldBuilder::new().build(); let my_pe = world.my_pe(); let num_pes = world.num_pes(); - let array = world.alloc_one_sided_mem_region::(ARRAY_LEN); - let data = world.alloc_one_sided_mem_region::(ARRAY_LEN); + let array = world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); + let data = world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); unsafe { for i in data.as_mut_slice().unwrap() { *i = my_pe as u8; diff --git a/examples/bandwidths/atomic_array_get_bw.rs b/examples/bandwidths/atomic_array_get_bw.rs index 3484f426..8b81978d 100644 --- a/examples/bandwidths/atomic_array_get_bw.rs +++ b/examples/bandwidths/atomic_array_get_bw.rs @@ -13,8 +13,8 @@ fn main() { let my_pe = world.my_pe(); let num_pes = world.num_pes(); let array: LocalLockArray = - LocalLockArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block); - let data = world.alloc_one_sided_mem_region::(ARRAY_LEN); + LocalLockArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block).block(); + let data = world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); unsafe { for i in data.as_mut_slice().unwrap() { *i = my_pe as u8; diff --git a/examples/bandwidths/atomic_array_put_bw.rs b/examples/bandwidths/atomic_array_put_bw.rs index f30a2f88..f43c539a 100644 --- a/examples/bandwidths/atomic_array_put_bw.rs +++ b/examples/bandwidths/atomic_array_put_bw.rs @@ -13,8 +13,8 @@ fn main() { let my_pe = world.my_pe(); let num_pes = world.num_pes(); let array: LocalLockArray = - LocalLockArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block); - let data = world.alloc_one_sided_mem_region::(ARRAY_LEN); + LocalLockArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block).block(); + let data = world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); unsafe { for i in data.as_mut_slice().unwrap() { *i = my_pe as u8; diff --git a/examples/bandwidths/get_bw.rs b/examples/bandwidths/get_bw.rs index 5b2c5185..50f08725 100644 --- a/examples/bandwidths/get_bw.rs +++ b/examples/bandwidths/get_bw.rs @@ -12,8 +12,11 @@ fn main() { let world = lamellar::LamellarWorldBuilder::new().build(); let my_pe = world.my_pe(); let num_pes = world.num_pes(); - let mem_reg = world.alloc_shared_mem_region::(MEMREG_LEN); - let data = world.alloc_one_sided_mem_region::(MEMREG_LEN); + let mem_reg = world + .alloc_shared_mem_region::(MEMREG_LEN) + .block() + .unwrap(); + let data = world.alloc_one_sided_mem_region::(MEMREG_LEN).expect("Enough memory should exist"); for j in 0..MEMREG_LEN as usize { unsafe { data.as_mut_slice().unwrap()[j] = my_pe as u8; diff --git a/examples/bandwidths/global_lock_atomic_array_get_bw.rs b/examples/bandwidths/global_lock_atomic_array_get_bw.rs index 029f9d97..c4825af5 100644 --- a/examples/bandwidths/global_lock_atomic_array_get_bw.rs +++ b/examples/bandwidths/global_lock_atomic_array_get_bw.rs @@ -14,8 +14,8 @@ fn main() { let num_pes = world.num_pes(); let array: GlobalLockArray = - GlobalLockArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block); - let data = world.alloc_one_sided_mem_region::(ARRAY_LEN); + GlobalLockArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block).block(); + let data = world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); unsafe { for i in data.as_mut_slice().unwrap() { *i = my_pe as u8; diff --git a/examples/bandwidths/global_lock_atomic_array_put_bw.rs b/examples/bandwidths/global_lock_atomic_array_put_bw.rs index 
248b57f7..d62cc46f 100644 --- a/examples/bandwidths/global_lock_atomic_array_put_bw.rs +++ b/examples/bandwidths/global_lock_atomic_array_put_bw.rs @@ -14,8 +14,8 @@ fn main() { let my_pe = world.my_pe(); let num_pes = world.num_pes(); let array: GlobalLockArray = - GlobalLockArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block); - let data = world.alloc_one_sided_mem_region::(ARRAY_LEN); + GlobalLockArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block).block(); + let data = world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); unsafe { for i in data.as_mut_slice().unwrap() { *i = my_pe as u8; diff --git a/examples/bandwidths/local_lock_atomic_array_get_bw.rs b/examples/bandwidths/local_lock_atomic_array_get_bw.rs index d836f6a2..0616cd25 100644 --- a/examples/bandwidths/local_lock_atomic_array_get_bw.rs +++ b/examples/bandwidths/local_lock_atomic_array_get_bw.rs @@ -14,8 +14,8 @@ fn main() { let num_pes = world.num_pes(); let array: LocalLockArray = - LocalLockArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block); - let data = world.alloc_one_sided_mem_region::(ARRAY_LEN); + LocalLockArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block).block(); + let data = world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); unsafe { for i in data.as_mut_slice().unwrap() { *i = my_pe as u8; diff --git a/examples/bandwidths/local_lock_atomic_array_put_bw.rs b/examples/bandwidths/local_lock_atomic_array_put_bw.rs index 1b857e1f..83463cc7 100644 --- a/examples/bandwidths/local_lock_atomic_array_put_bw.rs +++ b/examples/bandwidths/local_lock_atomic_array_put_bw.rs @@ -14,8 +14,8 @@ fn main() { let my_pe = world.my_pe(); let num_pes = world.num_pes(); let array: LocalLockArray = - LocalLockArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block); - let data = world.alloc_one_sided_mem_region::(ARRAY_LEN); + LocalLockArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block).block(); + let data = world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); unsafe { for i in data.as_mut_slice().unwrap() { *i = my_pe as u8; diff --git a/examples/bandwidths/put_bw.rs b/examples/bandwidths/put_bw.rs index 5cd4d1cd..4fc58b65 100644 --- a/examples/bandwidths/put_bw.rs +++ b/examples/bandwidths/put_bw.rs @@ -11,8 +11,13 @@ fn main() { let world = lamellar::LamellarWorldBuilder::new().build(); let my_pe = world.my_pe(); let num_pes = world.num_pes(); - let array = world.alloc_shared_mem_region::(ARRAY_LEN); - let data = world.alloc_one_sided_mem_region::(ARRAY_LEN); + let array = world + .alloc_shared_mem_region::(ARRAY_LEN) + .block() + .unwrap(); + let data = world + .alloc_one_sided_mem_region::(ARRAY_LEN) + .expect("Enough memory should exist"); unsafe { for i in data.as_mut_slice().unwrap() { *i = my_pe as u8; diff --git a/examples/bandwidths/readonly_array_get_bw.rs b/examples/bandwidths/readonly_array_get_bw.rs index f918a37d..fc200b05 100644 --- a/examples/bandwidths/readonly_array_get_bw.rs +++ b/examples/bandwidths/readonly_array_get_bw.rs @@ -13,8 +13,8 @@ fn main() { let world = lamellar::LamellarWorldBuilder::new().build(); let my_pe = world.my_pe(); let num_pes = world.num_pes(); - let array: UnsafeArray = UnsafeArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block); - let data = world.alloc_one_sided_mem_region::(ARRAY_LEN); + let array: UnsafeArray = UnsafeArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block).block(); + let data = 
world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); unsafe { for i in data.as_mut_slice().unwrap() { *i = my_pe as u8; diff --git a/examples/bandwidths/readonly_array_get_unchecked_bw.rs b/examples/bandwidths/readonly_array_get_unchecked_bw.rs index c63ad12c..f8a39053 100644 --- a/examples/bandwidths/readonly_array_get_unchecked_bw.rs +++ b/examples/bandwidths/readonly_array_get_unchecked_bw.rs @@ -12,8 +12,8 @@ fn main() { let world = lamellar::LamellarWorldBuilder::new().build(); let my_pe = world.my_pe(); let num_pes = world.num_pes(); - let array: UnsafeArray = UnsafeArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block); - let data = world.alloc_one_sided_mem_region::(ARRAY_LEN); + let array: UnsafeArray = UnsafeArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block).block(); + let data = world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); unsafe { for i in data.as_mut_slice().unwrap() { *i = my_pe as u8; diff --git a/examples/bandwidths/unsafe_array_get_bw.rs b/examples/bandwidths/unsafe_array_get_bw.rs index 5f379915..7b99bb43 100644 --- a/examples/bandwidths/unsafe_array_get_bw.rs +++ b/examples/bandwidths/unsafe_array_get_bw.rs @@ -13,8 +13,8 @@ fn main() { let world = lamellar::LamellarWorldBuilder::new().build(); let my_pe = world.my_pe(); let num_pes = world.num_pes(); - let array: UnsafeArray = UnsafeArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block); - let data = world.alloc_one_sided_mem_region::(ARRAY_LEN); + let array: UnsafeArray = UnsafeArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block).block(); + let data = world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); unsafe { for i in data.as_mut_slice().unwrap() { *i = my_pe as u8; diff --git a/examples/bandwidths/unsafe_array_get_unchecked_bw.rs b/examples/bandwidths/unsafe_array_get_unchecked_bw.rs index 624ee25f..448160d0 100644 --- a/examples/bandwidths/unsafe_array_get_unchecked_bw.rs +++ b/examples/bandwidths/unsafe_array_get_unchecked_bw.rs @@ -12,8 +12,8 @@ fn main() { let world = lamellar::LamellarWorldBuilder::new().build(); let my_pe = world.my_pe(); let num_pes = world.num_pes(); - let array: UnsafeArray = UnsafeArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block); - let data = world.alloc_one_sided_mem_region::(ARRAY_LEN); + let array: UnsafeArray = UnsafeArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block).block(); + let data = world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); unsafe { for i in data.as_mut_slice().unwrap() { *i = my_pe as u8; diff --git a/examples/bandwidths/unsafe_array_put_bw.rs b/examples/bandwidths/unsafe_array_put_bw.rs index 80354082..b4cc0212 100644 --- a/examples/bandwidths/unsafe_array_put_bw.rs +++ b/examples/bandwidths/unsafe_array_put_bw.rs @@ -12,8 +12,8 @@ fn main() { let world = lamellar::LamellarWorldBuilder::new().build(); let my_pe = world.my_pe(); let num_pes = world.num_pes(); - let array: UnsafeArray = UnsafeArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block); - let data = world.alloc_one_sided_mem_region::(ARRAY_LEN); + let array: UnsafeArray = UnsafeArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block).block(); + let data = world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); unsafe { for i in data.as_mut_slice().unwrap() { *i = my_pe as u8; diff --git a/examples/bandwidths/unsafe_array_put_unchecked_bw.rs 
b/examples/bandwidths/unsafe_array_put_unchecked_bw.rs index 77340672..c8d425f8 100644 --- a/examples/bandwidths/unsafe_array_put_unchecked_bw.rs +++ b/examples/bandwidths/unsafe_array_put_unchecked_bw.rs @@ -12,8 +12,8 @@ fn main() { let world = lamellar::LamellarWorldBuilder::new().build(); let my_pe = world.my_pe(); let num_pes = world.num_pes(); - let array: UnsafeArray = UnsafeArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block); - let data = world.alloc_one_sided_mem_region::(ARRAY_LEN); + let array: UnsafeArray = UnsafeArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block).block(); + let data = world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); unsafe { for i in data.as_mut_slice().unwrap() { *i = my_pe as u8; diff --git a/examples/bandwidths/unsafe_array_store_bw.rs b/examples/bandwidths/unsafe_array_store_bw.rs index 0a8c007f..c6466855 100644 --- a/examples/bandwidths/unsafe_array_store_bw.rs +++ b/examples/bandwidths/unsafe_array_store_bw.rs @@ -13,8 +13,8 @@ fn main() { let world = lamellar::LamellarWorldBuilder::new().build(); let my_pe = world.my_pe(); let num_pes = world.num_pes(); - let array: UnsafeArray = UnsafeArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block); - let data = world.alloc_one_sided_mem_region::(ARRAY_LEN); + let array: UnsafeArray = UnsafeArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block).block(); + let data = world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); unsafe { for i in data.as_mut_slice().unwrap() { *i = my_pe as u8; diff --git a/examples/darc_examples/darc.rs b/examples/darc_examples/darc.rs index dbfffdce..a93fe2df 100644 --- a/examples/darc_examples/darc.rs +++ b/examples/darc_examples/darc.rs @@ -55,7 +55,7 @@ fn main() { (num_pes as f64 / 2.0).ceil() as usize, //num pes in team )); - let global_darc = GlobalRwDarc::new(world.team(), 0).unwrap(); + let global_darc = GlobalRwDarc::new(world.team(), 0).block().unwrap(); let read_lock = global_darc.read().block(); println!("I have the read lock!!!! 
{:?}", my_pe); drop(read_lock); @@ -64,21 +64,21 @@ fn main() { std::thread::sleep(std::time::Duration::from_secs(1)); drop(write_lock); //---- - let local_darc = LocalRwDarc::new(world.team(), 10).unwrap(); + let local_darc = LocalRwDarc::new(world.team(), 10).block().unwrap(); println!("created new local rw"); // local_darc.print(); let wrapped = WrappedWrappedWrappedDarc { wrapped: WrappedWrappedDarc { wrapped: WrappedDarc { - wrapped: Darc::new(world.team(), 3).unwrap(), + wrapped: Darc::new(world.team(), 3).block().unwrap(), }, }, }; - let darc1 = Darc::new(world.team(), 10).unwrap(); - let darc2 = Darc::new(world.team(), 20).unwrap(); + let darc1 = Darc::new(world.team(), 10).block().unwrap(); + let darc2 = Darc::new(world.team(), 20).block().unwrap(); if let Some(team) = even_team { - let team_darc = Darc::new(team.clone(), AtomicUsize::new(10)); + let team_darc = Darc::new(team.clone(), AtomicUsize::new(10)).block(); let mut tg = typed_am_group!(DarcAm, team.clone()); println!("{:?} created team darc", std::thread::current().id()); if let Ok(team_darc) = team_darc { @@ -92,7 +92,7 @@ fn main() { wrapped: wrapped.clone(), wrapped_tuple: (wrapped.clone(), wrapped.clone()), darc_tuple: (darc1.clone(), darc2.clone()), - my_arc: Darc::new(team.clone(), Arc::new(0)).unwrap(), + my_arc: Darc::new(team.clone(), Arc::new(0)).block().unwrap(), }; let _ = team.exec_am_pe(0, darc_am.clone()).spawn(); let _ = team.exec_am_all(darc_am.clone()).spawn(); diff --git a/examples/darc_examples/stress_test.rs b/examples/darc_examples/stress_test.rs index 44c17bc5..a05b1b8b 100644 --- a/examples/darc_examples/stress_test.rs +++ b/examples/darc_examples/stress_test.rs @@ -50,7 +50,7 @@ fn main() { let mut rng = rand::thread_rng(); let pes = Uniform::from(0..num_pes); - let darc = Darc::new(&world, AtomicUsize::new(0)).unwrap(); + let darc = Darc::new(&world, AtomicUsize::new(0)).block().unwrap(); let width = 10; let s = Instant::now(); let mut tg = typed_am_group!(DataAM, &world); diff --git a/examples/darc_examples/string_darc.rs b/examples/darc_examples/string_darc.rs index 0092128f..ae6b656d 100644 --- a/examples/darc_examples/string_darc.rs +++ b/examples/darc_examples/string_darc.rs @@ -20,8 +20,9 @@ fn main() { let world = lamellar::LamellarWorldBuilder::new().build(); let my_pe = world.my_pe(); world.clone().block_on(async move { - let string_data = - LocalRwDarc::new(&world, format!("Orig String on PE: {}", my_pe)).unwrap(); + let string_data = LocalRwDarc::new(&world, format!("Orig String on PE: {}", my_pe)) + .await + .unwrap(); println!("[PE: {}] {}", my_pe, string_data.read().await); diff --git a/examples/hello_world/hello_world_array.rs b/examples/hello_world/hello_world_array.rs index 1d0620b6..25ad60af 100644 --- a/examples/hello_world/hello_world_array.rs +++ b/examples/hello_world/hello_world_array.rs @@ -20,7 +20,7 @@ fn main() { println!("local_vec_time: {:?}", local_vec_time); let timer = std::time::Instant::now(); - let array = AtomicArray::::new(world.team(), global_length, Distribution::Block); + let array = AtomicArray::::new(world.team(), global_length, Distribution::Block).block(); let array_time = timer.elapsed(); println!("array_time: {:?}", array_time); diff --git a/examples/hello_world/hello_world_array_iteration.rs b/examples/hello_world/hello_world_array_iteration.rs index 3857ecd2..9bb8bb3f 100644 --- a/examples/hello_world/hello_world_array_iteration.rs +++ b/examples/hello_world/hello_world_array_iteration.rs @@ -11,7 +11,7 @@ fn main() { let local_length = 10; //if you 
want to ensure each thread processes data make this >= LAMELLAR_THREADS environment variable let global_length = num_pes * local_length; - let array = AtomicArray::::new(world.team(), global_length, Distribution::Block); //Compare with Distribution::Cyclic + let array = AtomicArray::::new(world.team(), global_length, Distribution::Block).block(); //Compare with Distribution::Cyclic //examine array before initialization if my_pe == 0 { diff --git a/examples/kernels/am_gemm.rs b/examples/kernels/am_gemm.rs index f9150a35..61396fcd 100644 --- a/examples/kernels/am_gemm.rs +++ b/examples/kernels/am_gemm.rs @@ -97,8 +97,8 @@ struct NaiveMM { #[lamellar::am] impl LamellarAM for NaiveMM { async fn exec() { - let a = lamellar::world.alloc_one_sided_mem_region(self.a.block_size * self.a.block_size); //the tile for the A matrix - let b = lamellar::world.alloc_one_sided_mem_region(self.b.block_size * self.b.block_size); //the tile for the B matrix + let a = lamellar::world.alloc_one_sided_mem_region(self.a.block_size * self.a.block_size).expect("Enough memory should exist"); //the tile for the A matrix + let b = lamellar::world.alloc_one_sided_mem_region(self.b.block_size * self.b.block_size).expect("Enough memory should exist"); //the tile for the B matrix let b_fut = get_sub_mat(&self.b, &b); //b is remote so we will launch "gets" for this data first let a_fut = get_sub_mat(&self.a, &a); let a_b_fut = future::join(a_fut, b_fut); @@ -162,9 +162,18 @@ fn main() { let n = dim; // a cols b rows let p = dim; // b & c cols - let a = world.alloc_shared_mem_region::((m * n) / num_pes); - let b = world.alloc_shared_mem_region::((n * p) / num_pes); - let c = world.alloc_shared_mem_region::((m * p) / num_pes); + let a = world + .alloc_shared_mem_region::((m * n) / num_pes) + .block() + .unwrap(); + let b = world + .alloc_shared_mem_region::((n * p) / num_pes) + .block() + .unwrap(); + let c = world + .alloc_shared_mem_region::((m * p) / num_pes) + .block() + .unwrap(); unsafe { let mut cnt = (((m * n) / num_pes) * my_pe) as f32; for elem in a.as_mut_slice().unwrap() { diff --git a/examples/kernels/cached_am_gemm.rs b/examples/kernels/cached_am_gemm.rs index ce415a97..90473df9 100644 --- a/examples/kernels/cached_am_gemm.rs +++ b/examples/kernels/cached_am_gemm.rs @@ -104,7 +104,8 @@ struct MatMulAM { impl LamellarAM for MatMulAM { async fn exec() { let b = lamellar::world - .alloc_one_sided_mem_region::(self.b.block_size * self.b.block_size); + .alloc_one_sided_mem_region::(self.b.block_size * self.b.block_size) + .expect("enough memory exists"); get_sub_mat(&self.b, &b).await; // we dont actually want to alloc a shared memory region as there is an implicit barrier here // introduces sync point and potential for deadlock @@ -119,8 +120,9 @@ impl LamellarAM for MatMulAM { a.row_block = row; let mut c = self.c.clone(); c.row_block = row; - let sub_a = - lamellar::world.alloc_one_sided_mem_region::(a.block_size * a.block_size); + let sub_a = lamellar::world + .alloc_one_sided_mem_region::(a.block_size * a.block_size) + .expect("enough memory exists"); get_sub_mat(&a, &sub_a).await; //this should be local copy so returns immediately do_gemm(&sub_a, &b, c, self.block_size); } @@ -174,9 +176,18 @@ fn main() { let n = dim; // a cols b rows let p = dim; // b & c cols - let a = world.alloc_shared_mem_region::((m * n) / num_pes); - let b = world.alloc_shared_mem_region::((n * p) / num_pes); - let c = world.alloc_shared_mem_region::((m * p) / num_pes); + let a = world + .alloc_shared_mem_region::((m * n) / num_pes) + 
.block() + .expect("enough memory exists"); + let b = world + .alloc_shared_mem_region::((n * p) / num_pes) + .block() + .expect("enough memory exists"); + let c = world + .alloc_shared_mem_region::((m * p) / num_pes) + .block() + .expect("enough memory exists"); // let c2 = world.alloc_shared_mem_region::((m * p) / num_pes); unsafe { let mut cnt = my_pe as f32 * ((m * n) / num_pes) as f32; @@ -245,13 +256,17 @@ fn main() { j, block_size, ); - reqs.push(world.exec_am_local(MatMulAM { - a: a_block, - b: b_block, - c: c_block.clone(), - a_pe_rows: a_pe_rows, - block_size: block_size, - }).spawn()); + reqs.push( + world + .exec_am_local(MatMulAM { + a: a_block, + b: b_block, + c: c_block.clone(), + a_pe_rows: a_pe_rows, + block_size: block_size, + }) + .spawn(), + ); tasks += 1; } // for req in reqs { diff --git a/examples/kernels/dft_proxy.rs b/examples/kernels/dft_proxy.rs index 3980f7ad..a3b1fb25 100644 --- a/examples/kernels/dft_proxy.rs +++ b/examples/kernels/dft_proxy.rs @@ -143,7 +143,10 @@ fn dft_lamellar( spectrum: SharedMemoryRegion, ) -> f64 { let spectrum_slice = unsafe { spectrum.as_slice().unwrap() }; - let add_spec = world.alloc_shared_mem_region::(spectrum_slice.len()); + let add_spec = world + .alloc_shared_mem_region::(spectrum_slice.len()) + .block() + .unwrap(); let timer = Instant::now(); for pe in 0..num_pes { @@ -634,17 +637,26 @@ fn main() { let global_len = num_pes * array_len; println!("my_pe {:?} num_pes {:?}", my_pe, num_pes); - let partial_sum = world.alloc_shared_mem_region::(num_pes); - let partial_spectrum = world.alloc_shared_mem_region::(array_len); - let partial_signal = world.alloc_shared_mem_region::(array_len); - let full_signal = world.alloc_one_sided_mem_region::(global_len); - let full_spectrum = world.alloc_one_sided_mem_region::(global_len); - let magic = world.alloc_one_sided_mem_region::(num_pes); + let partial_sum = world + .alloc_shared_mem_region::(num_pes) + .block() + .expect("Enough memory should exist"); + let partial_spectrum = world + .alloc_shared_mem_region::(array_len) + .block() + .expect("Enough memory should exist"); + let partial_signal = world + .alloc_shared_mem_region::(array_len) + .block() + .expect("Enough memory should exist"); + let full_signal = world.alloc_one_sided_mem_region::(global_len).expect("Enough memory should exist"); + let full_spectrum = world.alloc_one_sided_mem_region::(global_len).expect("Enough memory should exist"); + let magic = world.alloc_one_sided_mem_region::(num_pes).expect("Enough memory should exist"); let full_spectrum_array = - UnsafeArray::::new(world.team(), global_len, Distribution::Block); + UnsafeArray::::new(world.team(), global_len, Distribution::Block).block(); let full_signal_array = - UnsafeArray::::new(world.team(), global_len, Distribution::Block); + UnsafeArray::::new(world.team(), global_len, Distribution::Block).block(); unsafe { for i in full_signal.as_mut_slice().unwrap() { diff --git a/examples/kernels/parallel_array_gemm.rs b/examples/kernels/parallel_array_gemm.rs index 2ec9b60c..871f6f7d 100644 --- a/examples/kernels/parallel_array_gemm.rs +++ b/examples/kernels/parallel_array_gemm.rs @@ -28,9 +28,9 @@ fn main() { println!("m: {}, n: {}, p: {}", m, n, p); - let a = LocalLockArray::::new(&world, m * n, Distribution::Block); //row major - let b = LocalLockArray::::new(&world, n * p, Distribution::Block); //col major - let c = AtomicArray::::new(&world, m * p, Distribution::Block); //row major + let a = LocalLockArray::::new(&world, m * n, Distribution::Block).block(); //row 
major + let b = LocalLockArray::::new(&world, n * p, Distribution::Block).block(); //col major + let c = AtomicArray::::new(&world, m * p, Distribution::Block).block(); //row major //initialize matrices a.dist_iter_mut() diff --git a/examples/kernels/parallel_blocked_array_gemm.rs b/examples/kernels/parallel_blocked_array_gemm.rs index 94f5f9dd..382eee93 100644 --- a/examples/kernels/parallel_blocked_array_gemm.rs +++ b/examples/kernels/parallel_blocked_array_gemm.rs @@ -36,9 +36,9 @@ fn main() { let n = dim; // a cols b rows let p = dim; // b & c cols - let a = LocalLockArray::::new(&world, m * n, Distribution::Block); //row major - let b = LocalLockArray::::new(&world, n * p, Distribution::Block); //col major - let c = AtomicArray::::new(&world, m * p, Distribution::Block); //row major + let a = LocalLockArray::::new(&world, m * n, Distribution::Block).block(); //row major + let b = LocalLockArray::::new(&world, n * p, Distribution::Block).block(); //col major + let c = AtomicArray::::new(&world, m * p, Distribution::Block).block(); //row major //initialize a.dist_iter_mut() .enumerate() @@ -75,7 +75,7 @@ fn main() { // we construct a global array where each pe will contain the sequence (0..n_blks) // we can then call dist_iter() on this array to iterate over the range in parallel on each PE let nblks_array = - LocalLockArray::::new(&world, (n_blks * n_blks) * num_pes, Distribution::Block); + LocalLockArray::::new(&world, (n_blks * n_blks) * num_pes, Distribution::Block).block(); nblks_array .dist_iter_mut() diff --git a/examples/kernels/safe_parallel_blocked_array_gemm.rs b/examples/kernels/safe_parallel_blocked_array_gemm.rs index e8847bc3..98fe0d0f 100644 --- a/examples/kernels/safe_parallel_blocked_array_gemm.rs +++ b/examples/kernels/safe_parallel_blocked_array_gemm.rs @@ -29,9 +29,9 @@ fn main() { let n = dim; // a cols b rows let p = dim; // b & c cols - let a = LocalLockArray::::new(&world, m * n, Distribution::Block); //row major -- we will change this into a readonly array after initialization - let b = LocalLockArray::::new(&world, n * p, Distribution::Block); //col major -- we will change this into a readonly array after initialization - let c = LocalLockArray::::new(&world, m * p, Distribution::Block); //row major + let a = LocalLockArray::::new(&world, m * n, Distribution::Block).block(); //row major -- we will change this into a readonly array after initialization + let b = LocalLockArray::::new(&world, n * p, Distribution::Block).block(); //col major -- we will change this into a readonly array after initialization + let c = LocalLockArray::::new(&world, m * p, Distribution::Block).block(); //row major //initialize let a_init = a .dist_iter_mut() @@ -68,7 +68,7 @@ fn main() { // this is a "hack" until we support something like (0..n_blks).dist_iter() // we construct a global array where each pe will contain the sequence (0..n_blks) // we can then call dist_iter() on this array to iterate over the range in parallel on each PE - let nblks_array = LocalLockArray::new(&world, n_blks * num_pes, Distribution::Block); + let nblks_array = LocalLockArray::new(&world, n_blks * num_pes, Distribution::Block).block(); nblks_array .dist_iter_mut() @@ -76,7 +76,7 @@ fn main() { .for_each(move |(i, x)| *x = i % n_blks) .block(); - let m_blks_pe_array = LocalLockArray::new(&world, m_blks_pe * num_pes, Distribution::Block); + let m_blks_pe_array = LocalLockArray::new(&world, m_blks_pe * num_pes, Distribution::Block).block(); m_blks_pe_array .dist_iter_mut() diff --git 
a/examples/kernels/serial_array_gemm.rs b/examples/kernels/serial_array_gemm.rs index fd669ef4..5016596b 100644 --- a/examples/kernels/serial_array_gemm.rs +++ b/examples/kernels/serial_array_gemm.rs @@ -23,9 +23,9 @@ fn main() { let n = dim; // a cols b rows let p = dim; // b & c cols - let a = LocalLockArray::::new(&world, m * n, Distribution::Block); //row major - let b = LocalLockArray::::new(&world, n * p, Distribution::Block); //col major - let c = AtomicArray::::new(&world, m * p, Distribution::Block); //row major + let a = LocalLockArray::::new(&world, m * n, Distribution::Block).block(); //row major + let b = LocalLockArray::::new(&world, n * p, Distribution::Block).block(); //col major + let c = AtomicArray::::new(&world, m * p, Distribution::Block).block(); //row major //initialize matrices a.dist_iter_mut() diff --git a/examples/misc/dist_hashmap.rs b/examples/misc/dist_hashmap.rs index f442a1f4..878b4a9f 100644 --- a/examples/misc/dist_hashmap.rs +++ b/examples/misc/dist_hashmap.rs @@ -19,7 +19,7 @@ impl DistHashMap { DistHashMap { num_pes, team: team.clone(), - data: LocalRwDarc::new(team, HashMap::new()).unwrap(), + data: LocalRwDarc::new(team, HashMap::new()).block().unwrap(), } } diff --git a/examples/misc/lamellar_env.rs b/examples/misc/lamellar_env.rs index 21d4974c..7136b171 100644 --- a/examples/misc/lamellar_env.rs +++ b/examples/misc/lamellar_env.rs @@ -14,10 +14,10 @@ fn print_env(env: &T) { fn main() { let world = LamellarWorldBuilder::new().build(); - let darc = Darc::new(&world, 0).unwrap(); - let lrw_darc = LocalRwDarc::new(&world, 0).unwrap(); - let grw_darc = GlobalRwDarc::new(&world, 0).unwrap(); - let array = UnsafeArray::::new(world.clone(), 10, Distribution::Block); + let darc = Darc::new(&world, 0).block().unwrap(); + let lrw_darc = LocalRwDarc::new(&world, 0).block().unwrap(); + let grw_darc = GlobalRwDarc::new(&world, 0).block().unwrap(); + let array = UnsafeArray::::new(world.clone(), 10, Distribution::Block).block(); let team = world .create_team_from_arch(StridedArch::new(0, 2, world.num_pes() / 2)) .unwrap(); diff --git a/examples/misc/ping_pong.rs b/examples/misc/ping_pong.rs index 74ae33f8..320e3b5c 100644 --- a/examples/misc/ping_pong.rs +++ b/examples/misc/ping_pong.rs @@ -326,16 +326,30 @@ fn main() { let buffer_size = 16384 * 2; - let indices = - world.alloc_shared_mem_region::(UPDATES_PER_CORE * world.num_threads_per_pe()); + let indices = world + .alloc_shared_mem_region::(UPDATES_PER_CORE * world.num_threads_per_pe()) + .block() + .unwrap(); - let index_send_buffers = world.alloc_shared_mem_region::(buffer_size * num_pes); + let index_send_buffers = world + .alloc_shared_mem_region::(buffer_size * num_pes) + .block() + .unwrap(); world.barrier(); - let index_recv_buffers = world.alloc_shared_mem_region::(buffer_size * num_pes); + let index_recv_buffers = world + .alloc_shared_mem_region::(buffer_size * num_pes) + .block() + .unwrap(); world.barrier(); - let result_send_buffers = world.alloc_shared_mem_region::(buffer_size * num_pes); + let result_send_buffers = world + .alloc_shared_mem_region::(buffer_size * num_pes) + .block() + .unwrap(); world.barrier(); - let result_recv_buffers = world.alloc_shared_mem_region::(buffer_size * num_pes); + let result_recv_buffers = world + .alloc_shared_mem_region::(buffer_size * num_pes) + .block() + .unwrap(); world.barrier(); let mut rng: StdRng = SeedableRng::seed_from_u64(my_pe as u64); let table_size_per_pe = 100000 * world.num_threads_per_pe(); diff --git a/examples/rdma_examples/rdma_am.rs 
b/examples/rdma_examples/rdma_am.rs index 5747822a..7463b2da 100644 --- a/examples/rdma_examples/rdma_am.rs +++ b/examples/rdma_examples/rdma_am.rs @@ -32,7 +32,9 @@ impl LamellarAM for RdmaAM { } //get the original nodes data - let local = lamellar::world.alloc_one_sided_mem_region::(ARRAY_LEN); + let local = lamellar::world + .alloc_one_sided_mem_region::(ARRAY_LEN) + .expect("Enough memory should exist"); let local_slice = unsafe { local.as_mut_slice().unwrap() }; local_slice[ARRAY_LEN - 1] = lamellar::num_pes as u8; unsafe { @@ -66,7 +68,7 @@ impl LamellarAM for RdmaLocalMRAM { ); //get the original nodes data - let local = lamellar::world.alloc_one_sided_mem_region::(ARRAY_LEN); + let local = lamellar::world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); let local_slice = unsafe { local.as_mut_slice().unwrap() }; local_slice[ARRAY_LEN - 1] = lamellar::num_pes as u8; unsafe { @@ -105,8 +107,11 @@ fn main() { let world = lamellar::LamellarWorldBuilder::new().build(); let my_pe = world.my_pe(); let num_pes = world.num_pes(); - let array = world.alloc_shared_mem_region::(ARRAY_LEN); - let local_array = world.alloc_one_sided_mem_region::(ARRAY_LEN); + let array = world + .alloc_shared_mem_region::(ARRAY_LEN) + .block() + .expect("Enough memory should exist"); + let local_array = world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); unsafe { for i in array.as_mut_slice().unwrap() { *i = 255_u8; diff --git a/examples/rdma_examples/rdma_get.rs b/examples/rdma_examples/rdma_get.rs index a91d2a26..26a27cea 100644 --- a/examples/rdma_examples/rdma_get.rs +++ b/examples/rdma_examples/rdma_get.rs @@ -19,12 +19,15 @@ fn main() { if num_pes > 1 { // instatiates a shared memory region on every PE in world // all other pes can put/get into this region - let array = world.alloc_shared_mem_region::(ARRAY_LEN); + let array = world + .alloc_shared_mem_region::(ARRAY_LEN) + .block() + .expect("Enough memory should exist"); let array_slice = unsafe { array.as_slice().unwrap() }; //we can unwrap because we know array is local // instatiates a local array whos memory is registered with // the underlying network device, so that it can be used // as the src buffer in a put or as the dst buffer in a get - let data = world.alloc_one_sided_mem_region::(ARRAY_LEN); + let data = world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); let data_slice = unsafe { data.as_mut_slice().unwrap() }; //we can unwrap because we know data is local for elem in data_slice.iter_mut() { *elem = my_pe as u8; diff --git a/examples/rdma_examples/rdma_put.rs b/examples/rdma_examples/rdma_put.rs index 45d41574..87e94ba8 100644 --- a/examples/rdma_examples/rdma_put.rs +++ b/examples/rdma_examples/rdma_put.rs @@ -19,13 +19,13 @@ fn main() { if num_pes > 1 { // instatiates a shared memory region on every PE in world // all other pes can put/get into this region - let array = world.alloc_shared_mem_region::(ARRAY_LEN); + let array = world.alloc_shared_mem_region::(ARRAY_LEN).block().expect("Enough memory should exist"); let array_slice = unsafe { array.as_slice().unwrap() }; //we can unwrap because we know array is local // instatiates a local array whos memory is registered with // the underlying network device, so that it can be used // as the src buffer in a put or as the dst buffer in a get - let data = world.alloc_one_sided_mem_region::(ARRAY_LEN); + let data = world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); let 
data_slice = unsafe { data.as_mut_slice().unwrap() }; //we can unwrap because we know data is local for elem in data_slice { *elem = my_pe as u8; diff --git a/impl/src/lib.rs b/impl/src/lib.rs index 4d37d669..1945cb9d 100644 --- a/impl/src/lib.rs +++ b/impl/src/lib.rs @@ -705,7 +705,7 @@ pub fn generate_ops_for_bool_rt(_item: TokenStream) -> TokenStream { /// let world = LamellarWorldBuilder::new().build(); // the world /// /// let array = // the atomic distributed array -/// AtomicArray::::new(&world,3,Distribution::Block); +/// AtomicArray::::new(&world,3,Distribution::Block).block(); /// /// println!(); /// println!("initialize a length-3 array:\n"); // print the entries diff --git a/src/active_messaging.rs b/src/active_messaging.rs index 23b13690..c8dab367 100644 --- a/src/active_messaging.rs +++ b/src/active_messaging.rs @@ -574,7 +574,7 @@ //! let world = lamellar::LamellarWorldBuilder::new().build(); //! let my_pe = world.my_pe(); //! let num_pes = world.num_pes(); -//! let darc = Darc::new(&world,AtomicUsize::new(0)).expect("PE in world team"); +//! let darc = Darc::new(&world,AtomicUsize::new(0)).block().expect("PE in world team"); //! //! if my_pe == 0 { // we only want to run this on PE0 for sake of illustration //! let mut am_group = typed_am_group!{ExampleAm,&world}; diff --git a/src/array.rs b/src/array.rs index e848379d..08cfda22 100644 --- a/src/array.rs +++ b/src/array.rs @@ -49,7 +49,7 @@ //! //! // define an length-10 array of type UnsafeArray //! let world = LamellarWorldBuilder::new().build(); -//! let array = UnsafeArray::::new(&world, 10,Distribution::Block); +//! let array = UnsafeArray::::new(&world, 10,Distribution::Block).block(); //! //! // convert between array types //! let array = array.into_local_lock(); // LocalLockArray @@ -230,14 +230,14 @@ impl ArrayOps for Option {} ///``` /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); -/// let block_array = AtomicArray::::new(world,12,Distribution::Block); +/// let block_array = AtomicArray::::new(world,12,Distribution::Block).block(); /// //block array index location = PE0 [0,1,2,3], PE1 [4,5,6,7], PE2 [8,9,10,11], PE3 [12,13,14,15] ///``` /// ## Cyclic ///``` /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); -/// let cyclic_array = AtomicArray::::new(world,12,Distribution::Cyclic); +/// let cyclic_array = AtomicArray::::new(world,12,Distribution::Cyclic).block(); /// //cyclic array index location = PE0 [0,4,8,12], PE1 [1,5,9,13], PE2 [2,6,10,14], PE3 [3,7,11,15] ///``` #[derive(serde::Serialize, serde::Deserialize, Clone, Copy, Debug, Eq, PartialEq)] @@ -302,7 +302,7 @@ impl LamellarRead for &[T] {} impl TeamFrom<&T> for LamellarArrayRdmaInput { /// Constructs a single element [OneSidedMemoryRegion] and copies `val` into it fn team_from(val: &T, team: &Pin>) -> Self { - let buf: OneSidedMemoryRegion = team.alloc_one_sided_mem_region(1); + let buf: OneSidedMemoryRegion = team.alloc_one_sided_mem_region_or_panic(1); unsafe { buf.as_mut_slice().expect("Data should exist on PE")[0] = val.clone(); } @@ -313,7 +313,7 @@ impl TeamFrom<&T> for LamellarArrayRdmaInput { impl TeamFrom for LamellarArrayRdmaInput { /// Constructs a single element [OneSidedMemoryRegion] and copies `val` into it fn team_from(val: T, team: &Pin>) -> Self { - let buf: OneSidedMemoryRegion = team.alloc_one_sided_mem_region(1); + let buf: OneSidedMemoryRegion = team.alloc_one_sided_mem_region_or_panic(1); unsafe { buf.as_mut_slice().expect("Data should exist on PE")[0] = 
val; } @@ -324,7 +324,7 @@ impl TeamFrom for LamellarArrayRdmaInput { impl TeamFrom> for LamellarArrayRdmaInput { /// Constructs a [OneSidedMemoryRegion] equal in length to `vals` and copies `vals` into it fn team_from(vals: Vec, team: &Pin>) -> Self { - let buf: OneSidedMemoryRegion = team.alloc_one_sided_mem_region(vals.len()); + let buf: OneSidedMemoryRegion = team.alloc_one_sided_mem_region_or_panic(vals.len()); unsafe { std::ptr::copy_nonoverlapping( vals.as_ptr(), @@ -338,7 +338,7 @@ impl TeamFrom> for LamellarArrayRdmaInput { impl TeamFrom<&Vec> for LamellarArrayRdmaInput { /// Constructs a [OneSidedMemoryRegion] equal in length to `vals` and copies `vals` into it fn team_from(vals: &Vec, team: &Pin>) -> Self { - let buf: OneSidedMemoryRegion = team.alloc_one_sided_mem_region(vals.len()); + let buf: OneSidedMemoryRegion = team.alloc_one_sided_mem_region_or_panic(vals.len()); unsafe { std::ptr::copy_nonoverlapping( vals.as_ptr(), @@ -352,7 +352,7 @@ impl TeamFrom<&Vec> for LamellarArrayRdmaInput { impl TeamFrom<&[T]> for LamellarArrayRdmaInput { /// Constructs a [OneSidedMemoryRegion] equal in length to `vals` and copies `vals` into it fn team_from(vals: &[T], team: &Pin>) -> Self { - let buf: OneSidedMemoryRegion = team.alloc_one_sided_mem_region(vals.len()); + let buf: OneSidedMemoryRegion = team.alloc_one_sided_mem_region_or_panic(vals.len()); unsafe { std::ptr::copy_nonoverlapping( vals.as_ptr(), @@ -1045,7 +1045,7 @@ pub trait LamellarArray: private::LamellarArrayPrivate + ActiveMessa ///``` /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); - /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic).block(); /// /// let a_team = array.team(); ///``` @@ -1061,7 +1061,7 @@ pub trait LamellarArray: private::LamellarArrayPrivate + ActiveMessa // ///``` // /// use lamellar::array::prelude::*; // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + // /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic).block(); // /// // /// assert_eq!(world.my_pe(),array.my_pe()); // ///``` @@ -1077,7 +1077,7 @@ pub trait LamellarArray: private::LamellarArrayPrivate + ActiveMessa // ///``` // /// use lamellar::array::prelude::*; // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + // /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic).block(); // /// // /// assert_eq!(world.num_pes(),array.num_pes()); // ///``` @@ -1093,7 +1093,7 @@ pub trait LamellarArray: private::LamellarArrayPrivate + ActiveMessa ///``` /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); - /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic); + /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic).block(); /// /// assert_eq!(100,array.len()); ///``` @@ -1110,7 +1110,7 @@ pub trait LamellarArray: private::LamellarArrayPrivate + ActiveMessa ///```no_run //assert is for 4 PEs /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic).block(); 
/// /// assert_eq!(25,array.num_elems_local()); ///``` @@ -1124,9 +1124,9 @@ pub trait LamellarArray: private::LamellarArrayPrivate + ActiveMessa ///``` /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); - /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic); + /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic).block(); /// // do something interesting... or not - /// let block_view = array.clone().use_distribution(Distribution::Block); + /// let block_view = array.clone().use_distribution(Distribution::Block).block(); ///``` // fn use_distribution(self, distribution: Distribution) -> Self; @@ -1140,7 +1140,7 @@ pub trait LamellarArray: private::LamellarArrayPrivate + ActiveMessa // ///``` // /// use lamellar::array::prelude::*; // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic); + // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic).block(); // /// // /// array.barrier(); // ///``` @@ -1158,7 +1158,7 @@ pub trait LamellarArray: private::LamellarArrayPrivate + ActiveMessa // ///``` // /// use lamellar::array::prelude::*; // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic); + // /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).block(); // /// // /// for i in 0..100{ // /// array.add(i,1); @@ -1181,7 +1181,7 @@ pub trait LamellarArray: private::LamellarArrayPrivate + ActiveMessa // ///``` // /// use lamellar::array::prelude::*; // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic); + // /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).block(); // /// // /// let request = array.fetch_add(10,1000); //fetch index 10 and add 1000 to it // /// let result = array.block_on(request); //block until am has executed @@ -1203,7 +1203,7 @@ pub trait LamellarArray: private::LamellarArrayPrivate + ActiveMessa /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// - /// let block_array: UnsafeArray = UnsafeArray::new(&world,16,Distribution::Block); + /// let block_array: UnsafeArray = UnsafeArray::new(&world,16,Distribution::Block).block(); /// // block array index location = PE0 [0,1,2,3], PE1 [4,5,6,7], PE2 [8,9,10,11], PE3 [12,13,14,15] /// let Some((pe,offset)) = block_array.pe_and_offset_for_global_index(6) else { panic!("out of bounds");}; /// assert_eq!((pe,offset) ,(1,2)); @@ -1213,7 +1213,7 @@ pub trait LamellarArray: private::LamellarArrayPrivate + ActiveMessa /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// - /// let cyclic_array: UnsafeArray = UnsafeArray::new(world,16,Distribution::Cyclic); + /// let cyclic_array: UnsafeArray = UnsafeArray::new(world,16,Distribution::Cyclic).block(); /// // cyclic array index location = PE0 [0,4,8,12], PE1 [1,5,9,13], PE2 [2,6,10,14], PE3 [3,7,11,15] /// let Some((pe,offset)) = cyclic_array.pe_and_offset_for_global_index(6) else { panic!("out of bounds");}; /// assert_eq!((pe,offset) ,(2,1)); @@ -1234,7 +1234,7 @@ pub trait LamellarArray: private::LamellarArrayPrivate + ActiveMessa /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// - /// let block_array: UnsafeArray = 
UnsafeArray::new(&world,16,Distribution::Block); + /// let block_array: UnsafeArray = UnsafeArray::new(&world,16,Distribution::Block).block(); /// // block array index location = PE0 [0,1,2,3], PE1 [4,5,6,7], PE2 [8,9,10,11], PE3 [12,13,14,15] /// let index = block_array.first_global_index_for_pe(0).unwrap(); /// assert_eq!(index , 0); @@ -1250,7 +1250,7 @@ pub trait LamellarArray: private::LamellarArrayPrivate + ActiveMessa /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// - /// let cyclic_array: UnsafeArray = UnsafeArray::new(world,16,Distribution::Cyclic); + /// let cyclic_array: UnsafeArray = UnsafeArray::new(world,16,Distribution::Cyclic).block(); /// // cyclic array index location = PE0 [0,4,8,12], PE1 [1,5,9,13], PE2 [2,6,10,14], PE3 [3,7,11,15] /// let Some((pe,offset)) = cyclic_array.pe_and_offset_for_global_index(6) else { panic!("out of bounds");}; /// let index = cyclic_array.first_global_index_for_pe(0).unwrap(); @@ -1278,7 +1278,7 @@ pub trait LamellarArray: private::LamellarArrayPrivate + ActiveMessa /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// - /// let block_array: UnsafeArray = UnsafeArray::new(&world,16,Distribution::Block); + /// let block_array: UnsafeArray = UnsafeArray::new(&world,16,Distribution::Block).block(); /// // block array index location = PE0 [0,1,2,3], PE1 [4,5,6,7], PE2 [8,9,10,11], PE3 [12,13,14,15] /// let index = block_array.last_global_index_for_pe(0).unwrap(); /// assert_eq!(index , 3); @@ -1294,7 +1294,7 @@ pub trait LamellarArray: private::LamellarArrayPrivate + ActiveMessa /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// - /// let cyclic_array: UnsafeArray = UnsafeArray::new(world,16,Distribution::Cyclic); + /// let cyclic_array: UnsafeArray = UnsafeArray::new(world,16,Distribution::Cyclic).block(); /// // cyclic array index location = PE0 [0,4,8,12], PE1 [1,5,9,13], PE2 [2,6,10,14], PE3 [3,7,11,15] /// let Some((pe,offset)) = cyclic_array.pe_and_offset_for_global_index(6) else { panic!("out of bounds");}; /// let index = cyclic_array.last_global_index_for_pe(0).unwrap(); @@ -1359,7 +1359,7 @@ pub trait SubArray: LamellarArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).block(); /// /// let sub_array = array.sub_array(25..75); ///``` @@ -1379,7 +1379,7 @@ pub trait SubArray: LamellarArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).block(); /// /// let sub_array = array.sub_array(25..75); /// assert_eq!(25,sub_array.global_index(0)); @@ -1422,7 +1422,7 @@ pub trait LamellarArrayGet: LamellarArrayInternalGet { /// /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array = LocalLockArray::::new(&world,12,Distribution::Block); + /// let array = LocalLockArray::::new(&world,12,Distribution::Block).block(); /// let buf = world.alloc_one_sided_mem_region::(12); /// let _ = array.dist_iter_mut().enumerate().for_each(|(i,elem)| *elem = i).spawn(); //we will used this val as completion detection /// 
unsafe { // we just created buf and have not shared it so free to mutate safely @@ -1485,7 +1485,7 @@ pub trait LamellarArrayGet: LamellarArrayInternalGet { /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// let num_pes = world.num_pes(); - /// let array = LocalLockArray::::new(&world,12,Distribution::Block); + /// let array = LocalLockArray::::new(&world,12,Distribution::Block).block(); /// let _ = array.dist_iter_mut().enumerate().for_each(move |(i,elem)| *elem = my_pe).block(); //we will used this val as completion detection /// array.barrier(); /// println!("PE{my_pe} array data: {:?}",array.read_local_data().block()); @@ -1561,7 +1561,7 @@ pub trait LamellarArrayPut: LamellarArrayInternalPut { /// /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array = LocalLockArray::::new(&world,12,Distribution::Block); + /// let array = LocalLockArray::::new(&world,12,Distribution::Block).block(); /// let buf = world.alloc_one_sided_mem_region::(12); /// let len = buf.len(); /// let _ = array.dist_iter_mut().for_each(move |elem| *elem = len).spawn(); //we will used this val as completion detection @@ -1631,8 +1631,8 @@ pub trait ArrayPrint: LamellarArray { ///``` /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); - /// let block_array = AtomicArray::::new(&world,100,Distribution::Block); - /// let cyclic_array = AtomicArray::::new(&world,100,Distribution::Block); + /// let block_array = AtomicArray::::new(&world,100,Distribution::Block).block(); + /// let cyclic_array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let _ = block_array.dist_iter_mut().enumerate().for_each(move |(i,elem)| { /// elem.store(i); @@ -1689,7 +1689,7 @@ pub trait ArrayPrint: LamellarArray { ///``` /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); -/// let array = AtomicArray::::new(&world,1000000,Distribution::Block); +/// let array = AtomicArray::::new(&world,1000000,Distribution::Block).block(); /// use rand::Rng; /// /// let array_clone = array.clone(); @@ -1705,7 +1705,7 @@ pub trait ArrayPrint: LamellarArray { /// use lamellar::array::prelude::*; /// use rand::Rng; /// let world = LamellarWorldBuilder::new().build(); -/// let array = AtomicArray::::new(&world,1000000,Distribution::Block); +/// let array = AtomicArray::::new(&world,1000000,Distribution::Block).block(); /// let array_clone = array.clone(); /// let req = array.local_iter().for_each(move |_| { /// let index = rand::thread_rng().gen_range(0..array_clone.len()); @@ -1722,7 +1722,7 @@ pub trait ArrayPrint: LamellarArray { /// use rand::Rng; /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); -/// let array = AtomicArray::::new(&world,1000000,Distribution::Block); +/// let array = AtomicArray::::new(&world,1000000,Distribution::Block).block(); /// let array_clone = array.clone(); /// let req = array.local_iter().for_each(move |_| { /// let index = rand::thread_rng().gen_range(0..array_clone.len()); @@ -1740,7 +1740,7 @@ pub trait ArrayPrint: LamellarArray { /// use rand::Rng; /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); -/// let array = AtomicArray::::new(&world,1000000,Distribution::Block); +/// let array = AtomicArray::::new(&world,1000000,Distribution::Block).block(); /// let array_clone = array.clone(); /// let _ = array.local_iter().for_each(move |_| { /// let index = 
rand::thread_rng().gen_range(0..array_clone.len()); @@ -1804,7 +1804,7 @@ where /// /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); - /// let array = AtomicArray::::new(&world,1000000,Distribution::Block); + /// let array = AtomicArray::::new(&world,1000000,Distribution::Block).block(); /// let array_clone = array.clone(); /// let _ = array.local_iter().for_each(move |_| { /// let index = rand::thread_rng().gen_range(0..array_clone.len()); @@ -1837,7 +1837,7 @@ where // /// use rand::Rng; // /// let world = LamellarWorldBuilder::new().build(); // /// let num_pes = world.num_pes(); -// /// let array = AtomicArray::::new(&world,1000000,Distribution::Block); +// /// let array = AtomicArray::::new(&world,1000000,Distribution::Block).block(); // /// let array_clone = array.clone(); // /// let req = array.local_iter().for_each(move |_| { // /// let index = rand::thread_rng().gen_range(0..array_clone.len()); @@ -1863,7 +1863,7 @@ where // /// use lamellar::array::prelude::*; // /// let world = LamellarWorldBuilder::new().build(); // /// let num_pes = world.num_pes(); -// /// let array = AtomicArray::::new(&world,10,Distribution::Block); +// /// let array = AtomicArray::::new(&world,10,Distribution::Block).block(); // /// let req = array.dist_iter().enumerate().for_each(move |(i,elem)| { // /// elem.store(i+1); // /// }); @@ -1894,7 +1894,7 @@ where // /// use lamellar::array::prelude::*; // /// let world = LamellarWorldBuilder::new().build(); // /// let num_pes = world.num_pes(); -// /// let array = AtomicArray::::new(&world,10,Distribution::Block); +// /// let array = AtomicArray::::new(&world,10,Distribution::Block).block(); // /// let req = array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2)); // /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE // /// let max = array.block_on(array.max()); @@ -1916,7 +1916,7 @@ where // /// use lamellar::array::prelude::*; // /// let world = LamellarWorldBuilder::new().build(); // /// let num_pes = world.num_pes(); -// /// let array = AtomicArray::::new(&world,10,Distribution::Block); +// /// let array = AtomicArray::::new(&world,10,Distribution::Block).block(); // /// let req = array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2)); // /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE // /// let min = array.block_on(array.min()); @@ -1955,7 +1955,7 @@ where /// ); /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); -/// let array = AtomicArray::::new(&world,1000000,Distribution::Block); +/// let array = AtomicArray::::new(&world,1000000,Distribution::Block).block(); /// let array_clone = array.clone(); /// let _ = array.local_iter().for_each(move |_| { /// let index = rand::thread_rng().gen_range(0..array_clone.len()); diff --git a/src/array/atomic.rs b/src/array/atomic.rs index a3169e17..2e6ae755 100644 --- a/src/array/atomic.rs +++ b/src/array/atomic.rs @@ -2,6 +2,9 @@ mod iteration; pub(crate) mod operations; pub(crate) mod rdma; +pub(crate) mod handle; +pub use handle::AtomicArrayHandle; + use crate::active_messaging::ActiveMessaging; use crate::array::generic_atomic::{GenericAtomicElement, LocalGenericAtomicElement}; use crate::array::iterator::distributed_iterator::DistIteratorLauncher; @@ -61,12 +64,12 @@ impl AtomicElement { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let 
my_pe = world.my_pe(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).block(); /// /// let local_data = array.local_data(); /// println!("PE{my_pe} elem: {:?}",local_data.at(10).load()); /// - /// # let array2: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); // test genericatomic + /// # let array2: AtomicArray = AtomicArray::new(&world,100,Distribution::Block).block(); // test genericatomic /// # let local_data = array2.local_data(); /// # println!("PE{my_pe} elem: {:?}",local_data.at(10).load()); ///``` @@ -87,12 +90,12 @@ impl AtomicElement { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).block(); /// /// let local_data = array.local_data(); /// local_data.at(10).store(19); /// - /// # let array2: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); // test genericatomic + /// # let array2: AtomicArray = AtomicArray::new(&world,100,Distribution::Block).block(); // test genericatomic /// # let local_data = array2.local_data(); /// # local_data.at(10).store(19.0); ///``` @@ -113,12 +116,12 @@ impl AtomicElement { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).block(); /// /// let local_data = array.local_data(); /// let old_val = local_data.at(10).swap(19); /// - /// # let array2: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); // test genericatomic + /// # let array2: AtomicArray = AtomicArray::new(&world,100,Distribution::Block).block(); // test genericatomic /// # let local_data = array2.local_data(); /// # let old_val = local_data.at(10).swap(19.0); ///``` @@ -141,12 +144,12 @@ impl AtomicElement { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).block(); /// /// let local_data = array.local_data(); /// let old_val = local_data.at(10).fetch_add(19); /// - /// # let array2: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); // test genericatomic + /// # let array2: AtomicArray = AtomicArray::new(&world,100,Distribution::Block).block(); // test genericatomic /// # let local_data = array2.local_data(); /// # let old_val = local_data.at(10).fetch_add(19.0); ///``` @@ -166,12 +169,12 @@ impl AtomicElement { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).block(); /// /// let local_data = array.local_data(); /// let old_val = local_data.at(10).fetch_sub(19); /// - /// # let array2: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); // test genericatomic + /// # let array2: AtomicArray = AtomicArray::new(&world,100,Distribution::Block).block(); // test genericatomic /// # let 
local_data = array2.local_data(); /// # let old_val = local_data.at(10).fetch_sub(19.0); ///``` @@ -192,12 +195,12 @@ impl AtomicElement { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).block(); /// /// let local_data = array.local_data(); /// let old_val = local_data.at(10).fetch_mul(19); /// - /// # let array2: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); // test genericatomic + /// # let array2: AtomicArray = AtomicArray::new(&world,100,Distribution::Block).block(); // test genericatomic /// # let local_data = array2.local_data(); /// # let old_val = local_data.at(10).fetch_mul(19.0); ///``` @@ -218,12 +221,12 @@ impl AtomicElement { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).block(); /// /// let local_data = array.local_data(); /// let old_val = local_data.at(10).fetch_div(19); /// - /// # let array2: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); // test genericatomic + /// # let array2: AtomicArray = AtomicArray::new(&world,100,Distribution::Block).block(); // test genericatomic /// # let local_data = array2.local_data(); /// # let old_val = local_data.at(10).fetch_div(19.0); ///``` @@ -249,7 +252,7 @@ impl AtomicElement { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).block(); /// /// let local_data = array.local_data(); /// let result = local_data.at(10).compare_exchange(19,10); @@ -280,12 +283,12 @@ impl = AtomicArray::new(&world,100,Distribution::Cyclic); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).block(); /// /// let local_data = array.local_data(); /// let result = local_data.at(10).compare_exchange_epsilon(19,10,1); /// - /// # let array2: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); // test genericatomic + /// # let array2: AtomicArray = AtomicArray::new(&world,100,Distribution::Block).block(); // test genericatomic /// # let local_data = array2.local_data(); /// # let result = local_data.at(10).compare_exchange_epsilon(19.0,10.0,0.1); ///``` @@ -314,7 +317,7 @@ impl AtomicElement { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).block(); /// /// let local_data = array.local_data(); /// let old_val = local_data.at(10).fetch_and(0b0011); @@ -335,7 +338,7 @@ impl AtomicElement { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).block(); /// /// let local_data = array.local_data(); /// let old_val = 
local_data.at(10).fetch_or(0b0011); @@ -359,7 +362,7 @@ impl AtomicElement { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: AtomicArray = AtomicArray::new(&world,16,Distribution::Cyclic); + /// let array: AtomicArray = AtomicArray::new(&world,16,Distribution::Cyclic).block(); /// /// let local_data = array.local_data(); /// let old_val = local_data.at(10).fetch_shl(2); @@ -380,7 +383,7 @@ impl AtomicElement { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: AtomicArray = AtomicArray::new(&world,16,Distribution::Cyclic); + /// let array: AtomicArray = AtomicArray::new(&world,16,Distribution::Cyclic).block(); /// /// let local_data = array.local_data(); /// let old_val = local_data.at(10).fetch_shr(2); @@ -752,7 +755,7 @@ impl AtomicLocalData { /// /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).block(); /// /// let local_data = array.local_data(); /// @@ -776,7 +779,7 @@ impl AtomicLocalData { /// /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).block(); /// /// let local_data = array.local_data(); /// @@ -795,7 +798,7 @@ impl AtomicLocalData { /// /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).block(); /// /// let local_data = array.local_data(); /// @@ -814,7 +817,7 @@ impl AtomicLocalData { /// /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).block(); /// /// let local_data = array.local_data(); /// @@ -867,12 +870,12 @@ impl AtomicArray { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).block(); pub fn new>( team: U, array_size: usize, distribution: Distribution, - ) -> AtomicArray { + ) -> AtomicArrayHandle { // println!("new atomic array"); if NATIVE_ATOMICS.contains(&TypeId::of::()) { NativeAtomicArray::new_internal(team, array_size, distribution).into() @@ -901,9 +904,9 @@ impl AtomicArray { ///``` /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).block(); /// // do something interesting... 
or not - /// let block_view = array.clone().use_distribution(Distribution::Block); + /// let block_view = array.clone().use_distribution(Distribution::Block).block(); ///``` pub fn use_distribution(self, distribution: Distribution) -> Self { match self { @@ -925,7 +928,7 @@ impl AtomicArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).block(); /// /// let local_data = array.local_data(); /// println!("PE{my_pe} local_data[0]: {:?}",local_data.at(0).load()); @@ -949,7 +952,7 @@ impl AtomicArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).block(); /// /// let local_data = array.local_data(); /// println!("PE{my_pe} local_data[0]: {:?}",local_data.at(0).load()); @@ -994,7 +997,7 @@ impl AtomicArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).block(); /// /// let unsafe_array = array.into_unsafe(); ///``` @@ -1005,7 +1008,7 @@ impl AtomicArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).block(); /// /// let array1 = array.clone(); /// let slice = array1.local_data(); @@ -1049,7 +1052,7 @@ impl AtomicArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).block(); /// /// let read_only_array = array.into_read_only(); ///``` @@ -1059,7 +1062,7 @@ impl AtomicArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).block(); /// /// let array1 = array.clone(); /// let slice = unsafe {array1.local_data()}; @@ -1096,7 +1099,7 @@ impl AtomicArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).block(); /// /// let local_lock_array = array.into_local_lock(); ///``` @@ -1106,7 +1109,7 @@ impl AtomicArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).block(); /// /// let array1 = array.clone(); /// let slice = unsafe 
{array1.local_data()}; @@ -1143,7 +1146,7 @@ impl AtomicArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).block(); /// /// let global_lock_array = array.into_global_lock(); ///``` @@ -1153,7 +1156,7 @@ impl AtomicArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).block(); /// /// let array1 = array.clone(); /// let slice = unsafe {array1.local_data()}; @@ -1310,7 +1313,7 @@ impl AtomicArray { /// /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); - /// let array = AtomicArray::::new(&world,1000000,Distribution::Block); + /// let array = AtomicArray::::new(&world,1000000,Distribution::Block).block(); /// let array_clone = array.clone(); /// let _ = array.local_iter().for_each(move |_| { /// let index = rand::thread_rng().gen_range(0..array_clone.len()); @@ -1359,7 +1362,7 @@ impl AtomicArray { /// use rand::Rng; /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); - /// let array = AtomicArray::::new(&world,1000000,Distribution::Block); + /// let array = AtomicArray::::new(&world,1000000,Distribution::Block).block(); /// let array_clone = array.clone(); /// let _ = array.local_iter().for_each(move |_| { /// let index = rand::thread_rng().gen_range(0..array_clone.len()); @@ -1405,7 +1408,7 @@ impl AtomicArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); - /// let array = AtomicArray::::new(&world,10,Distribution::Block); + /// let array = AtomicArray::::new(&world,10,Distribution::Block).block(); /// let req = array.dist_iter().enumerate().for_each(move |(i,elem)| { /// elem.store(i+1); /// }).spawn(); @@ -1450,7 +1453,7 @@ impl AtomicArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); - /// let array = AtomicArray::::new(&world,10,Distribution::Block); + /// let array = AtomicArray::::new(&world,10,Distribution::Block).block(); /// let req = array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2)).block(); /// let max = array.block_on(array.max()).expect("array has length > 0"); /// assert_eq!((array.len()-1)*2,max); @@ -1492,7 +1495,7 @@ impl AtomicArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); - /// let array = AtomicArray::::new(&world,10,Distribution::Block); + /// let array = AtomicArray::::new(&world,10,Distribution::Block).block(); /// let _ = array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2)).block();; /// let min = array.block_on(array.min()).expect("array has length > 0"); /// assert_eq!(0,min); @@ -1520,8 +1523,8 @@ impl AtomicArray { ///``` /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); - /// let block_array = AtomicArray::::new(&world,100,Distribution::Block); - /// let cyclic_array = AtomicArray::::new(&world,100,Distribution::Block); + /// let block_array = 
AtomicArray::::new(&world,100,Distribution::Block).block(); + /// let cyclic_array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// block_array.print(); /// println!(); diff --git a/src/array/atomic/handle.rs b/src/array/atomic/handle.rs new file mode 100644 index 00000000..d6526569 --- /dev/null +++ b/src/array/atomic/handle.rs @@ -0,0 +1,148 @@ +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; + +use super::{ + generic_atomic::GenericAtomicArrayHandle, + native_atomic::{NativeAtomicArray, NativeAtomicArrayHandle, NativeAtomicType}, +}; +use super::{ArrayOps, AtomicArray}; + +use crate::scheduler::LamellarTask; +use crate::warnings::RuntimeWarning; +use crate::{Dist, LamellarTeamRT}; + +use futures_util::{ready, Future}; +use pin_project::{pin_project, pinned_drop}; + +#[must_use = " AtomicArray 'new' handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] +#[pin_project(PinnedDrop)] +#[doc(alias = "Collective")] +/// This is a handle representing the operation of creating a new [AtomicArray]. +/// This handled must either be awaited in an async context or blocked on in a non-async context for the operation to be performed. +/// Awaiting/blocking on the handle is a blocking collective call amongst all PEs in the AtomicArray's team, only returning once every PE in the team has completed the call. +/// +/// # Collective Operation +/// Requires all PEs associated with the `AtomicArray` to await/block the handle otherwise deadlock will occur (i.e. team barriers are being called internally) +/// +/// # Examples +/// ``` +/// use lamellar::array::prelude::*; +/// +/// let world = LamellarWorldBuilder::new().build(); +/// +/// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).block(); +/// ``` +pub struct AtomicArrayHandle { + pub(crate) inner: InnerAtomicArrayHandle, + pub(crate) team: Pin>, + pub(crate) launched: bool, +} + +pub(crate) enum InnerAtomicArrayHandle { + Generic(GenericAtomicArrayHandle), + Native(NativeAtomicArrayHandle), +} +impl InnerAtomicArrayHandle { + fn set_launched(&mut self, val: bool) { + match self { + InnerAtomicArrayHandle::Generic(handle) => handle.launched = val, + InnerAtomicArrayHandle::Native(handle) => handle.launched = val, + } + } +} + +#[pinned_drop] +impl PinnedDrop for AtomicArrayHandle { + fn drop(self: Pin<&mut Self>) { + if !self.launched { + RuntimeWarning::DroppedHandle("a AtomicArrayHandle").print(); + } + } +} + +impl AtomicArrayHandle { + /// Used to drive creation of a new AtomicArray + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).block(); + pub fn block(mut self) -> AtomicArray { + self.launched = true; + self.inner.set_launched(true); + RuntimeWarning::BlockingCall( + "AtomicArrayHandle::block", + ".spawn() or.await", + ) + .print(); + self.team.clone().block_on(self) + } + + /// This method will spawn the creation of the AtomicArray on the work queue + /// + /// This function returns a handle that can be used to wait for the operation to complete + /// /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array_task: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).spawn(); + /// // do some other work + /// let array = array_task.block(); + #[must_use = "this function returns a future 
[LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] + pub fn spawn(mut self) -> LamellarTask> { + self.launched = true; + self.inner.set_launched(true); + self.team.clone().spawn(self) + } +} + +impl Future for AtomicArrayHandle { + type Output = AtomicArray; + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + self.launched = true; + self.inner.set_launched(true); + let this = self.project(); + match this.inner { + InnerAtomicArrayHandle::Generic(ref mut handle) => { + let array = ready!(handle.creation_future.as_mut().poll(cx)); + Poll::Ready(AtomicArray::GenericAtomicArray(array)) + } + InnerAtomicArrayHandle::Native(ref mut handle) => { + let array = ready!(handle.creation_future.as_mut().poll(cx)); + Poll::Ready(AtomicArray::NativeAtomicArray(NativeAtomicArray { + array, + orig_t: NativeAtomicType::of::(), + })) + } + } + } +} + +impl Into> for GenericAtomicArrayHandle { + fn into(self) -> AtomicArrayHandle { + let team = self.team.clone(); + let launched = self.launched; + AtomicArrayHandle { + inner: InnerAtomicArrayHandle::Generic(self), + team, + launched, + } + } +} + +impl Into> for NativeAtomicArrayHandle { + fn into(self) -> AtomicArrayHandle { + let team = self.team.clone(); + let launched = self.launched; + AtomicArrayHandle { + inner: InnerAtomicArrayHandle::Native(self), + team, + launched, + } + } +} diff --git a/src/array/generic_atomic.rs b/src/array/generic_atomic.rs index 4ba36c2b..247b951d 100644 --- a/src/array/generic_atomic.rs +++ b/src/array/generic_atomic.rs @@ -1,3 +1,6 @@ +mod handle; +pub(crate) use handle::GenericAtomicArrayHandle; + pub(crate) mod iteration; pub(crate) mod operations; mod rdma; @@ -411,19 +414,30 @@ impl GenericAtomicArray { team: U, array_size: usize, distribution: Distribution, - ) -> GenericAtomicArray { + ) -> GenericAtomicArrayHandle { // println!("new generic_atomic array"); - let array = UnsafeArray::new(team.clone(), array_size, distribution); - array.block_on_outstanding(DarcMode::GenericAtomicArray); - let mut vec = vec![]; - for _i in 0..array.num_elems_local() { - vec.push(Mutex::new(())); - } - let locks = Darc::new(team, vec).unwrap(); - GenericAtomicArray { - locks: locks, - array: array, + let team = team.into().team.clone(); + GenericAtomicArrayHandle { + team: team.clone(), + launched: false, + creation_future: Box::pin(async move { + let array = UnsafeArray::async_new( + team.clone(), + array_size, + distribution, + DarcMode::LocalLockArray, + ) + .await; + let mut vec = vec![]; + for _i in 0..array.num_elems_local() { + vec.push(Mutex::new(())); + } + GenericAtomicArray { + locks: Darc::new(team, vec).await.expect("pe exists in team"), + array, + } + }), } } } @@ -605,7 +619,7 @@ impl From> for GenericAtomicArray { for _i in 0..array.num_elems_local() { vec.push(Mutex::new(())); } - let locks = Darc::new(array.team_rt(), vec).unwrap(); + let locks = Darc::new(array.team_rt(), vec).block().unwrap(); GenericAtomicArray { locks: locks, @@ -625,7 +639,7 @@ impl AsyncFrom> for GenericAtomicArray { for _i in 0..array.num_elems_local() { vec.push(Mutex::new(())); } - let locks = Darc::new(array.team_rt(), vec).unwrap(); + let locks = Darc::new(array.team_rt(), vec).block().unwrap(); GenericAtomicArray { locks: locks, @@ -860,8 +874,8 @@ impl GenericAtomicArray { ///``` /// use lamellar::array::prelude::*; /// let world = 
LamellarWorldBuilder::new().build(); - /// let block_array = AtomicArray::::new(&world,100,Distribution::Block); - /// let cyclic_array = AtomicArray::::new(&world,100,Distribution::Block); + /// let block_array = AtomicArray::::new(&world,100,Distribution::Block).block(); + /// let cyclic_array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// block_array.print(); /// println!(); diff --git a/src/array/generic_atomic/handle.rs b/src/array/generic_atomic/handle.rs new file mode 100644 index 00000000..8502bdcb --- /dev/null +++ b/src/array/generic_atomic/handle.rs @@ -0,0 +1,92 @@ +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; + +use super::{ArrayOps, GenericAtomicArray}; +use crate::scheduler::LamellarTask; +use crate::warnings::RuntimeWarning; +use crate::{Dist, LamellarTeamRT}; + +use futures_util::Future; +use pin_project::{pin_project, pinned_drop}; + +#[must_use = " GenericAtomicArray 'new' handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] +#[pin_project(PinnedDrop)] +#[doc(alias = "Collective")] +/// This is a handle representing the operation of creating a new [GenericAtomicArray]. +/// This handled must either be awaited in an async context or blocked on in a non-async context for the operation to be performed. +/// Awaiting/blocking on the handle is a blocking collective call amongst all PEs in the GenericAtomicArray's team, only returning once every PE in the team has completed the call. +/// +/// # Collective Operation +/// Requires all PEs associated with the `GenericAtomicArray` to await/block the handle otherwise deadlock will occur (i.e. team barriers are being called internally) +/// +/// # Examples +/// ``` +/// use lamellar::array::prelude::*; +/// +/// let world = LamellarWorldBuilder::new().build(); +/// +/// let array: GenericAtomicArray = GenericAtomicArray::new(&world,100,Distribution::Cyclic).block(); +/// ``` +pub(crate) struct GenericAtomicArrayHandle { + pub(crate) team: Pin>, + pub(crate) launched: bool, + #[pin] + pub(crate) creation_future: Pin> + Send>>, +} + +#[pinned_drop] +impl PinnedDrop for GenericAtomicArrayHandle { + fn drop(self: Pin<&mut Self>) { + if !self.launched { + RuntimeWarning::DroppedHandle("a GenericAtomicArrayHandle").print(); + } + } +} + +impl GenericAtomicArrayHandle { + /// Used to drive creation of a new GenericAtomicArray + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array: GenericAtomicArray = GenericAtomicArray::new(&world,100,Distribution::Cyclic).block(); + pub fn block(mut self) -> GenericAtomicArray { + self.launched = true; + RuntimeWarning::BlockingCall( + "GenericAtomicArrayHandle::block", + ".spawn() or.await", + ) + .print(); + self.team.clone().block_on(self) + } + + /// This method will spawn the creation of the GenericAtomicArray on the work queue + /// + /// This function returns a handle that can be used to wait for the operation to complete + /// /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array_task: GenericAtomicArray = GenericAtomicArray::new(&world,100,Distribution::Cyclic).spawn(); + /// // do some other work + /// let array = array_task.block(); + #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. 
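For orientation, here is a minimal sketch (not part of this patch) of how the new array-construction handles introduced above are intended to be driven; it assumes a `usize` element type and a hypothetical `main`, and uses only the `block()`, `spawn()`, and `.await` patterns shown in the handle doc comments:

use lamellar::array::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();

    // Drive construction to completion immediately (a collective call across the team's PEs).
    let _array = AtomicArray::<usize>::new(&world, 100, Distribution::Cyclic).block();

    // Or spawn construction onto the work queue, overlap other work, and block on the task later.
    let array_task = AtomicArray::<usize>::new(&world, 100, Distribution::Block).spawn();
    // ... do some other work ...
    let _array2 = array_task.block();

    // Or await the handle from an async context.
    let world2 = world.clone();
    let _array3 = world.block_on(async move {
        AtomicArray::<usize>::new(&world2, 100, Distribution::Cyclic).await
    });
}

Dropping a handle without calling one of these is exactly the case the new RuntimeWarning::DroppedHandle path in the PinnedDrop impls guards against.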
Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] + pub fn spawn(mut self) -> LamellarTask> { + self.launched = true; + self.team.clone().spawn(self) + } +} + +impl Future for GenericAtomicArrayHandle { + type Output = GenericAtomicArray; + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + self.launched = true; + let mut this = self.project(); + this.creation_future.as_mut().poll(cx) + } +} diff --git a/src/array/generic_atomic/rdma.rs b/src/array/generic_atomic/rdma.rs index 1243e754..d4155b4e 100644 --- a/src/array/generic_atomic/rdma.rs +++ b/src/array/generic_atomic/rdma.rs @@ -24,7 +24,7 @@ impl LamellarArrayInternalGet for GenericAtomicArray { } } unsafe fn internal_at(&self, index: usize) -> ArrayRdmaAtHandle { - let buf: OneSidedMemoryRegion = self.array.team_rt().alloc_one_sided_mem_region(1); + let buf: OneSidedMemoryRegion = self.array.team_rt().alloc_one_sided_mem_region_or_panic(1); let req = self.exec_am_local(InitGetAm { array: self.clone(), index: index, diff --git a/src/array/global_lock_atomic.rs b/src/array/global_lock_atomic.rs index d2822d9e..555a1dac 100644 --- a/src/array/global_lock_atomic.rs +++ b/src/array/global_lock_atomic.rs @@ -1,6 +1,6 @@ -mod handle; +pub(crate) mod handle; use handle::{ - GlobalLockCollectiveMutLocalDataHandle, GlobalLockLocalDataHandle, + GlobalLockArrayHandle, GlobalLockCollectiveMutLocalDataHandle, GlobalLockLocalDataHandle, GlobalLockMutLocalDataHandle, GlobalLockReadHandle, GlobalLockWriteHandle, }; mod iteration; @@ -203,7 +203,7 @@ impl GlobalLockLocalData { /// /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic).block(); /// /// let local_data = array.read_local_data().block(); /// let sub_data = local_data.clone().into_sub_data(10,20); // clone() essentially increases the references to the read lock by 1. @@ -320,18 +320,29 @@ impl GlobalLockArray { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic).block(); pub fn new>( team: U, array_size: usize, distribution: Distribution, - ) -> GlobalLockArray { - let array = UnsafeArray::new(team.clone(), array_size, distribution); - let lock = GlobalRwDarc::new(team, ()).unwrap(); - - GlobalLockArray { - lock: lock, - array: array, + ) -> GlobalLockArrayHandle { + let team = team.into().team.clone(); + GlobalLockArrayHandle { + team: team.clone(), + launched: false, + creation_future: Box::pin(async move { + let lock_task = GlobalRwDarc::new(team.clone(), ()).spawn(); + GlobalLockArray { + lock: lock_task.await.expect("pe exists in team"), + array: UnsafeArray::async_new( + team.clone(), + array_size, + distribution, + DarcMode::GlobalLockArray, + ) + .await, + } + }), } } } @@ -347,9 +358,9 @@ impl GlobalLockArray { ///``` /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); - /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic).block(); /// // do something interesting... 
or not - /// let block_view = array.clone().use_distribution(Distribution::Block); + /// let block_view = array.clone().use_distribution(Distribution::Block).block(); ///``` pub fn use_distribution(self, distribution: Distribution) -> Self { GlobalLockArray { @@ -372,7 +383,7 @@ impl GlobalLockArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic).block(); /// let handle = array.read_lock(); /// let task = world.spawn(async move { /// let read_lock = handle.await; @@ -399,7 +410,7 @@ impl GlobalLockArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic).block(); /// let handle = array.write_lock(); /// let task = world.spawn(async move { /// let write_lock = handle.await; @@ -427,7 +438,7 @@ impl GlobalLockArray { /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// - /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic).block(); /// let handle = array.read_local_data(); /// world.spawn(async move { /// let local_data = handle.await; @@ -461,7 +472,7 @@ impl GlobalLockArray { /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// - /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic).block(); /// let handle = array.write_local_data(); /// world.spawn(async move { /// let mut local_data = handle.await; @@ -497,7 +508,7 @@ impl GlobalLockArray { /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// - /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic).block(); /// let handle = array.collective_write_local_data(); /// world.block_on(async move { /// let mut local_data = handle.await; @@ -540,7 +551,7 @@ impl GlobalLockArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic).block(); /// /// let unsafe_array = array.into_unsafe(); ///``` @@ -551,7 +562,7 @@ impl GlobalLockArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic).block(); /// /// let array1 = array.clone(); /// let slice = array1.read_local_data().block(); @@ -589,7 +600,7 @@ impl GlobalLockArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: GlobalLockArray = 
GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic).block(); /// /// let read_only_array = array.into_read_only(); ///``` @@ -599,7 +610,7 @@ impl GlobalLockArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic).block(); /// /// let array1 = array.clone(); /// let slice = array1.read_local_data().block(); @@ -633,7 +644,7 @@ impl GlobalLockArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic).block(); /// /// let read_only_array = array.into_read_only(); ///``` @@ -643,7 +654,7 @@ impl GlobalLockArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic).block(); /// /// let array1 = array.clone(); /// let slice = array1.read_local_data().block(); @@ -679,7 +690,7 @@ impl GlobalLockArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic).block(); /// /// let atomic_array = array.into_atomic(); ///``` @@ -689,7 +700,7 @@ impl GlobalLockArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic).block(); /// /// let array1 = array.clone(); /// let slice = array1.read_local_data().block(); @@ -729,7 +740,7 @@ impl From> for GlobalLockArray { fn from(array: UnsafeArray) -> Self { // println!("GlobalLock from unsafe"); array.block_on_outstanding(DarcMode::GlobalLockArray); - let lock = GlobalRwDarc::new(array.team_rt(), ()).unwrap(); + let lock = GlobalRwDarc::new(array.team_rt(), ()).block().unwrap(); GlobalLockArray { lock: lock, @@ -743,7 +754,7 @@ impl AsyncFrom> for GlobalLockArray { async fn async_from(array: UnsafeArray) -> Self { // println!("GlobalLock from unsafe"); array.await_on_outstanding(DarcMode::GlobalLockArray).await; - let lock = GlobalRwDarc::new(array.team_rt(), ()).unwrap(); + let lock = GlobalRwDarc::new(array.team_rt(), ()).block().unwrap(); GlobalLockArray { lock: lock, @@ -987,8 +998,8 @@ impl GlobalLockArray { ///``` /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); - /// let block_array = GlobalLockArray::::new(&world,100,Distribution::Block); - /// let cyclic_array = GlobalLockArray::::new(&world,100,Distribution::Block); + /// let block_array = GlobalLockArray::::new(&world,100,Distribution::Block).block(); + /// let cyclic_array = GlobalLockArray::::new(&world,100,Distribution::Block).block(); /// /// 
block_array.print(); /// println!(); @@ -1089,7 +1100,7 @@ impl GlobalLockReadGuard { /// /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); - /// let array = GlobalLockArray::::new(&world,10,Distribution::Block); + /// let array = GlobalLockArray::::new(&world,10,Distribution::Block).block(); /// array.block_on(array.dist_iter_mut().enumerate().for_each(move |(i,elem)| *elem = i*2)); /// let read_guard = array.read_lock().block(); /// let prod = array.block_on(read_guard.reduce("prod")).expect("array has > 0 elements"); @@ -1122,7 +1133,7 @@ impl GlobalLockReadGuard { /// use rand::Rng; /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); - /// let array = GlobalLockArray::::new(&world,10,Distribution::Block); + /// let array = GlobalLockArray::::new(&world,10,Distribution::Block).block(); /// array.block_on(array.dist_iter_mut().enumerate().for_each(move |(i,elem)| *elem = i*2)); /// let read_guard = array.read_lock().block(); /// let sum = array.block_on(read_guard.sum()).expect("array has > 0 elements"); @@ -1150,7 +1161,7 @@ impl GlobalLockReadGuard { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); - /// let array = GlobalLockArray::::new(&world,10,Distribution::Block); + /// let array = GlobalLockArray::::new(&world,10,Distribution::Block).block(); /// array.block_on(array.dist_iter_mut().enumerate().for_each(move |(i,elem)| *elem = i+1)); /// let read_guard = array.read_lock().block(); /// let prod = array.block_on(read_guard.prod()).expect("array has > 0 elements"); @@ -1180,7 +1191,7 @@ impl GlobalLockReadGuar /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); - /// let array = GlobalLockArray::::new(&world,10,Distribution::Block); + /// let array = GlobalLockArray::::new(&world,10,Distribution::Block).block(); /// array.block_on(array.dist_iter_mut().enumerate().for_each(move |(i,elem)| *elem = i*2)); /// let read_guard = array.read_lock().block(); /// let max = array.block_on(read_guard.max()).expect("array has > 0 elements"); @@ -1209,7 +1220,7 @@ impl GlobalLockReadGuar /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); - /// let array = GlobalLockArray::::new(&world,10,Distribution::Block); + /// let array = GlobalLockArray::::new(&world,10,Distribution::Block).block(); /// array.block_on(array.dist_iter_mut().enumerate().for_each(move |(i,elem)| *elem = i*2)); /// let read_guard = array.read_lock().block(); /// let min = array.block_on(read_guard.min()).expect("array has > 0 elements"); diff --git a/src/array/global_lock_atomic/handle.rs b/src/array/global_lock_atomic/handle.rs index 4beec56a..db471451 100644 --- a/src/array/global_lock_atomic/handle.rs +++ b/src/array/global_lock_atomic/handle.rs @@ -1,4 +1,5 @@ use std::pin::Pin; +use std::sync::Arc; use std::task::{Context, Poll}; use crate::darc::handle::{ @@ -6,17 +7,98 @@ use crate::darc::handle::{ }; use crate::scheduler::LamellarTask; use crate::warnings::RuntimeWarning; -use crate::Dist; use crate::GlobalLockArray; +use crate::{Dist, LamellarTeamRT}; use futures_util::Future; -use pin_project::pin_project; +use pin_project::{pin_project, pinned_drop}; use super::{ - GlobalLockCollectiveMutLocalData, GlobalLockLocalData, GlobalLockMutLocalData, + ArrayOps, GlobalLockCollectiveMutLocalData, GlobalLockLocalData, 
GlobalLockMutLocalData,
     GlobalLockReadGuard, GlobalLockWriteGuard,
 };
+#[must_use = " GlobalLockArray 'new' handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"]
+#[pin_project(PinnedDrop)]
+#[doc(alias = "Collective")]
+/// This is a handle representing the operation of creating a new [GlobalLockArray].
+/// This handle must either be awaited in an async context or blocked on in a non-async context for the operation to be performed.
+/// Awaiting/blocking on the handle is a blocking collective call amongst all PEs in the GlobalLockArray's team, only returning once every PE in the team has completed the call.
+///
+/// # Collective Operation
+/// Requires all PEs associated with the `GlobalLockArray` to await/block the handle, otherwise deadlock will occur (i.e. team barriers are being called internally)
+///
+/// # Examples
+/// ```
+/// use lamellar::array::prelude::*;
+///
+/// let world = LamellarWorldBuilder::new().build();
+///
+/// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic).block();
+/// ```
+pub struct GlobalLockArrayHandle {
+    pub(crate) team: Pin>,
+    pub(crate) launched: bool,
+    #[pin]
+    pub(crate) creation_future: Pin> + Send>>,
+}
+
+#[pinned_drop]
+impl PinnedDrop for GlobalLockArrayHandle {
+    fn drop(self: Pin<&mut Self>) {
+        if !self.launched {
+            RuntimeWarning::DroppedHandle("a GlobalLockArrayHandle").print();
+        }
+    }
+}
+
+impl GlobalLockArrayHandle {
+    /// Used to drive creation of a new GlobalLockArray
+    /// # Examples
+    ///
+    ///```
+    /// use lamellar::array::prelude::*;
+    ///
+    /// let world = LamellarWorldBuilder::new().build();
+    /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic).block();
+    pub fn block(mut self) -> GlobalLockArray {
+        self.launched = true;
+        RuntimeWarning::BlockingCall(
+            "GlobalLockArrayHandle::block",
+            ".spawn() or .await",
+        )
+        .print();
+        self.team.clone().block_on(self)
+    }
+
+    /// This method will spawn the creation of the GlobalLockArray on the work queue
+    ///
+    /// This function returns a handle that can be used to wait for the operation to complete
+    /// # Examples
+    ///
+    ///```
+    /// use lamellar::array::prelude::*;
+    ///
+    /// let world = LamellarWorldBuilder::new().build();
+    /// let array_task: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic).spawn();
+    /// // do some other work
+    /// let array = array_task.block();
+    #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context.
Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] + pub fn spawn(mut self) -> LamellarTask> { + self.launched = true; + self.team.clone().spawn(self) + } +} + +impl Future for GlobalLockArrayHandle { + type Output = GlobalLockArray; + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + self.launched = true; + let mut this = self.project(); + this.creation_future.as_mut().poll(cx) + } +} + #[must_use = "GlobalLockArray lock handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] #[pin_project] //unused drop warning triggered by GlobalRwDarcReadHandle /// Handle used to retrieve the aquired read lock of a GlobalLockArray @@ -31,7 +113,7 @@ use super::{ /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); -/// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); +/// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic).block(); /// let handle = array.read_lock(); /// let task = world.spawn(async move { /// let read_lock = handle.await; @@ -61,7 +143,7 @@ impl GlobalLockReadHandle { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic).block(); /// let handle = array.read_lock(); /// let guard = handle.block(); ///``` @@ -84,7 +166,7 @@ impl GlobalLockReadHandle { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic).block(); /// let handle = array.read_lock(); /// let task = handle.spawn(); // initiate getting the read lock /// // do other work @@ -125,7 +207,7 @@ impl Future for GlobalLockReadHandle { /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// -/// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); +/// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic).block(); /// let handle = array.read_local_data(); /// world.spawn(async move { /// let local_data = handle.await; @@ -152,7 +234,7 @@ impl GlobalLockLocalDataHandle { /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// - /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic).block(); /// let handle = array.read_local_data(); /// let local_data = handle.block(); /// println!("local data: {:?}",local_data); @@ -176,7 +258,7 @@ impl GlobalLockLocalDataHandle { /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// - /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic).block(); /// let handle = array.read_local_data(); /// let task = handle.spawn(); // initiate getting the read lock /// // do other work @@ -220,7 +302,7 @@ impl Future for GlobalLockLocalDataHandle { /// use lamellar::array::prelude::*; /// let world = 
LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); -/// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); +/// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic).block(); /// let handle = array.write_lock(); /// let task = world.spawn(async move { /// let write_lock = handle.await; @@ -250,7 +332,7 @@ impl GlobalLockWriteHandle { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic).block(); /// let handle = array.write_lock(); /// let guard = handle.block(); ///``` @@ -273,7 +355,7 @@ impl GlobalLockWriteHandle { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic).block(); /// let handle = array.write_lock(); /// let task = handle.spawn(); // initiate getting the read lock /// // do other work @@ -313,7 +395,7 @@ impl Future for GlobalLockWriteHandle { /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// -/// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); +/// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic).block(); /// let handle = array.write_local_data(); /// world.spawn(async move { /// let mut local_data = handle.await; @@ -340,7 +422,7 @@ impl GlobalLockMutLocalDataHandle { /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// - /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic).block(); /// let handle = array.write_local_data(); /// let mut local_data = handle.block(); /// local_data.iter_mut().for_each(|elem| *elem += my_pe); @@ -365,7 +447,7 @@ impl GlobalLockMutLocalDataHandle { /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// - /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic).block(); /// let handle = array.write_local_data(); /// let task = handle.spawn(); // initiate getting the read lock /// // do other work @@ -410,7 +492,7 @@ impl Future for GlobalLockMutLocalDataHandle { /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// -/// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); +/// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic).block(); /// let handle = array.collective_write_local_data(); /// world.block_on(async move { /// let mut local_data = handle.await; @@ -438,7 +520,7 @@ impl GlobalLockCollectiveMutLocalDataHandle { /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// - /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic).block(); /// let handle = array.collective_write_local_data(); /// let mut local_data = 
handle.block(); /// local_data.iter_mut().for_each(|elem| *elem += my_pe); @@ -463,7 +545,7 @@ impl GlobalLockCollectiveMutLocalDataHandle { /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// - /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic).block(); /// let handle = array.collective_write_local_data(); /// let task = handle.spawn(); // initiate getting the read lock /// // do other work diff --git a/src/array/global_lock_atomic/rdma.rs b/src/array/global_lock_atomic/rdma.rs index dddbefcc..6e224233 100644 --- a/src/array/global_lock_atomic/rdma.rs +++ b/src/array/global_lock_atomic/rdma.rs @@ -34,7 +34,7 @@ impl LamellarArrayInternalGet for GlobalLockArray { } } unsafe fn internal_at(&self, index: usize) -> ArrayRdmaAtHandle { - let buf: OneSidedMemoryRegion = self.array.team_rt().alloc_one_sided_mem_region(1); + let buf: OneSidedMemoryRegion = self.array.team_rt().alloc_one_sided_mem_region_or_panic(1); let req = self.exec_am_local_tg(InitGetAm { array: self.clone(), index: index, diff --git a/src/array/iterator/consumer.rs b/src/array/iterator/consumer.rs index 9826e737..cd11f599 100644 --- a/src/array/iterator/consumer.rs +++ b/src/array/iterator/consumer.rs @@ -14,7 +14,6 @@ use std::pin::Pin; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; -use super::private::Sealed; // trait Consumer{ // type Item; diff --git a/src/array/iterator/distributed_iterator.rs b/src/array/iterator/distributed_iterator.rs index a515a195..82abb3f7 100644 --- a/src/array/iterator/distributed_iterator.rs +++ b/src/array/iterator/distributed_iterator.rs @@ -188,7 +188,7 @@ pub trait DistributedIterator: SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = LocalLockArray::::new(&world,8,Distribution::Block); + /// let array = LocalLockArray::::new(&world,8,Distribution::Block).block(); /// let my_pe = world.my_pe(); /// /// let init_iter = array.dist_iter_mut().for_each(move|e| *e = my_pe).spawn(); //initialize array @@ -221,7 +221,7 @@ pub trait DistributedIterator: SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = LocalLockArray::::new(&world,8,Distribution::Block); + /// let array = LocalLockArray::::new(&world,8,Distribution::Block).block(); /// let my_pe = world.my_pe(); /// /// array.dist_iter_mut().for_each(move|e| *e = my_pe).block(); @@ -255,7 +255,7 @@ pub trait DistributedIterator: SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,8,Distribution::Block); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,8,Distribution::Block).block(); /// let my_pe = world.my_pe(); /// /// array.dist_iter().map(|elem| *elem as f64).monotonic().for_each(move|(i,elem)| println!("PE: {my_pe} i: {i} elem: {elem}")).block(); @@ -288,7 +288,7 @@ pub trait DistributedIterator: SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = LocalLockArray::::new(&world,16,Distribution::Block); + /// let array = LocalLockArray::::new(&world,16,Distribution::Block).block(); /// let my_pe = world.my_pe(); /// /// 
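Taken together, the GlobalLockArray changes above replace direct construction and data access with handles that can be blocked on, spawned, or awaited. The following minimal sketch is illustrative only (not part of the patch); it assumes a usize array of arbitrary size and uses only the calls shown in the updated doc examples (`new(...).block()`, `read_lock().spawn()`, `write_local_data().await`).

```
use lamellar::array::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();
    let my_pe = world.my_pe();

    // `new` now returns a construction handle; `block()` drives it to completion,
    // while `spawn()` would instead return a LamellarTask to join later.
    let array: GlobalLockArray<usize> =
        GlobalLockArray::new(&world, 100, Distribution::Cyclic).block();

    // Lock handles follow the same pattern: spawn now, join later...
    let read_task = array.read_lock().spawn();
    // ... do other work ...
    let guard = read_task.block();
    drop(guard); // release the read lock before mutating below

    // ...or await a handle inside an async context.
    let array_clone = array.clone();
    world.block_on(async move {
        let mut local_data = array_clone.write_local_data().await;
        local_data.iter_mut().for_each(|elem| *elem += my_pe);
    });
}
```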
array.local_iter_mut().for_each(move|e| *e = my_pe).block(); @@ -326,7 +326,7 @@ pub trait DistributedIterator: SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block).block(); /// /// world.block_on( /// array @@ -358,7 +358,7 @@ pub trait DistributedIterator: SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block).block(); /// /// let iter = array.dist_iter().for_each_async(|elem| async move { /// async_std::task::yield_now().await; @@ -393,7 +393,7 @@ pub trait DistributedIterator: SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block).block(); /// /// array.block_on(array.dist_iter().for_each_with_schedule(Schedule::WorkStealing, |elem| println!("{:?} {elem}",std::thread::current().id()))); ///``` @@ -423,7 +423,7 @@ pub trait DistributedIterator: SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block).block(); /// /// let iter = array.dist_iter().for_each_async_with_schedule(Schedule::Chunk(10),|elem| async move { /// async_std::task::yield_now().await; @@ -452,7 +452,7 @@ pub trait DistributedIterator: SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block).block(); /// /// let req = array.dist_iter().map(|elem| *elem).reduce(|acc,elem| acc+elem); /// let sum = array.block_on(req); //wait on the collect request to get the new array @@ -478,7 +478,7 @@ pub trait DistributedIterator: SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block).block(); /// /// let req = array.dist_iter().map(|elem| *elem).reduce_with_schedule(Schedule::Static,|acc,elem| acc+elem); /// let sum = array.block_on(req); //wait on the collect request to get the new array @@ -512,12 +512,12 @@ pub trait DistributedIterator: SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block).block(); /// /// let req = array.dist_iter() /// .map(|elem| *elem) //because of constraints of collect we need to convert from &usize to usize /// .filter(|elem| *elem < 10) // (if we didnt 
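The `for_each` family shown above can also be driven with an explicit `Schedule`. A small illustrative sketch (not part of the patch), assuming a ReadOnlyArray of usize and that the `for_each_with_schedule` handle can be driven with `.block()` like the plain `for_each` handle:

```
use lamellar::array::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();
    let array: ReadOnlyArray<usize> =
        ReadOnlyArray::new(&world, 100, Distribution::Block).block();

    // Per-element closure with the default schedule.
    array
        .dist_iter()
        .for_each(|elem| println!("{:?} {elem}", std::thread::current().id()))
        .block();

    // The same loop with an explicit work-stealing schedule.
    array
        .dist_iter()
        .for_each_with_schedule(Schedule::WorkStealing, |elem| {
            println!("{:?} {elem}", std::thread::current().id())
        })
        .block();
}
```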
do the previous map we would have needed to do **elem) - /// .collect::>(Distribution::Block); + /// .collect::>(Distribution::Block).block(); /// let new_array = array.block_on(req); //wait on the collect request to get the new array ///``` #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] @@ -547,12 +547,12 @@ pub trait DistributedIterator: SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block).block(); /// /// let req = array.dist_iter() /// .map(|elem| *elem) //because of constraints of collect we need to convert from &usize to usize /// .filter(|elem| * elem < 10) // (if we didnt do the previous map we would have needed to do **elem) - /// .collect::>(Distribution::Block); + /// .collect::>(Distribution::Block).block(); /// let new_array = array.block_on(req); //wait on the collect request to get the new array ///``` #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] @@ -590,7 +590,7 @@ pub trait DistributedIterator: SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// // initialize a world and an atomic array /// let world = LamellarWorldBuilder::new().build(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block).block(); /// /// // clone the array; this doesn't duplicate the underlying /// // data but it does create a second pointer that we can @@ -603,7 +603,7 @@ pub trait DistributedIterator: SyncSend + InnerIter + 'static { /// move |elem| /// array_clone /// .fetch_add(elem.load(),1000)) - /// .collect_async::,_>(Distribution::Cyclic); + /// .collect_async::,_>(Distribution::Cyclic).block(); /// let _new_array = array.block_on(req); ///``` #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] @@ -638,7 +638,7 @@ pub trait DistributedIterator: SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// // initialize a world and an atomic array /// let world = LamellarWorldBuilder::new().build(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block).block(); /// /// // clone the array; this doesn't duplicate the underlying /// // data but it does create a second pointer that we can @@ -651,7 +651,7 @@ pub trait DistributedIterator: SyncSend + InnerIter + 'static { /// move |elem| /// array_clone /// .fetch_add(elem.load(),1000)) - /// .collect_async_with_schedule::,_>(Schedule::Dynamic, Distribution::Cyclic); + /// .collect_async_with_schedule::,_>(Schedule::Dynamic, Distribution::Cyclic).block(); /// let _new_array = array.block_on(req); ///``` #[must_use = "this iteration adapter is lazy and does nothing unless awaited. 
Either await the returned future, or call 'spawn()' or 'block()' on it "] @@ -681,7 +681,7 @@ pub trait DistributedIterator: SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block).block(); /// /// let req = array.dist_iter().filter(|elem| **elem < 10).count(); /// let cnt = array.block_on(req); //wait on the collect request to get the new array @@ -703,7 +703,7 @@ pub trait DistributedIterator: SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block).block(); /// /// let req = array.dist_iter().filter(|elem| **elem < 10).count_with_schedule(Schedule::Dynamic); /// let cnt = array.block_on(req); //wait on the collect request to get the new array @@ -728,7 +728,7 @@ pub trait DistributedIterator: SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block).block(); /// /// let req = array.dist_iter().map(|elem| *elem).sum(); /// let sum = array.block_on(req); //wait on the collect request to get the new array @@ -757,7 +757,7 @@ pub trait DistributedIterator: SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block).block(); /// /// let req = array.dist_iter().map(|elem| *elem).sum_with_schedule(Schedule::Guided); /// let sum = array.block_on(req); //wait on the collect request to get the new array @@ -780,7 +780,7 @@ pub trait IndexedDistributedIterator: DistributedIterator + SyncSend + InnerIter /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,8,Distribution::Block); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,8,Distribution::Block).block(); /// let my_pe = world.my_pe(); /// /// array.dist_iter().enumerate().for_each(move|(i,elem)| println!("PE: {my_pe} i: {i} elem: {elem}")).block(); @@ -807,7 +807,7 @@ pub trait IndexedDistributedIterator: DistributedIterator + SyncSend + InnerIter /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,8,Distribution::Block); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,8,Distribution::Block).block(); /// let my_pe = world.my_pe(); /// /// array.dist_iter().enumerate().skip(3).for_each(move|(i,elem)| println!("PE: {my_pe} i: {i} elem: {elem}")).block(); @@ -831,7 +831,7 @@ pub trait IndexedDistributedIterator: DistributedIterator + SyncSend + InnerIter /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,8,Distribution::Block); + /// let array: ReadOnlyArray = 
ReadOnlyArray::new(&world,8,Distribution::Block).block(); /// let my_pe = world.my_pe(); /// /// array.dist_iter().enumerate().step_by(3).for_each(move|(i,elem)| println!("PE: {my_pe} i: {i} elem: {elem}")).block(); @@ -853,7 +853,7 @@ pub trait IndexedDistributedIterator: DistributedIterator + SyncSend + InnerIter /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,8,Distribution::Block); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,8,Distribution::Block).block(); /// let my_pe = world.my_pe(); /// /// array.dist_iter().enumerate().take(3).for_each(move|(i,elem)| println!("PE: {my_pe} i: {i} elem: {elem}")).block(); @@ -884,7 +884,7 @@ pub trait IndexedDistributedIterator: DistributedIterator + SyncSend + InnerIter /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); -/// let array = LocalLockArray::::new(&world,100,Distribution::Block); +/// let array = LocalLockArray::::new(&world,100,Distribution::Block).block(); /// /// let dist_iter = array.dist_iter().for_each(move |e| println!("{e}")); /// world.block_on(dist_iter); @@ -1000,7 +1000,7 @@ impl< /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); -/// let array = LocalLockArray::::new(&world,100,Distribution::Block); +/// let array = LocalLockArray::::new(&world,100,Distribution::Block).block(); /// let my_pe = world.my_pe(); /// let dist_iter = array.dist_iter_mut().for_each(move |e| *e = my_pe ); /// world.block_on(dist_iter); diff --git a/src/array/iterator/distributed_iterator/consumer/count.rs b/src/array/iterator/distributed_iterator/consumer/count.rs index d3a7a7b6..a4401dce 100644 --- a/src/array/iterator/distributed_iterator/consumer/count.rs +++ b/src/array/iterator/distributed_iterator/consumer/count.rs @@ -107,7 +107,7 @@ impl LamellarAm for UpdateCntAm { impl InnerDistIterCountHandle { async fn async_reduce_remote_counts(local_cnt: usize, team: Pin>) -> usize { - let cnt = Darc::async_try_new(&team, AtomicUsize::new(0), DarcMode::Darc) + let cnt = Darc::async_try_new_with_drop(&team, AtomicUsize::new(0), DarcMode::Darc, None) .await .unwrap(); team.exec_am_all(UpdateCntAm { @@ -119,18 +119,18 @@ impl InnerDistIterCountHandle { cnt.load(Ordering::SeqCst) } - fn reduce_remote_counts(&self, local_cnt: usize, cnt: Darc) -> usize { - let _ = self - .team - .exec_am_all(UpdateCntAm { - remote_cnt: local_cnt, - cnt: cnt.clone(), - }) - .spawn(); - self.team.wait_all(); - self.team.tasking_barrier(); - cnt.load(Ordering::SeqCst) - } + // fn reduce_remote_counts(&self, local_cnt: usize, cnt: Darc) -> usize { + // let _ = self + // .team + // .exec_am_all(UpdateCntAm { + // remote_cnt: local_cnt, + // cnt: cnt.clone(), + // }) + // .spawn(); + // self.team.wait_all(); + // self.team.tasking_barrier(); + // cnt.load(Ordering::SeqCst) + // } } impl Future for InnerDistIterCountHandle { @@ -253,7 +253,7 @@ impl Future for DistIterCountHandle { match this.state.as_mut().project() { StateProj::Lock(lock, inner) => { ready!(lock.poll(cx)); - let mut barrier = this.array.barrier_handle(); + let barrier = this.array.barrier_handle(); *this.state = State::Barrier( barrier, inner.take().expect("reqs should still be in this state"), diff --git a/src/array/iterator/distributed_iterator/consumer/reduce.rs b/src/array/iterator/distributed_iterator/consumer/reduce.rs index bb68a949..cbb24cb5 100644 --- 
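For reference, the consumer methods touched above (`count`, `sum`, `collect`) compose as in the following sketch. It is illustrative only and mirrors the calling convention of the updated examples: `new(...).block()` for construction, then `block_on` on the returned request.

```
use lamellar::array::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();
    let array: ReadOnlyArray<usize> =
        ReadOnlyArray::new(&world, 100, Distribution::Block).block();

    // Count the elements below 10 across the whole array.
    let req = array.dist_iter().filter(|elem| **elem < 10).count();
    let cnt = array.block_on(req);

    // Sum every element.
    let req = array.dist_iter().map(|elem| *elem).sum();
    let sum = array.block_on(req);

    // Collect the small elements into a new distributed array.
    let req = array
        .dist_iter()
        .map(|elem| *elem) // convert &usize -> usize so the result can be collected
        .filter(|elem| *elem < 10)
        .collect::<ReadOnlyArray<usize>>(Distribution::Block);
    let _new_array = array.block_on(req);

    println!("count: {cnt}, sum: {sum}");
}
```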
a/src/array/iterator/distributed_iterator/consumer/reduce.rs +++ b/src/array/iterator/distributed_iterator/consumer/reduce.rs @@ -106,8 +106,13 @@ where team: Pin>, op: F, ) -> Option { - let local_vals = - UnsafeArray::::async_new(&team, team.num_pes, Distribution::Block).await; + let local_vals = UnsafeArray::::async_new( + &team, + team.num_pes, + Distribution::Block, + crate::darc::DarcMode::UnsafeArray, + ) + .await; unsafe { local_vals.local_as_mut_slice()[0] = local_val; }; @@ -126,19 +131,20 @@ where ) } - fn reduce_remote_vals(&self, local_val: T) -> Option { - self.team.tasking_barrier(); - let local_vals = UnsafeArray::::new(&self.team, self.team.num_pes, Distribution::Block); - unsafe { - local_vals.local_as_mut_slice()[0] = local_val; - }; - local_vals.tasking_barrier(); - let buffered_iter = unsafe { local_vals.buffered_onesided_iter(self.team.num_pes) }; - buffered_iter - .into_iter() - .map(|&x| x) - .reduce(self.op.clone()) - } + // fn reduce_remote_vals(&self, local_val: T) -> Option { + // // self.team.tasking_barrier(); + // let local_vals = + // UnsafeArray::::new(&self.team, self.team.num_pes, Distribution::Block).block(); + // unsafe { + // local_vals.local_as_mut_slice()[0] = local_val; + // }; + // local_vals.tasking_barrier(); + // let buffered_iter = unsafe { local_vals.buffered_onesided_iter(self.team.num_pes) }; + // buffered_iter + // .into_iter() + // .map(|&x| x) + // .reduce(self.op.clone()) + // } } impl Future for InnerDistIterReduceHandle diff --git a/src/array/iterator/distributed_iterator/consumer/sum.rs b/src/array/iterator/distributed_iterator/consumer/sum.rs index a575739e..a4c95a55 100644 --- a/src/array/iterator/distributed_iterator/consumer/sum.rs +++ b/src/array/iterator/distributed_iterator/consumer/sum.rs @@ -93,8 +93,13 @@ where T: Dist + ArrayOps + std::iter::Sum, { async fn async_reduce_remote_vals(local_sum: T, team: Pin>) -> T { - let local_sums = - UnsafeArray::::async_new(&team, team.num_pes, Distribution::Block).await; + let local_sums = UnsafeArray::::async_new( + &team, + team.num_pes, + Distribution::Block, + crate::darc::DarcMode::UnsafeArray, + ) + .await; unsafe { local_sums.local_as_mut_slice()[0] = local_sum; }; @@ -109,20 +114,20 @@ where } } - fn reduce_remote_vals(&self, local_sum: T, local_sums: UnsafeArray) -> T { - unsafe { - local_sums.local_as_mut_slice()[0] = local_sum; - }; - local_sums.tasking_barrier(); - // let buffered_iter = unsafe { local_sums.buffered_onesided_iter(self.team.num_pes) }; - // buffered_iter.into_iter().map(|&e| e).sum() - unsafe { - local_sums - .sum() - .blocking_wait() - .expect("array size is greater than zero") - } - } + // fn reduce_remote_vals(&self, local_sum: T, local_sums: UnsafeArray) -> T { + // unsafe { + // local_sums.local_as_mut_slice()[0] = local_sum; + // }; + // local_sums.tasking_barrier(); + // // let buffered_iter = unsafe { local_sums.buffered_onesided_iter(self.team.num_pes) }; + // // buffered_iter.into_iter().map(|&e| e).sum() + // unsafe { + // local_sums + // .sum() + // .blocking_wait() + // .expect("array size is greater than zero") + // } + // } } impl Future for InnerDistIterSumHandle diff --git a/src/array/iterator/local_iterator.rs b/src/array/iterator/local_iterator.rs index 50fff246..cfa84852 100644 --- a/src/array/iterator/local_iterator.rs +++ b/src/array/iterator/local_iterator.rs @@ -182,7 +182,7 @@ pub trait LocalIterator: SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let 
array = LocalLockArray::::new(&world,8,Distribution::Block); + /// let array = LocalLockArray::::new(&world,8,Distribution::Block).block(); /// let my_pe = world.my_pe(); /// /// let init_iter = array.local_iter_mut().for_each(move|e| *e = my_pe).spawn(); //initialize array @@ -215,7 +215,7 @@ pub trait LocalIterator: SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = LocalLockArray::::new(&world,8,Distribution::Block); + /// let array = LocalLockArray::::new(&world,8,Distribution::Block).block(); /// let my_pe = world.my_pe(); /// /// array.local_iter_mut().for_each(move|e| *e = my_pe).block(); //initialize array @@ -249,7 +249,7 @@ pub trait LocalIterator: SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,8,Distribution::Block); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,8,Distribution::Block).block(); /// let my_pe = world.my_pe(); /// /// array.local_iter().enumerate().map(|(i,elem)| (i,*elem as f64)).for_each(move|(i,elem)| println!("PE: {my_pe} i: {i} elem: {elem}")).block(); @@ -282,7 +282,7 @@ pub trait LocalIterator: SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = LocalLockArray::::new(&world,16,Distribution::Block); + /// let array = LocalLockArray::::new(&world,16,Distribution::Block).block(); /// let my_pe = world.my_pe(); /// /// array.local_iter_mut().for_each(move|e| *e = my_pe).block(); //initialize array @@ -319,7 +319,7 @@ pub trait LocalIterator: SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block).block(); /// /// world.block_on( /// array @@ -346,7 +346,7 @@ pub trait LocalIterator: SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block).block(); /// /// array.local_iter().for_each_with_schedule(Schedule::WorkStealing, |elem| println!("{:?} {elem}",std::thread::current().id())).block(); ///``` @@ -375,7 +375,7 @@ pub trait LocalIterator: SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block).block(); /// /// let iter = array.local_iter().for_each_async(|elem| async move { /// async_std::task::yield_now().await; @@ -413,7 +413,7 @@ pub trait LocalIterator: SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block).block(); /// /// array.local_iter().for_each_async_with_schedule(Schedule::Chunk(10),|elem| async move { /// async_std::task::yield_now().await; @@ -439,7 +439,7 @@ pub trait 
LocalIterator: SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block).block(); /// /// let req = array.local_iter().map(|elem| *elem).reduce(|acc,elem| acc+elem); /// let sum = array.block_on(req); //wait on the collect request to get the new array @@ -464,7 +464,7 @@ pub trait LocalIterator: SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block).block(); /// /// let req = array.local_iter().map(|elem| *elem).reduce_with_schedule(Schedule::Chunk(10),|acc,elem| acc+elem); /// let sum = array.block_on(req); //wait on the collect request to get the new array @@ -493,10 +493,10 @@ pub trait LocalIterator: SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block).block(); /// /// let array_clone = array.clone(); - /// let req = array.local_iter().map(|elem|elem.load()).filter(|elem| elem % 2 == 0).collect::>(Distribution::Cyclic); + /// let req = array.local_iter().map(|elem|elem.load()).filter(|elem| elem % 2 == 0).collect::>(Distribution::Cyclic).block(); /// let new_array = array.block_on(req); ///``` #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] @@ -519,10 +519,10 @@ pub trait LocalIterator: SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block).block(); /// /// let array_clone = array.clone(); - /// let req = array.local_iter().map(|elem|elem.load()).filter(|elem| elem % 2 == 0).collect_with_schedule::>(Schedule::WorkStealing,Distribution::Cyclic); + /// let req = array.local_iter().map(|elem|elem.load()).filter(|elem| elem % 2 == 0).collect_with_schedule::>(Schedule::WorkStealing,Distribution::Cyclic).block(); /// let new_array = array.block_on(req); ///``` #[must_use = "this iteration adapter is lazy and does nothing unless awaited. 
Either await the returned future, or call 'spawn()' or 'block()' on it."] @@ -560,7 +560,7 @@ pub trait LocalIterator: SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// // initialize a world and an atomic array /// let world = LamellarWorldBuilder::new().build(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block).block(); /// /// // clone the array; this doesn't duplicate the underlying /// // data but it does create a second pointer that we can @@ -573,7 +573,7 @@ pub trait LocalIterator: SyncSend + InnerIter + 'static { /// move |elem| /// array_clone /// .fetch_add(elem.load(),1000)) - /// .collect_async::,_>(Distribution::Cyclic); + /// .collect_async::,_>(Distribution::Cyclic).block(); /// let _new_array = array.block_on(req); ///``` #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] @@ -608,7 +608,7 @@ pub trait LocalIterator: SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// // initialize a world and an atomic array /// let world = LamellarWorldBuilder::new().build(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block).block(); /// /// // clone the array; this doesn't duplicate the underlying /// // data but it does create a second pointer that we can @@ -621,7 +621,7 @@ pub trait LocalIterator: SyncSend + InnerIter + 'static { /// move |elem| /// array_clone /// .fetch_add(elem.load(),1000)) - /// .collect_async_with_schedule::,_>(Schedule::Dynamic, Distribution::Cyclic); + /// .collect_async_with_schedule::,_>(Schedule::Dynamic, Distribution::Cyclic).block(); /// let _new_array = array.block_on(req); ///``` #[must_use = "this iteration adapter is lazy and does nothing unless awaited. 
Either await the returned future, or call 'spawn()' or 'block()' on it."] @@ -649,7 +649,7 @@ pub trait LocalIterator: SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block).block(); /// /// let req = array.local_iter().count(); /// let cnt = array.block_on(req); @@ -669,7 +669,7 @@ pub trait LocalIterator: SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block).block(); /// /// let req = array.local_iter().count_with_schedule(Schedule::Dynamic); /// let cnt = array.block_on(req); @@ -693,7 +693,7 @@ pub trait LocalIterator: SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block).block(); /// /// let req = array.local_iter().map(|elem| *elem).sum().spawn(); /// let sum = array.block_on(req); //wait on the collect request to get the new array @@ -720,7 +720,7 @@ pub trait LocalIterator: SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block).block(); /// /// let req = array.local_iter().map(|elem| *elem).sum_with_schedule(Schedule::Guided); /// let sum = array.block_on(req); @@ -743,7 +743,7 @@ pub trait IndexedLocalIterator: LocalIterator + SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,8,Distribution::Block); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,8,Distribution::Block).block(); /// let my_pe = world.my_pe(); /// /// array.local_iter().enumerate().for_each(move|(i,elem)| println!("PE: {my_pe} i: {i} elem: {elem}")).block(); @@ -779,7 +779,7 @@ pub trait IndexedLocalIterator: LocalIterator + SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,40,Distribution::Block); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,40,Distribution::Block).block(); /// let my_pe = world.my_pe(); /// /// array.local_iter().chunks(5).enumerate().for_each(move|(i,chunk)| { @@ -809,7 +809,7 @@ pub trait IndexedLocalIterator: LocalIterator + SyncSend + InnerIter + 'static { // /// use lamellar::array::prelude::*; // /// // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,8,Distribution::Block); + // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,8,Distribution::Block).block(); // /// let my_pe = world.my_pe(); // /// // /// array.local_iter().map(|elem| *elem as f64).enumerate().for_each(move|(i,elem)| println!("PE: {my_pe} i: {i} elem: {elem}")); @@ -841,7 +841,7 @@ pub trait IndexedLocalIterator: 
LocalIterator + SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,16,Distribution::Block); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,16,Distribution::Block).block(); /// let my_pe = world.my_pe(); /// /// array.local_iter().enumerate().skip(3).for_each(move|(i,elem)| println!("PE: {my_pe} i: {i} elem: {elem}")).block(); @@ -864,7 +864,7 @@ pub trait IndexedLocalIterator: LocalIterator + SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,28,Distribution::Block); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,28,Distribution::Block).block(); /// let my_pe = world.my_pe(); /// /// let _ =array.local_iter().enumerate().step_by(3).for_each(move|(i,elem)| println!("PE: {my_pe} i: {i} elem: {elem}")).spawn(); @@ -896,7 +896,7 @@ pub trait IndexedLocalIterator: LocalIterator + SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,16,Distribution::Block); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,16,Distribution::Block).block(); /// let my_pe = world.my_pe(); /// /// array.local_iter().enumerate().take(3).for_each(move|(i,elem)| println!("PE: {my_pe} i: {i} elem: {elem}")).block(); @@ -928,8 +928,8 @@ pub trait IndexedLocalIterator: LocalIterator + SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array_A: ReadOnlyArray = ReadOnlyArray::new(&world,16,Distribution::Block); - /// let array_B: LocalLockArray = LocalLockArray::new(&world,12,Distribution::Block); + /// let array_A: ReadOnlyArray = ReadOnlyArray::new(&world,16,Distribution::Block).block(); + /// let array_B: LocalLockArray = LocalLockArray::new(&world,12,Distribution::Block).block(); /// let my_pe = world.my_pe(); /// /// //initalize array_B @@ -969,7 +969,7 @@ pub trait IndexedLocalIterator: LocalIterator + SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); -/// let array = AtomicArray::::new(&world,100,Distribution::Block); +/// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let local_iter = array.local_iter().for_each(move|e| println!("{}",e.load())); /// world.block_on(local_iter); @@ -1088,7 +1088,7 @@ impl< /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); -/// let array = AtomicArray::::new(&world,100,Distribution::Block); +/// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// let my_pe = world.my_pe(); /// /// let local_iter = array.local_iter_mut().for_each(move|e| e.store(my_pe) ); diff --git a/src/array/iterator/local_iterator/consumer/collect.rs b/src/array/iterator/local_iterator/consumer/collect.rs index 8bdb385b..84eb41d1 100644 --- a/src/array/iterator/local_iterator/consumer/collect.rs +++ b/src/array/iterator/local_iterator/consumer/collect.rs @@ -4,7 +4,7 @@ use crate::array::iterator::private::*; use crate::array::iterator::{consumer::*, IterLockFuture}; use crate::array::operations::ArrayOps; use crate::array::r#unsafe::private::UnsafeArrayInner; -use crate::array::{AsyncTeamFrom, AsyncTeamInto, Distribution, 
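The local-iterator changes above follow the same driving pattern as their distributed counterparts. A brief illustrative sketch (not part of the patch), assuming a LocalLockArray of usize:

```
use lamellar::array::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();
    let my_pe = world.my_pe();
    let array: LocalLockArray<usize> =
        LocalLockArray::new(&world, 16, Distribution::Block).block();

    // Initialize this PE's local portion of the array.
    array.local_iter_mut().for_each(move |e| *e = my_pe).block();

    // Local-iterator consumers are driven the same way as the distributed ones.
    let req = array.local_iter().map(|elem| *elem).sum();
    let sum = array.block_on(req);
    println!("PE{my_pe} local_iter sum: {sum}");

    // Indexed adapters compose like their std counterparts.
    array
        .local_iter()
        .enumerate()
        .step_by(3)
        .for_each(move |(i, elem)| println!("PE: {my_pe} i: {i} elem: {elem}"))
        .block();
}
```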
TeamInto}; +use crate::array::{AsyncTeamFrom, AsyncTeamInto, Distribution}; use crate::lamellar_request::LamellarRequest; use crate::lamellar_task_group::TaskGroupLocalAmHandle; use crate::lamellar_team::LamellarTeamRT; @@ -192,10 +192,10 @@ impl, Distribution)> + SyncSend + ' let input = (local_vals, dist); AsyncTeamInto::team_into(input, &team).await } - fn create_array(&self, local_vals: Vec) -> A { - let input = (local_vals, self.distribution); - TeamInto::team_into(input, &self.team) - } + // fn create_array(&self, local_vals: Vec) -> A { + // let input = (local_vals, self.distribution); + // TeamInto::team_into(input, &self.team) + // } } impl, Distribution)> + SyncSend + 'static> Future diff --git a/src/array/iterator/mod.rs b/src/array/iterator/mod.rs index 6ad0e92e..3f4168df 100644 --- a/src/array/iterator/mod.rs +++ b/src/array/iterator/mod.rs @@ -80,7 +80,7 @@ pub trait LamellarArrayIterators { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic).block(); /// /// world.block_on( /// array.dist_iter().for_each(move |elem| println!("PE{my_pe} elem {elem}")) @@ -100,7 +100,7 @@ pub trait LamellarArrayIterators { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic); + /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).block(); /// /// world.block_on( /// array.local_iter().for_each(move |elem| println!("PE{my_pe} elem {}",elem.load())) // "load" is specific to AtomicArray elements, other types can deref the element directly" @@ -120,7 +120,7 @@ pub trait LamellarArrayIterators { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic).block(); /// /// if my_pe == 0 { /// for elem in array.onesided_iter().into_iter() { //"into_iter()" converts into a standard Rust Iterator @@ -147,7 +147,7 @@ pub trait LamellarArrayIterators { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic).block(); /// /// if my_pe == 0 { /// for elem in array.buffered_onesided_iter(100).into_iter() { // "into_iter()" converts into a standard Rust Iterator @@ -179,7 +179,7 @@ pub trait LamellarArrayMutIterators { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic).block(); /// /// world.block_on( /// array.dist_iter_mut().for_each(move |elem| *elem = my_pe) @@ -199,7 +199,7 @@ pub trait LamellarArrayMutIterators { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: LocalLockArray = 
LocalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic).block(); /// /// world.block_on( /// array.local_iter_mut().for_each(move |elem| *elem = my_pe) diff --git a/src/array/iterator/one_sided_iterator.rs b/src/array/iterator/one_sided_iterator.rs index 1a1e0642..14ada186 100644 --- a/src/array/iterator/one_sided_iterator.rs +++ b/src/array/iterator/one_sided_iterator.rs @@ -107,7 +107,7 @@ pub trait OneSidedIterator: private::OneSidedIteratorInner { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = LocalLockArray::::new(&world,24,Distribution::Block); + /// let array = LocalLockArray::::new(&world,24,Distribution::Block).block(); /// let my_pe = world.my_pe(); /// array.dist_iter_mut().for_each(move|e| *e = my_pe).block(); //initialize array using a distributed iterator /// if my_pe == 0 { @@ -140,7 +140,7 @@ pub trait OneSidedIterator: private::OneSidedIteratorInner { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = LocalLockArray::::new(&world,8,Distribution::Block); + /// let array = LocalLockArray::::new(&world,8,Distribution::Block).block(); /// let my_pe = world.my_pe(); /// array.dist_iter_mut().for_each(move|e| *e = my_pe).block(); //initialize array using a distributed iterator /// if my_pe == 0 { @@ -171,7 +171,7 @@ pub trait OneSidedIterator: private::OneSidedIteratorInner { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = LocalLockArray::::new(&world,8,Distribution::Block); + /// let array = LocalLockArray::::new(&world,8,Distribution::Block).block(); /// let my_pe = world.my_pe(); /// array.dist_iter_mut().for_each(move|e| *e = my_pe).block(); //initialize array using a distributed iterator /// if my_pe == 0 { @@ -201,8 +201,8 @@ pub trait OneSidedIterator: private::OneSidedIteratorInner { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array_A = LocalLockArray::::new(&world,8,Distribution::Block); - /// let array_B: LocalLockArray = LocalLockArray::new(&world,12,Distribution::Block); + /// let array_A = LocalLockArray::::new(&world,8,Distribution::Block).block(); + /// let array_B: LocalLockArray = LocalLockArray::new(&world,12,Distribution::Block).block(); /// let my_pe = world.my_pe(); /// //initialize arrays using a distributed iterator /// let _ = array_A.dist_iter_mut().for_each(move|e| *e = my_pe).spawn(); @@ -248,7 +248,7 @@ pub trait OneSidedIterator: private::OneSidedIteratorInner { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = LocalLockArray::::new(&world,8,Distribution::Block); + /// let array = LocalLockArray::::new(&world,8,Distribution::Block).block(); /// let my_pe = world.my_pe(); /// array.dist_iter_mut().for_each(move|e| *e = my_pe).block(); //initialize array using a distributed iterator /// if my_pe == 0 { @@ -279,7 +279,7 @@ pub trait OneSidedIterator: private::OneSidedIteratorInner { /// use futures_util::stream::{StreamExt}; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = LocalLockArray::::new(&world,8,Distribution::Block); + /// let array = LocalLockArray::::new(&world,8,Distribution::Block).block(); /// let my_pe = world.my_pe(); /// let num_pes = world.num_pes(); /// let _ =array.dist_iter_mut().for_each(move|e| *e = my_pe).spawn(); 
//initialize array using a distributed iterator @@ -313,7 +313,7 @@ pub trait OneSidedIterator: private::OneSidedIteratorInner { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); -/// let array = AtomicArray::::new(&world,100,Distribution::Block); +/// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let std_iter = array.onesided_iter().into_iter(); /// for e in std_iter { @@ -346,7 +346,7 @@ where /// use futures_util::stream::StreamExt; /// /// let world = LamellarWorldBuilder::new().build(); -/// let array = AtomicArray::::new(&world,100,Distribution::Block); +/// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// world.block_on(async move { /// let mut stream = array.onesided_iter().into_stream(); /// while let Some(e) = stream.next().await { @@ -402,7 +402,7 @@ unsafe impl Send for SendNonNull {} /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); -/// let array = AtomicArray::::new(&world,100,Distribution::Block); +/// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let one_sided_iter = array.onesided_iter(); ///``` @@ -430,7 +430,7 @@ impl<'a, T: Dist + 'static, A: LamellarArrayInternalGet> OneSidedIter<'a, T, team: Pin>, buf_size: usize, ) -> OneSidedIter<'a, T, A> { - let buf_0 = team.alloc_one_sided_mem_region(buf_size); + let buf_0 = team.alloc_one_sided_mem_region_or_panic(buf_size); // potentially unsafe depending on the array type (i.e. UnsafeArray - which requries unsafe to construct an iterator), // but safe with respect to the buf_0 as self is the only reference diff --git a/src/array/iterator/one_sided_iterator/chunks.rs b/src/array/iterator/one_sided_iterator/chunks.rs index 40f53660..2beadc80 100644 --- a/src/array/iterator/one_sided_iterator/chunks.rs +++ b/src/array/iterator/one_sided_iterator/chunks.rs @@ -50,7 +50,7 @@ where ) -> (OneSidedMemoryRegion, ArrayRdmaHandle) { // println!(" get chunk of len: {:?}", size); let mem_region: OneSidedMemoryRegion = - array.team_rt().alloc_one_sided_mem_region(size); + array.team_rt().alloc_one_sided_mem_region_or_panic(size); // potentially unsafe depending on the array type (i.e. 
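The one-sided iterator examples above can be combined into a single sketch. This is illustrative only; it assumes `futures_util` is available (as in the `into_stream` doc example) and that the element type is usize.

```
use lamellar::array::prelude::*;
use futures_util::stream::StreamExt;

fn main() {
    let world = LamellarWorldBuilder::new().build();
    let my_pe = world.my_pe();
    let array: LocalLockArray<usize> =
        LocalLockArray::new(&world, 8, Distribution::Block).block();
    array.dist_iter_mut().for_each(move |e| *e = my_pe).block(); // initialize the array

    if my_pe == 0 {
        // One-sided iteration lets this single PE walk the whole distributed array.
        for elem in array.onesided_iter().into_iter() {
            print!("{elem} ");
        }
        println!();

        // The same data can be consumed as an async stream.
        world.block_on(async move {
            let mut stream = array.onesided_iter().into_stream();
            while let Some(elem) = stream.next().await {
                print!("{elem} ");
            }
            println!();
        });
    }
}
```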
UnsafeArray - which requries unsafe to construct an iterator), // but safe with respect to the mem_region as this is the only reference let mut req = unsafe { array.internal_get(index, &mem_region) }; diff --git a/src/array/local_lock_atomic.rs b/src/array/local_lock_atomic.rs index 11a30585..a2d684cc 100644 --- a/src/array/local_lock_atomic.rs +++ b/src/array/local_lock_atomic.rs @@ -1,10 +1,11 @@ mod iteration; pub(crate) mod local_chunks; pub use local_chunks::{LocalLockLocalChunks, LocalLockLocalChunksMut}; -mod handle; +pub(crate) mod handle; use handle::{ - LocalLockLocalChunksHandle, LocalLockLocalChunksMutHandle, LocalLockLocalDataHandle, - LocalLockMutLocalDataHandle, LocalLockReadHandle, LocalLockWriteHandle, + LocalLockArrayHandle, LocalLockLocalChunksHandle, LocalLockLocalChunksMutHandle, + LocalLockLocalDataHandle, LocalLockMutLocalDataHandle, LocalLockReadHandle, + LocalLockWriteHandle, }; pub(crate) mod operations; mod rdma; @@ -168,7 +169,7 @@ impl LocalLockLocalData { /// /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic).block(); /// /// let local_data = array.read_local_data().block(); /// let sub_data = local_data.clone().into_sub_data(10,20); // clone() essentially increases the references to the read lock by 1. @@ -319,19 +320,29 @@ impl LocalLockArray { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic).block(); pub fn new>( team: U, array_size: usize, distribution: Distribution, - ) -> LocalLockArray { - let array = UnsafeArray::new(team.clone(), array_size, distribution); - array.block_on_outstanding(DarcMode::LocalLockArray); - let lock = LocalRwDarc::new(team, ()).unwrap(); - - LocalLockArray { - lock: lock, - array: array, + ) -> LocalLockArrayHandle { + let team = team.into().team.clone(); + LocalLockArrayHandle { + team: team.clone(), + launched: false, + creation_future: Box::pin(async move { + let lock_task = LocalRwDarc::new(team.clone(), ()).spawn(); + LocalLockArray { + lock: lock_task.await.expect("pe exists in team"), + array: UnsafeArray::async_new( + team.clone(), + array_size, + distribution, + DarcMode::LocalLockArray, + ) + .await, + } + }), } } } @@ -347,9 +358,9 @@ impl LocalLockArray { ///``` /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); - /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic).block(); /// // do something interesting... 
or not - /// let block_view = array.clone().use_distribution(Distribution::Block); + /// let block_view = array.clone().use_distribution(Distribution::Block).block(); ///``` pub fn use_distribution(self, distribution: Distribution) -> Self { LocalLockArray { @@ -372,7 +383,7 @@ impl LocalLockArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic).block(); /// let handle = array.read_lock(); /// world.spawn(async move { /// let read_lock = handle.await; @@ -398,7 +409,7 @@ impl LocalLockArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic).block(); /// let handle = array.write_lock(); /// world.spawn(async move { /// let write_lock = handle.await; @@ -425,7 +436,7 @@ impl LocalLockArray { /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// - /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic).block(); /// let handle = array.read_local_data(); /// world.spawn(async move { /// let local_data = handle.await; @@ -458,7 +469,7 @@ impl LocalLockArray { /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// - /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic).block(); /// let handle = array.write_local_data(); /// world.spawn(async move { /// let mut local_data = handle.await; @@ -500,7 +511,7 @@ impl LocalLockArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic).block(); /// /// let unsafe_array = array.into_unsafe(); ///``` @@ -511,7 +522,7 @@ impl LocalLockArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic).block(); /// /// let array1 = array.clone(); /// let slice = array1.read_local_data().block(); @@ -549,7 +560,7 @@ impl LocalLockArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic).block(); /// /// let read_only_array = array.into_read_only(); ///``` @@ -559,7 +570,7 @@ impl LocalLockArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: LocalLockArray = 
LocalLockArray::new(&world,100,Distribution::Cyclic).block(); /// /// let array1 = array.clone(); /// let slice = array1.read_local_data().block(); @@ -593,7 +604,7 @@ impl LocalLockArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic).block(); /// /// let global_lock_array = array.into_global_lock(); ///``` @@ -603,7 +614,7 @@ impl LocalLockArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic).block(); /// /// let array1 = array.clone(); /// let slice = array1.read_local_data().block(); @@ -639,7 +650,7 @@ impl LocalLockArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic).block(); /// /// let atomic_array = array.into_atomic(); ///``` @@ -649,7 +660,7 @@ impl LocalLockArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic).block(); /// /// let array1 = array.clone(); /// let slice = array1.read_local_data().block(); @@ -689,7 +700,7 @@ impl From> for LocalLockArray { fn from(array: UnsafeArray) -> Self { // println!("locallock from unsafe"); array.block_on_outstanding(DarcMode::LocalLockArray); - let lock = LocalRwDarc::new(array.team_rt(), ()).unwrap(); + let lock = LocalRwDarc::new(array.team_rt(), ()).block().unwrap(); LocalLockArray { lock: lock, @@ -703,7 +714,7 @@ impl AsyncFrom> for LocalLockArray { async fn async_from(array: UnsafeArray) -> Self { // println!("locallock from unsafe"); array.await_on_outstanding(DarcMode::LocalLockArray).await; - let lock = LocalRwDarc::new(array.team_rt(), ()).unwrap(); + let lock = LocalRwDarc::new(array.team_rt(), ()).block().unwrap(); LocalLockArray { lock: lock, @@ -948,8 +959,8 @@ impl LocalLockArray { ///``` /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); - /// let block_array = LocalLockArray::::new(&world,100,Distribution::Block); - /// let cyclic_array = LocalLockArray::::new(&world,100,Distribution::Block); + /// let block_array = LocalLockArray::::new(&world,100,Distribution::Block).block(); + /// let cyclic_array = LocalLockArray::::new(&world,100,Distribution::Block).block(); /// /// block_array.print(); /// println!(); @@ -1048,7 +1059,7 @@ impl LocalLockReadGuard { /// /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); - /// let array = LocalLockArray::::new(&world,10,Distribution::Block); + /// let array = LocalLockArray::::new(&world,10,Distribution::Block).block(); /// array.block_on(array.dist_iter_mut().enumerate().for_each(move |(i,elem)| *elem = i*2)); /// let read_guard = array.read_lock().block(); /// let prod = array.block_on(read_guard.reduce("prod")); @@ -1083,7 
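The `into_*` conversions touched above consume the source array before handing back the new type. A short illustrative sketch (not part of the patch) converting a LocalLockArray into a ReadOnlyArray:

```
use lamellar::array::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();
    let my_pe = world.my_pe();
    let array: LocalLockArray<usize> =
        LocalLockArray::new(&world, 100, Distribution::Cyclic).block();
    array.dist_iter_mut().for_each(move |e| *e = my_pe).block();

    // The conversion consumes the LocalLockArray and completes once outstanding
    // operations (and any held local-data guards) on it have finished.
    let read_only_array = array.into_read_only();
    read_only_array
        .dist_iter()
        .for_each(move |elem| println!("PE{my_pe} elem {elem}"))
        .block();
}
```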
+1094,7 @@ impl LocalLockReadGuard { /// use rand::Rng; /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); - /// let array = LocalLockArray::::new(&world,10,Distribution::Block); + /// let array = LocalLockArray::::new(&world,10,Distribution::Block).block(); /// array.block_on(array.dist_iter_mut().enumerate().for_each(move |(i,elem)| *elem = i*2)); /// let read_guard = array.read_lock().block(); /// let sum = array.block_on(read_guard.sum()); @@ -1113,7 +1124,7 @@ impl LocalLockReadGuard { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); - /// let array = LocalLockArray::::new(&world,10,Distribution::Block); + /// let array = LocalLockArray::::new(&world,10,Distribution::Block).block(); /// array.block_on(array.dist_iter_mut().enumerate().for_each(move |(i,elem)| *elem = i+1)); /// let read_guard = array.read_lock().block(); /// let prod = array.block_on(read_guard.prod()).expect("array len > 0"); @@ -1145,7 +1156,7 @@ impl LocalLockReadGuard /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); - /// let array = LocalLockArray::::new(&world,10,Distribution::Block); + /// let array = LocalLockArray::::new(&world,10,Distribution::Block).block(); /// array.block_on(array.dist_iter_mut().enumerate().for_each(move |(i,elem)| *elem = i*2)); /// let read_guard = array.read_lock().block(); /// let max = array.block_on(read_guard.max()).expect("array len > 0"); @@ -1176,7 +1187,7 @@ impl LocalLockReadGuard /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); - /// let array = LocalLockArray::::new(&world,10,Distribution::Block); + /// let array = LocalLockArray::::new(&world,10,Distribution::Block).block(); /// array.block_on(array.dist_iter_mut().enumerate().for_each(move |(i,elem)| *elem = i*2)); /// let read_guard = array.read_lock().block(); /// let min = array.block_on(read_guard.min()).expect("array len > 0"); diff --git a/src/array/local_lock_atomic/handle.rs b/src/array/local_lock_atomic/handle.rs index e863964d..3122cbbd 100644 --- a/src/array/local_lock_atomic/handle.rs +++ b/src/array/local_lock_atomic/handle.rs @@ -5,19 +5,100 @@ use std::task::{Context, Poll}; use crate::darc::handle::{LocalRwDarcReadHandle, LocalRwDarcWriteHandle}; use crate::scheduler::LamellarTask; use crate::warnings::RuntimeWarning; -use crate::Dist; use crate::LocalLockArray; +use crate::{Dist, LamellarTeamRT}; use futures_util::Future; -use pin_project::pin_project; +use pin_project::{pin_project, pinned_drop}; use super::{ - LocalLockLocalChunks, LocalLockLocalChunksMut, LocalLockLocalData, LocalLockMutLocalData, - LocalLockReadGuard, LocalLockWriteGuard, + ArrayOps, LocalLockLocalChunks, LocalLockLocalChunksMut, LocalLockLocalData, + LocalLockMutLocalData, LocalLockReadGuard, LocalLockWriteGuard, }; +#[must_use = " LocalLockArray 'new' handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] +#[pin_project(PinnedDrop)] +#[doc(alias = "Collective")] +/// This is a handle representing the operation of creating a new [LocalLockArray]. +/// This handled must either be awaited in an async context or blocked on in a non-async context for the operation to be performed. 
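The read-guard reductions documented above can be exercised as follows; this sketch is illustrative only and follows the doc examples (`read_lock().block()` plus `block_on` on the guard's reduction requests).

```
use lamellar::array::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();
    let array: LocalLockArray<usize> =
        LocalLockArray::new(&world, 10, Distribution::Block).block();
    array.block_on(array.dist_iter_mut().enumerate().for_each(move |(i, elem)| *elem = i * 2));

    // Acquire the local read lock, then drive the reductions through the guard.
    let read_guard = array.read_lock().block();
    let sum = array.block_on(read_guard.sum());
    let max = array.block_on(read_guard.max()).expect("array len > 0");
    println!("sum: {sum:?}, max: {max}");
}
```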
+/// Awaiting/blocking on the handle is a blocking collective call amongst all PEs in the LocalLockArray's team, only returning once every PE in the team has completed the call.
+///
+/// # Collective Operation
+/// Requires all PEs associated with the `LocalLockArray` to await/block the handle, otherwise deadlock will occur (i.e. team barriers are being called internally)
+///
+/// # Examples
+/// ```
+/// use lamellar::array::prelude::*;
+///
+/// let world = LamellarWorldBuilder::new().build();
+///
+/// let array: LocalLockArray<usize> = LocalLockArray::new(&world,100,Distribution::Cyclic).block();
+/// ```
+pub struct LocalLockArrayHandle<T: Dist + ArrayOps> {
+    pub(crate) team: Pin<Arc<LamellarTeamRT>>,
+    pub(crate) launched: bool,
+    #[pin]
+    pub(crate) creation_future: Pin<Box<dyn Future<Output = LocalLockArray<T>> + Send>>,
+}
+
+#[pinned_drop]
+impl<T: Dist + ArrayOps> PinnedDrop for LocalLockArrayHandle<T> {
+    fn drop(self: Pin<&mut Self>) {
+        if !self.launched {
+            RuntimeWarning::DroppedHandle("a LocalLockArrayHandle").print();
+        }
+    }
+}
+
+impl<T: Dist + ArrayOps> LocalLockArrayHandle<T> {
+    /// Used to drive creation of a new LocalLockArray
+    /// # Examples
+    ///
+    ///```
+    /// use lamellar::array::prelude::*;
+    ///
+    /// let world = LamellarWorldBuilder::new().build();
+    /// let array: LocalLockArray<usize> = LocalLockArray::new(&world,100,Distribution::Cyclic).block();
+    ///```
+    pub fn block(mut self) -> LocalLockArray<T> {
+        self.launched = true;
+        RuntimeWarning::BlockingCall(
+            "LocalLockArrayHandle::block",
+            ".spawn() or .await",
+        )
+        .print();
+        self.team.clone().block_on(self)
+    }
+
+    /// This method will spawn the creation of the LocalLockArray on the work queue
+    ///
+    /// This function returns a handle that can be used to wait for the operation to complete
+    ///
+    /// # Examples
+    ///
+    ///```
+    /// use lamellar::array::prelude::*;
+    ///
+    /// let world = LamellarWorldBuilder::new().build();
+    /// let array_task = LocalLockArray::<usize>::new(&world,100,Distribution::Cyclic).spawn();
+    /// // do some other work
+    /// let array = array_task.block();
+    ///```
+    #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context.
Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] + pub fn spawn(mut self) -> LamellarTask> { + self.launched = true; + self.team.clone().spawn(self) + } +} + +impl Future for LocalLockArrayHandle { + type Output = LocalLockArray; + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + self.launched = true; + let mut this = self.project(); + this.creation_future.as_mut().poll(cx) + } +} + #[must_use = "LocalLockArray lock handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] -#[pin_project] //unused drop warning triggered by LocalRwDarcReadHandle +#[pin_project(PinnedDrop)] //unused drop warning triggered by LocalRwDarcReadHandle /// Handle used to retrieve the aquired read lock of a LocalLockArray /// /// This handle must be awaited or blocked on to acquire the lock @@ -30,7 +111,7 @@ use super::{ /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); -/// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); +/// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic).block(); /// let handle = array.read_lock(); /// let task = world.spawn(async move { /// let read_lock = handle.await; @@ -45,6 +126,15 @@ pub struct LocalLockReadHandle { pub(crate) lock_handle: LocalRwDarcReadHandle<()>, } +#[pinned_drop] +impl PinnedDrop for LocalLockReadHandle { + fn drop(self: Pin<&mut Self>) { + if !self.lock_handle.launched { + RuntimeWarning::DroppedHandle("a LocalLockReadHandle").print(); + } + } +} + impl LocalLockReadHandle { pub(crate) fn new(array: LocalLockArray) -> Self { Self { @@ -61,7 +151,7 @@ impl LocalLockReadHandle { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic).block(); /// let handle = array.read_lock(); /// let guard = handle.block(); ///``` @@ -83,7 +173,7 @@ impl LocalLockReadHandle { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic).block(); /// let handle = array.read_lock(); /// let task = handle.spawn(); // initiate getting the read lock /// // do other work @@ -110,7 +200,7 @@ impl Future for LocalLockReadHandle { } #[must_use = "LocalLockArray lock handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] -#[pin_project] //unused drop warning triggered by LocalRwDarcReadHandle +#[pin_project(PinnedDrop)] //unused drop warning triggered by LocalRwDarcReadHandle /// Handle used to retrieve the aquired local data [LocalLockLocalData] of a LocalLockArray /// /// This handle must be awaited or blocked on to acquire the lock @@ -124,7 +214,7 @@ impl Future for LocalLockReadHandle { /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// -/// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); +/// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic).block(); /// let handle = array.read_local_data(); /// world.spawn(async move { /// let local_data = handle.await; @@ -141,6 +231,15 
@@ pub struct LocalLockLocalDataHandle { pub(crate) lock_handle: LocalRwDarcReadHandle<()>, } +#[pinned_drop] +impl PinnedDrop for LocalLockLocalDataHandle { + fn drop(self: Pin<&mut Self>) { + if !self.lock_handle.launched { + RuntimeWarning::DroppedHandle("a LocalLockLocalDataHandle").print(); + } + } +} + impl LocalLockLocalDataHandle { /// Blocks the calling thread to retrieve the aquired local data [LocalLockLocalData] of a LocalLockArray within a non async context /// @@ -151,7 +250,7 @@ impl LocalLockLocalDataHandle { /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// - /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic).block(); /// let handle = array.read_local_data(); /// let local_data = handle.block(); /// println!("local data: {:?}",local_data); @@ -175,7 +274,7 @@ impl LocalLockLocalDataHandle { /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// - /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic).block(); /// let handle = array.read_local_data(); /// let task = handle.spawn(); // initiate getting the read lock /// // do other work @@ -206,7 +305,7 @@ impl Future for LocalLockLocalDataHandle { } #[must_use = "LocalLockArray lock handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] -#[pin_project] //unused drop warning triggered by LocalRwDarcWriteHandle +#[pin_project(PinnedDrop)] //unused drop warning triggered by LocalRwDarcWriteHandle /// Handle used to retrieve the aquired write lock of a LocalLockArray /// /// This handle must be awaited or blocked on to acquire the lock @@ -219,7 +318,7 @@ impl Future for LocalLockLocalDataHandle { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); -/// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); +/// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic).block(); /// let handle = array.write_lock(); /// let task = world.spawn(async move { /// let write_lock = handle.await; @@ -234,6 +333,15 @@ pub struct LocalLockWriteHandle { pub(crate) lock_handle: LocalRwDarcWriteHandle<()>, } +#[pinned_drop] +impl PinnedDrop for LocalLockWriteHandle { + fn drop(self: Pin<&mut Self>) { + if !self.lock_handle.launched { + RuntimeWarning::DroppedHandle("a LocalRwDarcWriteHandle").print(); + } + } +} + impl LocalLockWriteHandle { pub(crate) fn new(array: LocalLockArray) -> Self { Self { @@ -249,7 +357,7 @@ impl LocalLockWriteHandle { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic).block(); /// let handle = array.write_lock(); /// handle.block(); ///``` @@ -272,7 +380,7 @@ impl LocalLockWriteHandle { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic).block(); /// let handle = array.write_lock(); 
/// let task = handle.spawn(); // initiate getting the write lock /// //do other work @@ -299,7 +407,7 @@ impl Future for LocalLockWriteHandle { } #[must_use = "LocalLockArray lock handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] -#[pin_project] // unused drop warning triggered by LocalRwDarcWriteHandle +#[pin_project(PinnedDrop)] // unused drop warning triggered by LocalRwDarcWriteHandle /// Handle used to retrieve the aquired mutable local data [LocalLockMutLocalData] of a LocalLockArray /// /// This handle must be awaited or blocked on to acquire the lock @@ -313,7 +421,7 @@ impl Future for LocalLockWriteHandle { /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// -/// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); +/// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic).block(); /// let handle = array.write_local_data(); /// world.spawn(async move { /// let mut local_data = handle.await; @@ -330,6 +438,15 @@ pub struct LocalLockMutLocalDataHandle { pub(crate) lock_handle: LocalRwDarcWriteHandle<()>, } +#[pinned_drop] +impl PinnedDrop for LocalLockMutLocalDataHandle { + fn drop(self: Pin<&mut Self>) { + if !self.lock_handle.launched { + RuntimeWarning::DroppedHandle("a LocalLockMutLocalDataHandle").print(); + } + } +} + impl LocalLockMutLocalDataHandle { /// Blocks the calling thread to retrieve the aquired mutable local data [LocalLockMutLocalData] of a LocalLockArray within a non async context /// @@ -340,7 +457,7 @@ impl LocalLockMutLocalDataHandle { /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// - /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic).block(); /// let handle = array.write_local_data(); /// let mut local_data = handle.block(); /// local_data.iter_mut().for_each(|elem| *elem += my_pe); @@ -365,7 +482,7 @@ impl LocalLockMutLocalDataHandle { /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// - /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic); + /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic).block(); /// let handle = array.write_local_data(); /// let task = handle.spawn(); // initiate getting the write lock /// //do other work @@ -396,7 +513,7 @@ impl Future for LocalLockMutLocalDataHandle { } #[must_use = "LocalLockArray lock handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] -#[pin_project] //unused drop warning triggered by LocalRwDarcReadHandle +#[pin_project(PinnedDrop)] //unused drop warning triggered by LocalRwDarcReadHandle /// Constructs a handle for immutably iterating over fixed sized chunks(slices) of the local data of this array. /// This handle must be either await'd in an async context or block'd in an non-async context. /// Awaiting or blocking will not return until the read lock has been acquired. 
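A usage sketch of the creation handles introduced above (e.g. `LocalLockArrayHandle`): they can be blocked on, spawned onto the work queue, or awaited in an async context. The element type, array lengths, and the interleaved work are illustrative assumptions only.

```
use lamellar::array::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();

    // Block the calling thread until the collective creation completes.
    let a: LocalLockArray<usize> = LocalLockArray::new(&world, 100, Distribution::Block).block();

    // Or spawn the creation onto the work queue and pick the result up later;
    // in an async context the handle could instead simply be `.await`ed.
    let pending = LocalLockArray::<usize>::new(&world, 100, Distribution::Cyclic).spawn();
    // ... do some other work ...
    let b = pending.block();

    println!("created arrays of len {} and {}", a.len(), b.len());
    world.barrier();
}
```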
@@ -408,7 +525,7 @@ impl Future for LocalLockMutLocalDataHandle { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); -/// let array: LocalLockArray = LocalLockArray::new(&world,40,Distribution::Block); +/// let array: LocalLockArray = LocalLockArray::new(&world,40,Distribution::Block).block(); /// let my_pe = world.my_pe(); /// //block in a non-async context /// let _ = array.read_local_chunks(5).block().enumerate().for_each(move|(i,chunk)| { @@ -432,6 +549,15 @@ pub struct LocalLockLocalChunksHandle { pub(crate) lock_handle: LocalRwDarcReadHandle<()>, } +#[pinned_drop] +impl PinnedDrop for LocalLockLocalChunksHandle { + fn drop(self: Pin<&mut Self>) { + if !self.lock_handle.launched { + RuntimeWarning::DroppedHandle("a LocalLockLocalChunksHandle").print(); + } + } +} + impl LocalLockLocalChunksHandle { /// Blocks the calling thread to retrieve the aquired immutable local chunks iterator of a LocalLockArray within a non async context /// @@ -440,7 +566,7 @@ impl LocalLockLocalChunksHandle { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: LocalLockArray = LocalLockArray::new(&world,40,Distribution::Block); + /// let array: LocalLockArray = LocalLockArray::new(&world,40,Distribution::Block).block(); /// let my_pe = world.my_pe(); /// //block in a non-async context /// let _ = array.read_local_chunks(5).block().enumerate().for_each(move|(i,chunk)| { @@ -466,7 +592,7 @@ impl LocalLockLocalChunksHandle { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: LocalLockArray = LocalLockArray::new(&world,40,Distribution::Block); + /// let array: LocalLockArray = LocalLockArray::new(&world,40,Distribution::Block).block(); /// let my_pe = world.my_pe(); /// //block in a non-async context /// let iter_task = array.read_local_chunks(5).block().enumerate().for_each(move|(i,chunk)| { @@ -499,7 +625,7 @@ impl Future for LocalLockLocalChunksHandle { } #[must_use = "LocalLockArray lock handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] -#[pin_project] // unused drop warning triggered by LocalRwDarcWriteHandle +#[pin_project(PinnedDrop)] // unused drop warning triggered by LocalRwDarcWriteHandle /// A handle for mutably iterating over fixed sized chunks(slices) of the local data of this array. /// This handle must be either await'd in an async context or block'd in an non-async context. /// Awaiting or blocking will not return until the write lock has been acquired. 
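A companion sketch of driving the data and lock handles above from both non-async and async contexts; the element type and the printed output are assumptions for illustration.

```
use lamellar::array::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();
    let my_pe = world.my_pe();
    let array: LocalLockArray<usize> = LocalLockArray::new(&world, 100, Distribution::Block).block();

    // Non-async context: block until the local write lock is acquired, then mutate.
    let mut local_data = array.write_local_data().block();
    local_data.iter_mut().for_each(|elem| *elem = my_pe);
    drop(local_data); // release the lock so other requests can make progress

    // Async context: the same handle types are futures and can be awaited.
    let handle = array.read_local_data();
    world.block_on(async move {
        let local_data = handle.await;
        println!("PE{} local data: {:?}", my_pe, local_data);
    });
    world.barrier();
}
```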
@@ -511,7 +637,7 @@ impl Future for LocalLockLocalChunksHandle { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); -/// let array: LocalLockArray = LocalLockArray::new(&world,40,Distribution::Block); +/// let array: LocalLockArray = LocalLockArray::new(&world,40,Distribution::Block).block(); /// let my_pe = world.my_pe(); /// let _ = array.write_local_chunks(5).block().enumerate().for_each(move|(i, mut chunk)| { /// for elem in chunk.iter_mut() { @@ -535,6 +661,15 @@ pub struct LocalLockLocalChunksMutHandle { pub(crate) lock_handle: LocalRwDarcWriteHandle<()>, } +#[pinned_drop] +impl PinnedDrop for LocalLockLocalChunksMutHandle { + fn drop(self: Pin<&mut Self>) { + if !self.lock_handle.launched { + RuntimeWarning::DroppedHandle("a LocalLockLocalChunksMutHandle").print(); + } + } +} + impl LocalLockLocalChunksMutHandle { /// Blocks the calling thread to retrieve the aquired mutable local chunks iterator of a LocalLockArray within a non async context /// @@ -543,7 +678,7 @@ impl LocalLockLocalChunksMutHandle { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: LocalLockArray = LocalLockArray::new(&world,40,Distribution::Block); + /// let array: LocalLockArray = LocalLockArray::new(&world,40,Distribution::Block).block(); /// let my_pe = world.my_pe(); /// //block in a non-async context /// let _ = array.write_local_chunks(5).block().enumerate().for_each(move|(i, mut chunk)| { @@ -571,7 +706,7 @@ impl LocalLockLocalChunksMutHandle { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: LocalLockArray = LocalLockArray::new(&world,40,Distribution::Block); + /// let array: LocalLockArray = LocalLockArray::new(&world,40,Distribution::Block).block(); /// let my_pe = world.my_pe(); /// //block in a non-async context /// let iter_task = array.write_local_chunks(5).block().enumerate().for_each(move|(i, mut chunk)| { diff --git a/src/array/local_lock_atomic/iteration.rs b/src/array/local_lock_atomic/iteration.rs index d4942409..d99401e6 100644 --- a/src/array/local_lock_atomic/iteration.rs +++ b/src/array/local_lock_atomic/iteration.rs @@ -8,7 +8,7 @@ use crate::array::local_lock_atomic::*; use crate::array::private::LamellarArrayPrivate; use crate::array::r#unsafe::private::UnsafeArrayInner; use crate::array::*; -use crate::darc::local_rw_darc::{LocalRwDarcReadHandle, LocalRwDarcWriteGuard}; +use crate::darc::local_rw_darc::LocalRwDarcWriteGuard; use crate::memregion::Dist; use self::iterator::IterLockFuture; diff --git a/src/array/local_lock_atomic/local_chunks.rs b/src/array/local_lock_atomic/local_chunks.rs index fc7aafa5..d8ccf65c 100644 --- a/src/array/local_lock_atomic/local_chunks.rs +++ b/src/array/local_lock_atomic/local_chunks.rs @@ -240,7 +240,7 @@ impl LocalLockArray { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: LocalLockArray = LocalLockArray::new(&world,40,Distribution::Block); + /// let array: LocalLockArray = LocalLockArray::new(&world,40,Distribution::Block).block(); /// let my_pe = world.my_pe(); /// //block in a non-async context /// let _ = array.read_local_chunks(5).block().enumerate().for_each(move|(i,chunk)| { @@ -276,7 +276,7 @@ impl LocalLockArray { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: LocalLockArray = LocalLockArray::new(&world,40,Distribution::Block); + /// let array: 
LocalLockArray = LocalLockArray::new(&world,40,Distribution::Block).block(); /// let my_pe = world.my_pe(); /// let _ = array.write_local_chunks(5).block().enumerate().for_each(move|(i, mut chunk)| { /// for elem in chunk.iter_mut() { diff --git a/src/array/local_lock_atomic/rdma.rs b/src/array/local_lock_atomic/rdma.rs index 3b3cf9d1..319291dd 100644 --- a/src/array/local_lock_atomic/rdma.rs +++ b/src/array/local_lock_atomic/rdma.rs @@ -27,7 +27,7 @@ impl LamellarArrayInternalGet for LocalLockArray { } } unsafe fn internal_at(&self, index: usize) -> ArrayRdmaAtHandle { - let buf: OneSidedMemoryRegion = self.array.team_rt().alloc_one_sided_mem_region(1); + let buf: OneSidedMemoryRegion = self.array.team_rt().alloc_one_sided_mem_region_or_panic(1); let req = self.exec_am_local(InitGetAm { array: self.clone(), index: index, diff --git a/src/array/native_atomic.rs b/src/array/native_atomic.rs index b0cc6481..c2ee7c3b 100644 --- a/src/array/native_atomic.rs +++ b/src/array/native_atomic.rs @@ -1,7 +1,11 @@ +mod handle; +pub(crate) use handle::NativeAtomicArrayHandle; + pub(crate) mod iteration; pub(crate) mod operations; mod rdma; use crate::array::atomic::AtomicElement; + // use crate::array::private::LamellarArrayPrivate; use crate::array::r#unsafe::{UnsafeByteArray, UnsafeByteArrayWeak}; use crate::array::*; @@ -923,14 +927,25 @@ impl NativeAtomicArray { team: U, array_size: usize, distribution: Distribution, - ) -> NativeAtomicArray { + ) -> NativeAtomicArrayHandle { // println!("new native atomic array 1"); - let array = UnsafeArray::new(team.clone(), array_size, distribution); - array.block_on_outstanding(DarcMode::NativeAtomicArray); - - NativeAtomicArray { - array: array, - orig_t: NativeAtomicType::from::(), + // let array = UnsafeArray::new(team.clone(), array_size, distribution); + // array.block_on_outstanding(DarcMode::NativeAtomicArray); + + // NativeAtomicArray { + // array: array, + // orig_t: NativeAtomicType::of::(), + // } + let team = team.into().team.clone(); + NativeAtomicArrayHandle { + team: team.clone(), + launched: false, + creation_future: Box::pin(UnsafeArray::async_new( + team, + array_size, + distribution, + DarcMode::NativeAtomicArray, + )), } } } @@ -1022,7 +1037,7 @@ impl From> for NativeAtomicArray { NativeAtomicArray { array: array, - orig_t: NativeAtomicType::from::(), + orig_t: NativeAtomicType::of::(), } } } @@ -1038,7 +1053,7 @@ impl AsyncFrom> for NativeAtomicArray { NativeAtomicArray { array: array, - orig_t: NativeAtomicType::from::(), + orig_t: NativeAtomicType::of::(), } } } @@ -1333,7 +1348,7 @@ pub enum NativeAtomicType { //#[doc(hidden)] impl NativeAtomicType { - fn from() -> NativeAtomicType { + pub(crate) fn of() -> NativeAtomicType { let t = TypeId::of::(); if t == TypeId::of::() { NativeAtomicType::I8 diff --git a/src/array/native_atomic/handle.rs b/src/array/native_atomic/handle.rs new file mode 100644 index 00000000..fb985636 --- /dev/null +++ b/src/array/native_atomic/handle.rs @@ -0,0 +1,96 @@ +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; + +use super::{ArrayOps, NativeAtomicArray, NativeAtomicType}; +use crate::scheduler::LamellarTask; +use crate::warnings::RuntimeWarning; +use crate::{Dist, LamellarTeamRT, UnsafeArray}; + +use futures_util::{ready, Future}; +use pin_project::{pin_project, pinned_drop}; + +#[must_use = " NativeAtomicArray 'new' handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] +#[pin_project(PinnedDrop)] +#[doc(alias = "Collective")] +/// This is a handle 
representing the operation of creating a new [NativeAtomicArray].
+/// This handle must either be awaited in an async context or blocked on in a non-async context for the operation to be performed.
+/// Awaiting/blocking on the handle is a blocking collective call amongst all PEs in the NativeAtomicArray's team, only returning once every PE in the team has completed the call.
+///
+/// # Collective Operation
+/// Requires all PEs associated with the `NativeAtomicArray` to await/block the handle, otherwise deadlock will occur (i.e. team barriers are being called internally)
+///
+/// # Examples
+/// ```
+/// use lamellar::array::prelude::*;
+///
+/// let world = LamellarWorldBuilder::new().build();
+///
+/// let array: NativeAtomicArray<usize> = NativeAtomicArray::new(&world,100,Distribution::Cyclic).block();
+/// ```
+pub(crate) struct NativeAtomicArrayHandle<T: Dist + ArrayOps> {
+    pub(crate) team: Pin<Arc<LamellarTeamRT>>,
+    pub(crate) launched: bool,
+    #[pin]
+    pub(crate) creation_future: Pin<Box<dyn Future<Output = UnsafeArray<T>> + Send>>,
+}
+
+#[pinned_drop]
+impl<T: Dist + ArrayOps> PinnedDrop for NativeAtomicArrayHandle<T> {
+    fn drop(self: Pin<&mut Self>) {
+        if !self.launched {
+            RuntimeWarning::DroppedHandle("a NativeAtomicArrayHandle").print();
+        }
+    }
+}
+
+impl<T: Dist + ArrayOps> NativeAtomicArrayHandle<T> {
+    /// Used to drive creation of a new NativeAtomicArray
+    /// # Examples
+    ///
+    ///```
+    /// use lamellar::array::prelude::*;
+    ///
+    /// let world = LamellarWorldBuilder::new().build();
+    /// let array: NativeAtomicArray<usize> = NativeAtomicArray::new(&world,100,Distribution::Cyclic).block();
+    ///```
+    pub fn block(mut self) -> NativeAtomicArray<T> {
+        self.launched = true;
+        RuntimeWarning::BlockingCall(
+            "NativeAtomicArrayHandle::block",
+            ".spawn() or .await",
+        )
+        .print();
+        self.team.clone().block_on(self)
+    }
+
+    /// This method will spawn the creation of the NativeAtomicArray on the work queue
+    ///
+    /// This function returns a handle that can be used to wait for the operation to complete
+    ///
+    /// # Examples
+    ///
+    ///```
+    /// use lamellar::array::prelude::*;
+    ///
+    /// let world = LamellarWorldBuilder::new().build();
+    /// let array_task = NativeAtomicArray::<usize>::new(&world,100,Distribution::Cyclic).spawn();
+    /// // do some other work
+    /// let array = array_task.block();
+    ///```
+    #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context.
Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] + pub fn spawn(mut self) -> LamellarTask> { + self.launched = true; + self.team.clone().spawn(self) + } +} + +impl Future for NativeAtomicArrayHandle { + type Output = NativeAtomicArray; + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + self.launched = true; + let mut this = self.project(); + let array = ready!(this.creation_future.as_mut().poll(cx)); + Poll::Ready(NativeAtomicArray { + array, + orig_t: NativeAtomicType::of::(), + }) + } +} diff --git a/src/array/native_atomic/rdma.rs b/src/array/native_atomic/rdma.rs index 84553794..6d1cdfde 100644 --- a/src/array/native_atomic/rdma.rs +++ b/src/array/native_atomic/rdma.rs @@ -24,7 +24,7 @@ impl LamellarArrayInternalGet for NativeAtomicArray { } } unsafe fn internal_at(&self, index: usize) -> ArrayRdmaAtHandle { - let buf: OneSidedMemoryRegion = self.array.team_rt().alloc_one_sided_mem_region(1); + let buf: OneSidedMemoryRegion = self.array.team_rt().alloc_one_sided_mem_region_or_panic(1); let req = self.exec_am_local(InitGetAm { array: self.clone(), index: index, diff --git a/src/array/operations/access.rs b/src/array/operations/access.rs index 8a189fd6..a143169e 100644 --- a/src/array/operations/access.rs +++ b/src/array/operations/access.rs @@ -36,7 +36,7 @@ use super::handle::{ /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); -/// let array = AtomicArray::::new(&world,100,Distribution::Block); +/// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let val = 10; @@ -48,7 +48,7 @@ use super::handle::{ /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); -/// let array = AtomicArray::::new(&world,100,Distribution::Block); +/// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let vals = vec![3,54,12,88,29,68]; /// let index = 10; @@ -62,7 +62,7 @@ use super::handle::{ /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); -/// let array = AtomicArray::::new(&world,100,Distribution::Block); +/// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let vals = vec![12,2,1,10000,12,13]; @@ -85,7 +85,7 @@ pub trait AccessOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 10; @@ -118,7 +118,7 @@ pub trait AccessOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = array.batch_store(indices,10); @@ -155,7 +155,7 @@ pub trait AccessOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let new = 10; @@ -189,7 +189,7 @@ pub trait AccessOps: 
private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = array.batch_swap(indices,10); @@ -242,7 +242,7 @@ pub trait AccessOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); -/// let array = UnsafeArray::::new(&world,100,Distribution::Block); +/// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let val = 10; @@ -254,7 +254,7 @@ pub trait AccessOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); -/// let array = UnsafeArray::::new(&world,100,Distribution::Block); +/// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let vals = vec![3,54,12,88,29,68]; /// let index = 10; @@ -268,7 +268,7 @@ pub trait AccessOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); -/// let array = UnsafeArray::::new(&world,100,Distribution::Block); +/// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let vals = vec![12,2,1,10000,12,13]; @@ -291,7 +291,7 @@ pub trait UnsafeAccessOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 10; @@ -324,7 +324,7 @@ pub trait UnsafeAccessOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = unsafe{array.batch_store(indices,10)}; @@ -361,7 +361,7 @@ pub trait UnsafeAccessOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let new = 10; @@ -395,7 +395,7 @@ pub trait UnsafeAccessOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = unsafe{array.batch_swap(indices,10)}; diff --git a/src/array/operations/arithmetic.rs b/src/array/operations/arithmetic.rs index 635e36ab..608b6a47 100644 --- a/src/array/operations/arithmetic.rs +++ b/src/array/operations/arithmetic.rs @@ -64,7 +64,7 @@ pub trait ElementArithmeticOps: /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); -/// let array = AtomicArray::::new(&world,100,Distribution::Block); +/// let array = 
AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let val = 10; @@ -76,7 +76,7 @@ pub trait ElementArithmeticOps: /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); -/// let array = AtomicArray::::new(&world,100,Distribution::Block); +/// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let vals = vec![3,54,12,88,29,68]; /// let index = 10; @@ -90,7 +90,7 @@ pub trait ElementArithmeticOps: /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); -/// let array = AtomicArray::::new(&world,100,Distribution::Block); +/// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let vals = vec![12,2,1,10000,12,13]; @@ -113,7 +113,7 @@ pub trait ArithmeticOps: private::LamellarArrayP /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 10; @@ -149,7 +149,7 @@ pub trait ArithmeticOps: private::LamellarArrayP /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = array.batch_add(indices,10); @@ -187,7 +187,7 @@ pub trait ArithmeticOps: private::LamellarArrayP /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 10; @@ -226,7 +226,7 @@ pub trait ArithmeticOps: private::LamellarArrayP /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = array.batch_fetch_add(indices,10); @@ -262,7 +262,7 @@ pub trait ArithmeticOps: private::LamellarArrayP /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 10; @@ -298,7 +298,7 @@ pub trait ArithmeticOps: private::LamellarArrayP /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = array.batch_sub(indices,10); @@ -336,7 +336,7 @@ pub trait ArithmeticOps: private::LamellarArrayP /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 10; @@ -375,7 +375,7 @@ pub trait ArithmeticOps: private::LamellarArrayP /// use 
lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = array.batch_fetch_sub(indices,10); @@ -411,7 +411,7 @@ pub trait ArithmeticOps: private::LamellarArrayP /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 10; @@ -447,7 +447,7 @@ pub trait ArithmeticOps: private::LamellarArrayP /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = array.batch_mul(indices,10); @@ -485,7 +485,7 @@ pub trait ArithmeticOps: private::LamellarArrayP /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 10; @@ -524,7 +524,7 @@ pub trait ArithmeticOps: private::LamellarArrayP /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = array.batch_fetch_mul(indices,10); @@ -560,7 +560,7 @@ pub trait ArithmeticOps: private::LamellarArrayP /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 10; @@ -596,7 +596,7 @@ pub trait ArithmeticOps: private::LamellarArrayP /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = array.batch_div(indices,10); @@ -634,7 +634,7 @@ pub trait ArithmeticOps: private::LamellarArrayP /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 10; @@ -673,7 +673,7 @@ pub trait ArithmeticOps: private::LamellarArrayP /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = array.batch_fetch_div(indices,10); @@ -709,7 +709,7 @@ pub trait ArithmeticOps: private::LamellarArrayP /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = 
AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 10; @@ -745,7 +745,7 @@ pub trait ArithmeticOps: private::LamellarArrayP /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = array.batch_rem(indices,10); @@ -783,7 +783,7 @@ pub trait ArithmeticOps: private::LamellarArrayP /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 10; @@ -822,7 +822,7 @@ pub trait ArithmeticOps: private::LamellarArrayP /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = array.batch_fetch_rem(indices,10); @@ -875,7 +875,7 @@ pub trait ArithmeticOps: private::LamellarArrayP /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); -/// let array = UnsafeArray::::new(&world,100,Distribution::Block); +/// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let val = 10; @@ -887,7 +887,7 @@ pub trait ArithmeticOps: private::LamellarArrayP /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); -/// let array = UnsafeArray::::new(&world,100,Distribution::Block); +/// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let vals = vec![3,54,12,88,29,68]; /// let index = 10; @@ -901,7 +901,7 @@ pub trait ArithmeticOps: private::LamellarArrayP /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); -/// let array = UnsafeArray::::new(&world,100,Distribution::Block); +/// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let vals = vec![12,2,1,10000,12,13]; @@ -926,7 +926,7 @@ pub trait UnsafeArithmeticOps: /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 10; @@ -962,7 +962,7 @@ pub trait UnsafeArithmeticOps: /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = unsafe{ array.batch_add(indices,10) }; @@ -1000,7 +1000,7 @@ pub trait UnsafeArithmeticOps: /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 10; @@ -1039,7 +1039,7 @@ pub 
trait UnsafeArithmeticOps: /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = unsafe{ array.batch_fetch_add(indices,10) }; @@ -1075,7 +1075,7 @@ pub trait UnsafeArithmeticOps: /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 10; @@ -1111,7 +1111,7 @@ pub trait UnsafeArithmeticOps: /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = unsafe{ array.batch_sub(indices,10) }; @@ -1149,7 +1149,7 @@ pub trait UnsafeArithmeticOps: /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 10; @@ -1188,7 +1188,7 @@ pub trait UnsafeArithmeticOps: /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = unsafe{ array.batch_fetch_sub(indices,10) }; @@ -1224,7 +1224,7 @@ pub trait UnsafeArithmeticOps: /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 10; @@ -1260,7 +1260,7 @@ pub trait UnsafeArithmeticOps: /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = unsafe{ array.batch_mul(indices,10) }; @@ -1298,7 +1298,7 @@ pub trait UnsafeArithmeticOps: /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 10; @@ -1337,7 +1337,7 @@ pub trait UnsafeArithmeticOps: /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = unsafe{ array.batch_fetch_mul(indices,10) }; @@ -1373,7 +1373,7 @@ pub trait UnsafeArithmeticOps: /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = 
UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 10; @@ -1409,7 +1409,7 @@ pub trait UnsafeArithmeticOps: /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = unsafe{ array.batch_div(indices,10) }; @@ -1447,7 +1447,7 @@ pub trait UnsafeArithmeticOps: /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 10; @@ -1486,7 +1486,7 @@ pub trait UnsafeArithmeticOps: /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = unsafe{ array.batch_fetch_div(indices,10) }; @@ -1522,7 +1522,7 @@ pub trait UnsafeArithmeticOps: /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 10; @@ -1558,7 +1558,7 @@ pub trait UnsafeArithmeticOps: /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = unsafe{ array.batch_rem(indices,10) }; @@ -1596,7 +1596,7 @@ pub trait UnsafeArithmeticOps: /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 10; @@ -1635,7 +1635,7 @@ pub trait UnsafeArithmeticOps: /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = unsafe{ array.batch_fetch_rem(indices,10) }; diff --git a/src/array/operations/bitwise.rs b/src/array/operations/bitwise.rs index 0c1b7f4d..96b09df7 100644 --- a/src/array/operations/bitwise.rs +++ b/src/array/operations/bitwise.rs @@ -51,7 +51,7 @@ pub trait ElementBitWiseOps: /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); -/// let array = AtomicArray::::new(&world,100,Distribution::Block); +/// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let val = 0b100101001; @@ -63,7 +63,7 @@ pub trait ElementBitWiseOps: /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); -/// let array = AtomicArray::::new(&world,100,Distribution::Block); +/// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let vals = 
vec![0x3,0x54,0b11101,88,29,0x68]; /// let index = 10; @@ -77,7 +77,7 @@ pub trait ElementBitWiseOps: /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); -/// let array = AtomicArray::::new(&world,100,Distribution::Block); +/// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let vals = vec![0x12,2,1,0b10000,12,0x13]; @@ -100,7 +100,7 @@ pub trait BitWiseOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 0b100101001; @@ -136,7 +136,7 @@ pub trait BitWiseOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = array.batch_bit_and(indices,10); @@ -174,7 +174,7 @@ pub trait BitWiseOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 10; @@ -213,7 +213,7 @@ pub trait BitWiseOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = array.batch_fetch_bit_and(indices,10); @@ -249,7 +249,7 @@ pub trait BitWiseOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 0b100101001; @@ -285,7 +285,7 @@ pub trait BitWiseOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = array.batch_bit_or(indices,10); @@ -323,7 +323,7 @@ pub trait BitWiseOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 10; @@ -362,7 +362,7 @@ pub trait BitWiseOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = array.batch_fetch_bit_or(indices,10); @@ -398,7 +398,7 @@ pub trait 
BitWiseOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 0b100101001; @@ -434,7 +434,7 @@ pub trait BitWiseOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = array.batch_bit_xor(indices,10); @@ -472,7 +472,7 @@ pub trait BitWiseOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 10; @@ -511,7 +511,7 @@ pub trait BitWiseOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = array.batch_fetch_bit_xor(indices,10); @@ -564,7 +564,7 @@ pub trait BitWiseOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); -/// let array = UnsafeArray::::new(&world,100,Distribution::Block); +/// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let val = 0b100101001; @@ -576,7 +576,7 @@ pub trait BitWiseOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); -/// let array = UnsafeArray::::new(&world,100,Distribution::Block); +/// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let vals = vec![0x3,0x54,0b11101,88,29,0x68]; /// let index = 10; @@ -590,7 +590,7 @@ pub trait BitWiseOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); -/// let array = UnsafeArray::::new(&world,100,Distribution::Block); +/// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let vals = vec![0x12,2,1,0b10000,12,0x13]; @@ -613,7 +613,7 @@ pub trait UnsafeBitWiseOps: private::LamellarArrayPrivate< /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 0b100101001; @@ -649,7 +649,7 @@ pub trait UnsafeBitWiseOps: private::LamellarArrayPrivate< /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = unsafe{ array.batch_bit_and(indices,10)}; @@ -687,7 +687,7 @@ pub trait UnsafeBitWiseOps: 
private::LamellarArrayPrivate< /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 10; @@ -726,7 +726,7 @@ pub trait UnsafeBitWiseOps: private::LamellarArrayPrivate< /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = unsafe{ array.batch_fetch_bit_and(indices,10)}; @@ -762,7 +762,7 @@ pub trait UnsafeBitWiseOps: private::LamellarArrayPrivate< /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 0b100101001; @@ -798,7 +798,7 @@ pub trait UnsafeBitWiseOps: private::LamellarArrayPrivate< /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = unsafe{ array.batch_bit_or(indices,10)}; @@ -836,7 +836,7 @@ pub trait UnsafeBitWiseOps: private::LamellarArrayPrivate< /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 10; @@ -875,7 +875,7 @@ pub trait UnsafeBitWiseOps: private::LamellarArrayPrivate< /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = unsafe{ array.batch_fetch_bit_or(indices,10)}; @@ -911,7 +911,7 @@ pub trait UnsafeBitWiseOps: private::LamellarArrayPrivate< /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 0b100101001; @@ -947,7 +947,7 @@ pub trait UnsafeBitWiseOps: private::LamellarArrayPrivate< /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = unsafe{ array.batch_bit_xor(indices,10)}; @@ -985,7 +985,7 @@ pub trait UnsafeBitWiseOps: private::LamellarArrayPrivate< /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 10; @@ -1024,7 +1024,7 @@ pub trait UnsafeBitWiseOps: 
private::LamellarArrayPrivate< /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = unsafe{ array.batch_fetch_bit_xor(indices,10)}; diff --git a/src/array/operations/compare_exchange.rs b/src/array/operations/compare_exchange.rs index 93497a82..364b0c92 100644 --- a/src/array/operations/compare_exchange.rs +++ b/src/array/operations/compare_exchange.rs @@ -57,7 +57,7 @@ pub trait ElementComparePartialEqOps: /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); -/// let array = AtomicArray::::new(&world,100,Distribution::Block); +/// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let current = 0; @@ -70,7 +70,7 @@ pub trait ElementComparePartialEqOps: /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); -/// let array = AtomicArray::::new(&world,100,Distribution::Block); +/// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let new_vals = vec![3,54,11101,88,29,68]; /// let current = 0; @@ -85,7 +85,7 @@ pub trait ElementComparePartialEqOps: /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); -/// let array = AtomicArray::::new(&world,100,Distribution::Block); +/// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let new_vals = vec![12,2,1,10000,12,13]; @@ -112,7 +112,7 @@ pub trait CompareExchangeOps: private::LamellarArrayPriv /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 10; @@ -153,7 +153,7 @@ pub trait CompareExchangeOps: private::LamellarArrayPriv /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let current = 0; @@ -214,7 +214,7 @@ pub trait CompareExchangeOps: private::LamellarArrayPriv /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); -/// let array = AtomicArray::::new(&world,100,Distribution::Block); +/// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,11,88,29,68]; /// let current = 0.0; @@ -228,7 +228,7 @@ pub trait CompareExchangeOps: private::LamellarArrayPriv /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); -/// let array = AtomicArray::::new(&world,100,Distribution::Block); +/// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let new_vals = vec![3.0,54.8,12.9,88.1,29.2,68.9]; /// let current = 0.0; @@ -244,7 +244,7 @@ pub trait CompareExchangeOps: private::LamellarArrayPriv /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); -/// let array = AtomicArray::::new(&world,100,Distribution::Block); +/// let array = 
AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let new_vals = vec![12.1,2.321,1.7,10000.0,12.4,13.7]; @@ -276,7 +276,7 @@ pub trait CompareExchangeEpsilonOps: /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 10.3; @@ -324,7 +324,7 @@ pub trait CompareExchangeEpsilonOps: /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let current = 0.0; @@ -384,7 +384,7 @@ pub trait CompareExchangeEpsilonOps: /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); -/// let array = UnsafeArray::::new(&world,100,Distribution::Block); +/// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let current = 0; @@ -397,7 +397,7 @@ pub trait CompareExchangeEpsilonOps: /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); -/// let array = UnsafeArray::::new(&world,100,Distribution::Block); +/// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let new_vals = vec![3,54,11101,88,29,68]; /// let current = 0; @@ -412,7 +412,7 @@ pub trait CompareExchangeEpsilonOps: /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); -/// let array = UnsafeArray::::new(&world,100,Distribution::Block); +/// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let new_vals = vec![12,2,1,10000,12,13]; @@ -441,7 +441,7 @@ pub trait UnsafeCompareExchangeOps: /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 10; @@ -487,7 +487,7 @@ pub trait UnsafeCompareExchangeOps: /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let current = 0; @@ -548,7 +548,7 @@ pub trait UnsafeCompareExchangeOps: /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); -/// let array = UnsafeArray::::new(&world,100,Distribution::Block); +/// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,11,88,29,68]; /// let current = 0.0; @@ -562,7 +562,7 @@ pub trait UnsafeCompareExchangeOps: /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); -/// let array = UnsafeArray::::new(&world,100,Distribution::Block); +/// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let new_vals = vec![3.0,54.8,12.9,88.1,29.2,68.9]; /// let current = 0.0; @@ -578,7 +578,7 @@ pub trait UnsafeCompareExchangeOps: /// use lamellar::array::prelude::*; /// /// let world = 
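The compare-exchange docs follow the same constructor change. The sketch below assumes the method names `compare_exchange` and `compare_exchange_epsilon` (mirroring the trait names), that the handle resolves to a per-element `Result`, and that a freshly created array is zero-initialized so exchanging against a `current` of 0 succeeds:

```rust
use lamellar::array::prelude::*;

let world = LamellarWorldBuilder::new().build();
let array = AtomicArray::<usize>::new(&world, 100, Distribution::Block).block();

let idx = 53;
// assumed signature: compare_exchange(index, current, new)
let req = array.compare_exchange(idx, 0, 10);
let result = array.block_on(req); // assumed Ok(previous) on success, Err(actual) otherwise
println!("exchange at {idx}: {result:?}");

// floating point arrays can tolerate rounding error via the epsilon variant
let farray = AtomicArray::<f32>::new(&world, 100, Distribution::Block).block();
// assumed signature: compare_exchange_epsilon(index, current, new, epsilon)
let req = farray.compare_exchange_epsilon(idx, 0.0, 10.3, 0.0001);
let result = farray.block_on(req);
println!("epsilon exchange at {idx}: {result:?}");
```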
LamellarWorldBuilder::new().build(); -/// let array = UnsafeArray::::new(&world,100,Distribution::Block); +/// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let new_vals = vec![12.1,2.321,1.7,10000.0,12.4,13.7]; @@ -610,7 +610,7 @@ pub trait UnsafeCompareExchangeEpsilonOps: /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 10.3; @@ -658,7 +658,7 @@ pub trait UnsafeCompareExchangeEpsilonOps: /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let current = 0.0; diff --git a/src/array/operations/handle.rs b/src/array/operations/handle.rs index 4d4fefdf..6fe5af10 100644 --- a/src/array/operations/handle.rs +++ b/src/array/operations/handle.rs @@ -196,7 +196,7 @@ pub struct ArrayFetchBatchOpHandle { results: Vec, } -enum FetchBatchOpState { +pub(crate) enum FetchBatchOpState { Reqs(VecDeque<(AmHandle>, Vec)>), Launched(VecDeque<(LamellarTask>, Vec)>), } diff --git a/src/array/operations/read_only.rs b/src/array/operations/read_only.rs index ee4c9c83..ccf516bd 100644 --- a/src/array/operations/read_only.rs +++ b/src/array/operations/read_only.rs @@ -32,7 +32,7 @@ use super::handle::{ArrayFetchBatchOpHandle, ArrayFetchOpHandle}; /// use futures_util::future::join_all; /// /// let world = LamellarWorldBuilder::new().build(); -/// let array = AtomicArray::::new(&world,100,Distribution::Block); +/// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let reqs = indices.iter().map(|i| array.load(*i)).collect::>(); @@ -64,7 +64,7 @@ pub trait ReadOnlyOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let req = array.load(53); /// let val = array.block_on(req); @@ -104,7 +104,7 @@ pub trait ReadOnlyOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = array.batch_load(indices.clone()); @@ -153,7 +153,7 @@ pub trait ReadOnlyOps: private::LamellarArrayPrivate { /// use futures_util::future::join_all; /// /// let world = LamellarWorldBuilder::new().build(); -/// let array = UnsafeArray::::new(&world,100,Distribution::Block); +/// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let reqs = indices.iter().map(|i| unsafe{array.load(*i)}).collect::>(); @@ -185,7 +185,7 @@ pub trait UnsafeReadOnlyOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = 
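For the read-only operations only the constructor changes; `load` and `batch_load` are still driven with `block_on` exactly as the docs show. A small self-contained version (element type assumed `usize`):

```rust
use lamellar::array::prelude::*;

let world = LamellarWorldBuilder::new().build();
let array = AtomicArray::<usize>::new(&world, 100, Distribution::Block).block();

// fetch a single element (which may live on a remote PE)
let req = array.load(53);
let val = array.block_on(req);
println!("element 53 = {val}");

// fetch a batch of elements in a single request
let indices = vec![3, 54, 12, 88, 29, 68];
let req = array.batch_load(indices.clone());
let vals = array.block_on(req);
println!("{indices:?} -> {vals:?}");
```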
UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let req = unsafe{ array.load(53)}; /// let val = array.block_on(req); @@ -225,7 +225,7 @@ pub trait UnsafeReadOnlyOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = unsafe{ array.batch_load(indices.clone())}; diff --git a/src/array/operations/shift.rs b/src/array/operations/shift.rs index 81a083e1..0b3b392b 100644 --- a/src/array/operations/shift.rs +++ b/src/array/operations/shift.rs @@ -45,7 +45,7 @@ pub trait ElementShiftOps: std::ops::ShlAssign + std::ops::ShrAssign + Dist + Si /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); -/// let array = AtomicArray::::new(&world,100,Distribution::Block); +/// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// array.block_on(array.batch_fetch_shl(indices,2)); @@ -67,7 +67,7 @@ pub trait ShiftOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 2; @@ -103,7 +103,7 @@ pub trait ShiftOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = array.batch_shl(indices,3); @@ -141,7 +141,7 @@ pub trait ShiftOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 2; @@ -180,7 +180,7 @@ pub trait ShiftOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = array.batch_fetch_shl(indices,10); @@ -216,7 +216,7 @@ pub trait ShiftOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 2; @@ -252,7 +252,7 @@ pub trait ShiftOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = array.batch_shr(indices,3); @@ -290,7 +290,7 @@ pub trait ShiftOps: private::LamellarArrayPrivate { /// use 
lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 2; @@ -329,7 +329,7 @@ pub trait ShiftOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = AtomicArray::::new(&world,100,Distribution::Block); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = array.batch_fetch_shr(indices,10); @@ -367,7 +367,7 @@ pub trait UnsafeShiftOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 2; @@ -403,7 +403,7 @@ pub trait UnsafeShiftOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = unsafe{ array.batch_shl(indices,3) }; @@ -441,7 +441,7 @@ pub trait UnsafeShiftOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 2; @@ -480,7 +480,7 @@ pub trait UnsafeShiftOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = unsafe{ array.batch_fetch_shl(indices,10) }; @@ -516,7 +516,7 @@ pub trait UnsafeShiftOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 2; @@ -552,7 +552,7 @@ pub trait UnsafeShiftOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = unsafe{ array.batch_shr(indices,3) }; @@ -590,7 +590,7 @@ pub trait UnsafeShiftOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let idx = 53; /// let val = 2; @@ -629,7 +629,7 @@ pub trait UnsafeShiftOps: private::LamellarArrayPrivate { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - 
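The shift traits use the same pattern; here is a short sketch of the batched calls named above (`batch_shl`, `batch_fetch_shr`), again assuming `usize` elements and a `Vec` of previous values from the fetch variant:

```rust
use lamellar::array::prelude::*;

let world = LamellarWorldBuilder::new().build();
let array = AtomicArray::<usize>::new(&world, 100, Distribution::Block).block();

let indices = vec![3, 54, 12, 88, 29, 68];

// left-shift each listed element by 3 bits
let req = array.batch_shl(indices.clone(), 3);
array.block_on(req);

// right-shift each listed element and return the previous values
let req = array.batch_fetch_shr(indices, 10);
let prev = array.block_on(req);
println!("previous values: {prev:?}");
```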
/// let array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; /// let req = unsafe{ array.batch_fetch_shr(indices,10) }; diff --git a/src/array/prelude.rs b/src/array/prelude.rs index 055c900e..0823236a 100644 --- a/src/array/prelude.rs +++ b/src/array/prelude.rs @@ -1,10 +1,10 @@ -pub use crate::array::atomic::AtomicArray; -pub use crate::array::generic_atomic::GenericAtomicArray; -pub use crate::array::global_lock_atomic::GlobalLockArray; -pub use crate::array::local_lock_atomic::LocalLockArray; -pub use crate::array::native_atomic::NativeAtomicArray; -pub use crate::array::r#unsafe::UnsafeArray; -pub use crate::array::read_only::ReadOnlyArray; +pub use crate::array::atomic::{AtomicArray,AtomicArrayHandle}; +pub use crate::array::generic_atomic::{GenericAtomicArray}; +pub use crate::array::global_lock_atomic::{GlobalLockArray,handle::GlobalLockArrayHandle}; +pub use crate::array::local_lock_atomic::{LocalLockArray,handle::LocalLockArrayHandle}; +pub use crate::array::native_atomic::{NativeAtomicArray}; +pub use crate::array::r#unsafe::{UnsafeArray,UnsafeArrayHandle}; +pub use crate::array::read_only::{ReadOnlyArray,ReadOnlyArrayHandle}; //#[doc(hidden)] pub use crate::array::{ register_reduction, @@ -32,7 +32,9 @@ pub use crate::array::operations::{ AccessOps, ArithmeticOps, ArrayOps as _ArrayOps, BitWiseOps, CompareExchangeEpsilonOps, CompareExchangeOps, ElementArithmeticOps, ElementBitWiseOps, ElementCompareEqOps, ElementComparePartialEqOps, ElementOps, ElementShiftOps, LocalArithmeticOps, LocalAtomicOps, - LocalBitWiseOps, LocalShiftOps, OpInput, ReadOnlyOps, ShiftOps, UnsafeAccessOps, UnsafeArithmeticOps, UnsafeBitWiseOps, UnsafeCompareExchangeEpsilonOps, UnsafeCompareExchangeOps, UnsafeShiftOps, UnsafeReadOnlyOps + LocalBitWiseOps, LocalShiftOps, OpInput, ReadOnlyOps, ShiftOps, UnsafeAccessOps, + UnsafeArithmeticOps, UnsafeBitWiseOps, UnsafeCompareExchangeEpsilonOps, + UnsafeCompareExchangeOps, UnsafeReadOnlyOps, UnsafeShiftOps, }; // pub use crate::array::operations::*; diff --git a/src/array/read_only.rs b/src/array/read_only.rs index 56dc8935..9e359de1 100644 --- a/src/array/read_only.rs +++ b/src/array/read_only.rs @@ -1,3 +1,6 @@ +pub(crate) mod handle; +pub use handle::ReadOnlyArrayHandle; + mod iteration; pub(crate) mod local_chunks; pub use local_chunks::ReadOnlyLocalChunks; @@ -69,16 +72,23 @@ impl ReadOnlyArray { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic).block(); pub fn new>( team: U, array_size: usize, distribution: Distribution, - ) -> ReadOnlyArray { - let array = UnsafeArray::new(team, array_size, distribution); - array.block_on_outstanding(DarcMode::ReadOnlyArray); - - ReadOnlyArray { array: array } + ) -> ReadOnlyArrayHandle { + let team = team.into().team.clone(); + ReadOnlyArrayHandle { + team: team.clone(), + launched: false, + creation_future: Box::pin(UnsafeArray::async_new( + team, + array_size, + distribution, + DarcMode::ReadOnlyArray, + )), + } } #[doc(alias("One-sided", "onesided"))] @@ -91,9 +101,9 @@ impl ReadOnlyArray { ///``` /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic); + /// 
let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic).block(); /// // do something interesting... or not - /// let block_view = array.clone().use_distribution(Distribution::Block); + /// let block_view = array.clone().use_distribution(Distribution::Block).block(); ///``` pub fn use_distribution(self, distribution: Distribution) -> Self { ReadOnlyArray { @@ -114,7 +124,7 @@ impl ReadOnlyArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic).block(); /// /// let slice = array.local_as_slice(); /// println!("PE{my_pe} data: {slice:?}"); @@ -136,7 +146,7 @@ impl ReadOnlyArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic).block(); /// /// let slice = array.local_as_slice(); /// println!("PE{my_pe} data: {slice:?}"); @@ -164,7 +174,7 @@ impl ReadOnlyArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic).block(); /// /// let unsafe_array = array.into_unsafe(); ///``` @@ -175,7 +185,7 @@ impl ReadOnlyArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic).block(); /// /// let array1 = array.clone(); /// let slice = array1.local_data(); @@ -214,7 +224,7 @@ impl ReadOnlyArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic).block(); /// /// let local_lock_array = array.into_local_lock(); ///``` @@ -224,7 +234,7 @@ impl ReadOnlyArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic).block(); /// /// let array1 = array.clone(); /// let slice = unsafe {array1.local_data()}; @@ -258,7 +268,7 @@ impl ReadOnlyArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic).block(); /// /// let global_lock_array = array.into_global_lock(); ///``` @@ -268,7 +278,7 @@ impl ReadOnlyArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic); + /// let array: 
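Putting the updated `ReadOnlyArray` docs together: construction now goes through the handle, while local access and the `into_*` conversions keep the signatures shown above. A short sketch (per the docs, a conversion only returns once a single reference to the array remains on each PE):

```rust
use lamellar::array::prelude::*;

let world = LamellarWorldBuilder::new().build();
let my_pe = world.my_pe();
let array: ReadOnlyArray<usize> = ReadOnlyArray::new(&world, 100, Distribution::Cyclic).block();

// read-only access to the portion of the array stored on this PE
let slice = array.local_as_slice();
println!("PE{my_pe} data: {slice:?}");

// convert into another array flavor; any other local handles must be dropped first
let local_lock_array = array.into_local_lock();
```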
ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic).block(); /// /// let array1 = array.clone(); /// let slice = unsafe {array1.local_data()}; @@ -304,7 +314,7 @@ impl ReadOnlyArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic).block(); /// /// let atomic_array = array.into_local_lock(); ///``` @@ -314,7 +324,7 @@ impl ReadOnlyArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic).block(); /// /// let array1 = array.clone(); /// let slice = array1.local_data(); @@ -454,7 +464,7 @@ impl ReadOnlyArray { /// /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); - /// let array = AtomicArray::::new(&world,1000000,Distribution::Block); + /// let array = AtomicArray::::new(&world,1000000,Distribution::Block).block(); /// let array_clone = array.clone(); /// let _ = array.local_iter().for_each(move |_| { /// let index = rand::thread_rng().gen_range(0..array_clone.len()); @@ -487,7 +497,7 @@ impl ReadOnlyArray { /// use rand::Rng; /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); - /// let array = AtomicArray::::new(&world,1000000,Distribution::Block); + /// let array = AtomicArray::::new(&world,1000000,Distribution::Block).block(); /// let array_clone = array.clone(); /// let _ = array.local_iter().for_each(move |_| { /// let index = rand::thread_rng().gen_range(0..array_clone.len()); @@ -518,7 +528,7 @@ impl ReadOnlyArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); - /// let array = AtomicArray::::new(&world,10,Distribution::Block); + /// let array = AtomicArray::::new(&world,10,Distribution::Block).block(); /// let _ = array.dist_iter().enumerate().for_each(move |(i,elem)| { /// elem.store(i+1); /// }).block(); @@ -548,7 +558,7 @@ impl ReadOnlyArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); - /// let array = AtomicArray::::new(&world,10,Distribution::Block); + /// let array = AtomicArray::::new(&world,10,Distribution::Block).block(); /// let _ = array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2)).block(); /// array.wait_all(); /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE @@ -575,7 +585,7 @@ impl ReadOnlyArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); - /// let array = AtomicArray::::new(&world,10,Distribution::Block); + /// let array = AtomicArray::::new(&world,10,Distribution::Block).block(); /// let _ = array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2)).block(); /// array.wait_all(); /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE @@ -758,8 +768,8 @@ impl ReadOnlyArray { ///``` /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); - /// let block_array = 
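The iterator examples above outline the typical lifecycle of filling an `AtomicArray` and then freezing it; collected into one place it looks roughly like this (taken directly from the patterns in the updated docs):

```rust
use lamellar::array::prelude::*;

let world = LamellarWorldBuilder::new().build();
let array = AtomicArray::<usize>::new(&world, 10, Distribution::Block).block();

// initialize every element in parallel across the PEs, then drive the iterator
let _ = array
    .dist_iter()
    .enumerate()
    .for_each(move |(i, elem)| elem.store(i * 2))
    .block();
array.wait_all();

// only returns once there is a single reference remaining on each PE
let array = array.into_read_only();
println!("local data: {:?}", array.local_as_slice());
```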
ReadOnlyArray::::new(&world,100,Distribution::Block); - /// let cyclic_array = ReadOnlyArray::::new(&world,100,Distribution::Block); + /// let block_array = ReadOnlyArray::::new(&world,100,Distribution::Block).block(); + /// let cyclic_array = ReadOnlyArray::::new(&world,100,Distribution::Block).block(); /// /// block_array.print(); /// println!(); diff --git a/src/array/read_only/handle.rs b/src/array/read_only/handle.rs new file mode 100644 index 00000000..95691682 --- /dev/null +++ b/src/array/read_only/handle.rs @@ -0,0 +1,93 @@ +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; + +use super::{ArrayOps, ReadOnlyArray}; +use crate::scheduler::LamellarTask; +use crate::warnings::RuntimeWarning; +use crate::{Dist, LamellarTeamRT, UnsafeArray}; + +use futures_util::{ready, Future}; +use pin_project::{pin_project, pinned_drop}; + +#[must_use = " ReadOnlyArray 'new' handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] +#[pin_project(PinnedDrop)] +#[doc(alias = "Collective")] +/// This is a handle representing the operation of creating a new [ReadOnlyArray]. +/// This handled must either be awaited in an async context or blocked on in a non-async context for the operation to be performed. +/// Awaiting/blocking on the handle is a blocking collective call amongst all PEs in the ReadOnlyArray's team, only returning once every PE in the team has completed the call. +/// +/// # Collective Operation +/// Requires all PEs associated with the `ReadOnlyArray` to await/block the handle otherwise deadlock will occur (i.e. team barriers are being called internally) +/// +/// # Examples +/// ``` +/// use lamellar::array::prelude::*; +/// +/// let world = LamellarWorldBuilder::new().build(); +/// +/// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic).block(); +/// ``` +pub struct ReadOnlyArrayHandle { + pub(crate) team: Pin>, + pub(crate) launched: bool, + #[pin] + pub(crate) creation_future: Pin> + Send>>, +} + +#[pinned_drop] +impl PinnedDrop for ReadOnlyArrayHandle { + fn drop(self: Pin<&mut Self>) { + if !self.launched { + RuntimeWarning::DroppedHandle("a ReadOnlyArrayHandle").print(); + } + } +} + +impl ReadOnlyArrayHandle { + /// Used to drive creation of a new ReadOnlyArray + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic).block(); + pub fn block(mut self) -> ReadOnlyArray { + self.launched = true; + RuntimeWarning::BlockingCall( + "ReadOnlyArrayHandle::block", + ".spawn() or.await", + ) + .print(); + self.team.clone().block_on(self) + } + + /// This method will spawn the creation of the ReadOnlyArray on the work queue + /// + /// This function returns a handle that can be used to wait for the operation to complete + /// /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array_task: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic).spawn(); + /// // do some other work + /// let array = array_task.block(); + #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. 
Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] + pub fn spawn(mut self) -> LamellarTask> { + self.launched = true; + self.team.clone().spawn(self) + } +} + +impl Future for ReadOnlyArrayHandle { + type Output = ReadOnlyArray; + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + self.launched = true; + let mut this = self.project(); + let array = ready!(this.creation_future.as_mut().poll(cx)); + Poll::Ready(ReadOnlyArray { array }) + } +} diff --git a/src/array/read_only/local_chunks.rs b/src/array/read_only/local_chunks.rs index 0a881af9..f8c09300 100644 --- a/src/array/read_only/local_chunks.rs +++ b/src/array/read_only/local_chunks.rs @@ -102,7 +102,7 @@ impl ReadOnlyArray { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,40,Distribution::Block); + /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,40,Distribution::Block).block(); /// let my_pe = world.my_pe(); /// /// let _ = array.local_chunks(5).enumerate().for_each(move|(i,chunk)| { diff --git a/src/array/read_only/rdma.rs b/src/array/read_only/rdma.rs index f9cdbad1..5bccf4d8 100644 --- a/src/array/read_only/rdma.rs +++ b/src/array/read_only/rdma.rs @@ -22,7 +22,7 @@ impl ReadOnlyArray { /// /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array = ReadOnlyArray::::new(&world,12,Distribution::Block); + /// let array = ReadOnlyArray::::new(&world,12,Distribution::Block).block(); /// let buf = world.alloc_one_sided_mem_region::(12); /// unsafe { /// for elem in buf.as_mut_slice() @@ -88,7 +88,7 @@ impl ReadOnlyArray { /// /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array = ReadOnlyArray::::new(&world,12,Distribution::Block); + /// let array = ReadOnlyArray::::new(&world,12,Distribution::Block).block(); /// let buf = world.alloc_one_sided_mem_region::(12); /// unsafe { /// for elem in buf.as_mut_slice() diff --git a/src/array/unsafe.rs b/src/array/unsafe.rs index 67eff7cf..7761c9aa 100644 --- a/src/array/unsafe.rs +++ b/src/array/unsafe.rs @@ -1,10 +1,11 @@ +pub(crate) mod handle; mod iteration; - pub(crate) mod local_chunks; -// pub use local_chunks::{}; pub(crate) mod operations; mod rdma; +pub use handle::UnsafeArrayHandle; + use crate::active_messaging::ActiveMessaging; use crate::active_messaging::*; // use crate::array::r#unsafe::operations::BUFOPS; @@ -150,118 +151,31 @@ impl UnsafeArray { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic); + /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic).block(); pub fn new>( team: U, array_size: usize, distribution: Distribution, - ) -> UnsafeArray { + ) -> UnsafeArrayHandle { + // let temp_team = team.into(); let team = team.into().team.clone(); - team.tasking_barrier(); - let task_group = LamellarTaskGroup::new(team.clone()); - let my_pe = team.team_pe_id().unwrap(); - let num_pes = team.num_pes(); - let full_array_size = std::cmp::max(array_size, num_pes); - - let elem_per_pe = full_array_size / num_pes; - let remaining_elems = full_array_size % num_pes; - let mut per_pe_size = elem_per_pe; - if remaining_elems > 0 { - per_pe_size += 1 + UnsafeArrayHandle { + team: team.clone(), + launched: false, + creation_future: Box::pin(UnsafeArray::async_new( + team, + array_size, + 
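A new `ReadOnlyArrayHandle` can be driven in a few different ways; the sketch below shows blocking and awaiting (the `.spawn()` path is illustrated with the `UnsafeArrayHandle` further down). Since creation is a collective operation, every PE in the team must drive its handle:

```rust
use lamellar::array::prelude::*;

let world = LamellarWorldBuilder::new().build();

// non-async context: block() drives the collective creation to completion
let array: ReadOnlyArray<usize> = ReadOnlyArray::new(&world, 100, Distribution::Cyclic).block();
println!("len = {}", array.len());

// async context: the handle implements Future, so it can simply be awaited
let world2 = world.clone();
let array2 = world.block_on(async move {
    ReadOnlyArray::<usize>::new(&world2, 100, Distribution::Cyclic).await
});
println!("len = {}", array2.len());
```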
distribution, + DarcMode::UnsafeArray, + )), } - // println!("new unsafe array {:?} {:?}", elem_per_pe, per_pe_size); - let rmr_t: MemoryRegion = if team.num_world_pes == team.num_pes { - MemoryRegion::new(per_pe_size, team.lamellae.clone(), AllocationType::Global) - } else { - MemoryRegion::new( - per_pe_size, - team.lamellae.clone(), - AllocationType::Sub(team.get_pes()), - ) - }; - // let rmr = MemoryRegion::new( - // per_pe_size * std::mem::size_of::(), - // team.lamellae.clone(), - // AllocationType::Global, - // ); - // println!("new array {:?}",rmr_t.as_ptr()); - - unsafe { - // for elem in rmr_t.as_mut_slice().expect("data should exist on pe") { - // *elem = std::mem::zeroed(); - // } - if std::mem::needs_drop::() { - // If `T` needs to be dropped then we have to do this one item at a time, in - // case one of the intermediate drops does a panic. - // slice.iter_mut().for_each(write_zeroes); - panic!("need drop not yet supported"); - } else { - // Otherwise we can be really fast and just fill everthing with zeros. - let len = std::mem::size_of_val::<[T]>( - rmr_t.as_mut_slice().expect("data should exist on pe"), - ); - std::ptr::write_bytes( - rmr_t.as_mut_ptr().expect("data should exist on pe") as *mut u8, - 0u8, - len, - ) - } - } - let rmr = unsafe { rmr_t.to_base::() }; - // println!("new array u8 {:?}",rmr.as_ptr()); - - let data = Darc::try_new_with_drop( - team.clone(), - UnsafeArrayData { - mem_region: rmr, - array_counters: Arc::new(AMCounters::new()), - team: team.clone(), - task_group: Arc::new(task_group), - my_pe: my_pe, - num_pes: num_pes, - req_cnt: Arc::new(AtomicUsize::new(0)), - }, - crate::darc::DarcMode::UnsafeArray, - None, - ) - .expect("trying to create array on non team member"); - // println!("new unsafe array darc {:?}", data); - // data.print(); - let array = UnsafeArray { - inner: UnsafeArrayInner { - data: data, - distribution: distribution.clone(), - // wait: wait, - orig_elem_per_pe: elem_per_pe, - orig_remaining_elems: remaining_elems, - elem_size: std::mem::size_of::(), - offset: 0, //relative to size of T - size: full_array_size, //relative to size of T - sub: false, - }, - phantom: PhantomData, - }; - // println!("new unsafe"); - // unsafe {println!("size {:?} bytes {:?}",array.inner.size, array.inner.data.mem_region.as_mut_slice().unwrap().len())}; - // println!("elem per pe {:?}", elem_per_pe); - // for i in 0..num_pes{ - // println!("pe: {:?} {:?}",i,array.inner.num_elems_pe(i)); - // } - // array.inner.data.print(); - if full_array_size != array_size { - println!("WARNING: Array size {array_size} is less than number of pes {full_array_size}, each PE will not contain data"); - array.sub_array(0..array_size) - } else { - array - } - // println!("after buffered ops"); - // array.inner.data.print(); } pub(crate) async fn async_new>( team: U, array_size: usize, distribution: Distribution, + darc_mode: DarcMode, ) -> UnsafeArray { let team = team.into().team.clone(); team.async_barrier().await; @@ -285,11 +199,6 @@ impl UnsafeArray { AllocationType::Sub(team.get_pes()), ) }; - // let rmr = MemoryRegion::new( - // per_pe_size * std::mem::size_of::(), - // team.lamellae.clone(), - // AllocationType::Global, - // ); unsafe { // for elem in rmr_t.as_mut_slice().expect("data should exist on pe") { @@ -299,7 +208,7 @@ impl UnsafeArray { // If `T` needs to be dropped then we have to do this one item at a time, in // case one of the intermediate drops does a panic. 
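On the `UnsafeArray` side, the public constructor now simply wraps the shared `async_new` path (which additionally carries the target `DarcMode`) in the same kind of handle. A minimal sketch of the public API; the loop writing `my_pe` into the local slice is purely illustrative:

```rust
use lamellar::array::prelude::*;

let world = LamellarWorldBuilder::new().build();
let my_pe = world.my_pe();

// collective creation; every PE must drive its handle with block(), spawn(), or .await
let array: UnsafeArray<usize> = UnsafeArray::new(&world, 100, Distribution::Cyclic).block();

unsafe {
    // direct, unsynchronized access to this PE's local portion of the array
    for elem in array.local_as_mut_slice() {
        *elem = my_pe;
    }
}
world.barrier();
```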
// slice.iter_mut().for_each(write_zeroes); - panic!("need drop not yet supported"); + panic!("Lamellar Arrays do not yet support elements that impl Drop"); } else { // Otherwise we can be really fast and just fill everthing with zeros. let len = std::mem::size_of_val::<[T]>( @@ -314,7 +223,7 @@ impl UnsafeArray { } let rmr = unsafe { rmr_t.to_base::() }; - let data = Darc::try_new_with_drop( + let data = Darc::async_try_new_with_drop( team.clone(), UnsafeArrayData { mem_region: rmr, @@ -325,15 +234,15 @@ impl UnsafeArray { num_pes: num_pes, req_cnt: Arc::new(AtomicUsize::new(0)), }, - crate::darc::DarcMode::UnsafeArray, + darc_mode, None, ) + .await .expect("trying to create array on non team member"); let array = UnsafeArray { inner: UnsafeArrayInner { data: data, distribution: distribution.clone(), - // wait: wait, orig_elem_per_pe: elem_per_pe, orig_remaining_elems: remaining_elems, elem_size: std::mem::size_of::(), @@ -371,9 +280,9 @@ impl UnsafeArray { ///``` /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); - /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic); + /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic).block(); /// // do something interesting... or not - /// let block_view = array.clone().use_distribution(Distribution::Block); + /// let block_view = array.clone().use_distribution(Distribution::Block).block(); ///``` pub fn use_distribution(mut self, distribution: Distribution) -> Self { self.inner.distribution = distribution; @@ -395,7 +304,7 @@ impl UnsafeArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic); + /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic).block(); /// /// unsafe { /// let slice = array.local_as_slice(); @@ -421,7 +330,7 @@ impl UnsafeArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic); + /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic).block(); /// /// unsafe { /// let slice = array.local_as_mut_slice(); @@ -454,7 +363,7 @@ impl UnsafeArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic); + /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic).block(); /// unsafe { /// let slice = array.local_data(); /// println!("PE{my_pe} data: {slice:?}"); @@ -479,7 +388,7 @@ impl UnsafeArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic); + /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic).block(); /// /// unsafe { /// let slice = array.mut_local_data(); @@ -506,7 +415,7 @@ impl UnsafeArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic); + /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic).block(); /// /// assert_eq!(array.sub_array_range(),(0..100)); /// @@ 
-615,7 +524,7 @@ impl UnsafeArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic); + /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic).block(); /// /// let read_only_array = array.into_read_only(); ///``` @@ -626,7 +535,7 @@ impl UnsafeArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic); + /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic).block(); /// /// let array1 = array.clone(); /// let mut_slice = unsafe {array1.local_as_mut_slice()}; @@ -665,7 +574,7 @@ impl UnsafeArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic); + /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic).block(); /// /// let local_lock_array = array.into_local_lock(); ///``` @@ -675,7 +584,7 @@ impl UnsafeArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic); + /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic).block(); /// /// let array1 = array.clone(); /// let mut_slice = unsafe {array1.local_as_mut_slice()}; @@ -709,7 +618,7 @@ impl UnsafeArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic); + /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic).block(); /// /// let global_lock_array = array.into_global_lock(); ///``` @@ -719,7 +628,7 @@ impl UnsafeArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic); + /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic).block(); /// /// let array1 = array.clone(); /// let slice = unsafe {array1.local_data()}; @@ -763,7 +672,7 @@ impl UnsafeArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic); + /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic).block(); /// /// let atomic_array = array.into_local_lock(); ///``` @@ -773,7 +682,7 @@ impl UnsafeArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic); + /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic).block(); /// /// let array1 = array.clone(); /// let mut_slice = unsafe {array1.local_as_mut_slice()}; @@ -816,8 +725,13 @@ impl AsyncTeamFrom<(Vec, Distribution)> for UnsafeArray::async_new(team.clone(), team.num_pes, Distribution::Block).await; + let local_sizes = UnsafeArray::::async_new( + team.clone(), + team.num_pes, + Distribution::Block, + crate::darc::DarcMode::UnsafeArray, + ) + 
.await; unsafe { local_sizes.local_as_mut_slice()[0] = local_vals.len(); } @@ -840,7 +754,13 @@ impl AsyncTeamFrom<(Vec, Distribution)> for UnsafeArray::async_new(team.clone(), size, distribution).await; + let array = UnsafeArray::::async_new( + team.clone(), + size, + distribution, + crate::darc::DarcMode::UnsafeArray, + ) + .await; if local_vals.len() > 0 { unsafe { array.put(my_start, local_vals).await }; } @@ -853,9 +773,9 @@ impl TeamFrom<(&Vec, Distribution)> for UnsafeArray { fn team_from(input: (&Vec, Distribution), team: &Pin>) -> Self { let (local_vals, distribution) = input; // println!("local_vals len: {:?}", local_vals.len()); - team.tasking_barrier(); + // team.tasking_barrier(); let local_sizes = - UnsafeArray::::new(team.clone(), team.num_pes, Distribution::Block); + UnsafeArray::::new(team.clone(), team.num_pes, Distribution::Block).block(); unsafe { local_sizes.local_as_mut_slice()[0] = local_vals.len(); } @@ -875,7 +795,7 @@ impl TeamFrom<(&Vec, Distribution)> for UnsafeArray { } }); } - let array = UnsafeArray::::new(team.clone(), size, distribution); + let array = UnsafeArray::::new(team.clone(), size, distribution).block(); if local_vals.len() > 0 { array.block_on(unsafe { array.put(my_start, local_vals) }); } @@ -1346,8 +1266,8 @@ impl UnsafeArray { ///``` /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); - /// let block_array = UnsafeArray::::new(&world,100,Distribution::Block); - /// let cyclic_array = UnsafeArray::::new(&world,100,Distribution::Block); + /// let block_array = UnsafeArray::::new(&world,100,Distribution::Block).block(); + /// let cyclic_array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// unsafe{ /// let _ =block_array.dist_iter_mut().enumerate().for_each(move |(i,elem)| { @@ -1435,7 +1355,7 @@ impl UnsafeArray { /// use rand::Rng; /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); - /// let array = AtomicArray::::new(&world,1000000,Distribution::Block); + /// let array = AtomicArray::::new(&world,1000000,Distribution::Block).block(); /// let array_clone = array.clone(); /// unsafe { // THIS IS NOT SAFE -- we are randomly updating elements, no protections, updates may be lost... DONT DO THIS /// let req = array.local_iter().for_each(move |_| { @@ -1473,7 +1393,7 @@ impl UnsafeArray { /// use rand::Rng; /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); - /// let array = UnsafeArray::::new(&world,1000000,Distribution::Block); + /// let array = UnsafeArray::::new(&world,1000000,Distribution::Block).block(); /// let array_clone = array.clone(); /// unsafe { // THIS IS NOT SAFE -- we are randomly updating elements, no protections, updates may be lost... 
DONT DO THIS /// let req = array.local_iter().for_each(move |_| { @@ -1511,7 +1431,7 @@ impl UnsafeArray { /// use rand::Rng; /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); - /// let array = UnsafeArray::::new(&world,10,Distribution::Block); + /// let array = UnsafeArray::::new(&world,10,Distribution::Block).block(); /// unsafe { /// let req = array.dist_iter_mut().enumerate().for_each(move |(i,elem)| { /// *elem = i+1; @@ -1547,7 +1467,7 @@ impl UnsafeArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); - /// let array = UnsafeArray::::new(&world,10,Distribution::Block); + /// let array = UnsafeArray::::new(&world,10,Distribution::Block).block(); /// let array_clone = array.clone(); /// let _ = unsafe{array.dist_iter_mut().enumerate().for_each(|(i,elem)| *elem = i*2).spawn()}; //safe as we are accessing in a data parallel fashion /// array.wait_all(); @@ -1580,7 +1500,7 @@ impl UnsafeArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let num_pes = world.num_pes(); - /// let array = UnsafeArray::::new(&world,10,Distribution::Block); + /// let array = UnsafeArray::::new(&world,10,Distribution::Block).block(); /// let array_clone = array.clone(); /// let _ = unsafe{array.dist_iter_mut().enumerate().for_each(|(i,elem)| *elem = i*2).spawn()}; //safe as we are accessing in a data parallel fashion /// array.wait_all(); diff --git a/src/array/unsafe/handle.rs b/src/array/unsafe/handle.rs new file mode 100644 index 00000000..58773079 --- /dev/null +++ b/src/array/unsafe/handle.rs @@ -0,0 +1,93 @@ +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; + +use super::{ArrayOps, UnsafeArray}; +use crate::scheduler::LamellarTask; +use crate::warnings::RuntimeWarning; +use crate::{Dist, LamellarTeamRT}; + +use futures_util::{ready, Future}; +use pin_project::{pin_project, pinned_drop}; + +#[must_use = " UnsafeArray 'new' handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] +#[pin_project(PinnedDrop)] +#[doc(alias = "Collective")] +/// This is a handle representing the operation of creating a new [UnsafeArray]. +/// This handled must either be awaited in an async context or blocked on in a non-async context for the operation to be performed. +/// Awaiting/blocking on the handle is a blocking collective call amongst all PEs in the UnsafeArray's team, only returning once every PE in the team has completed the call. +/// +/// # Collective Operation +/// Requires all PEs associated with the `UnsafeArray` to await/block the handle otherwise deadlock will occur (i.e. 
team barriers are being called internally) +/// +/// # Examples +/// ``` +/// use lamellar::array::prelude::*; +/// +/// let world = LamellarWorldBuilder::new().build(); +/// +/// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic).block(); +/// ``` +pub struct UnsafeArrayHandle { + pub(crate) team: Pin>, + pub(crate) launched: bool, + #[pin] + pub(crate) creation_future: Pin> + Send>>, +} + +#[pinned_drop] +impl PinnedDrop for UnsafeArrayHandle { + fn drop(self: Pin<&mut Self>) { + if !self.launched { + RuntimeWarning::DroppedHandle("a UnsafeArrayHandle").print(); + } + } +} + +impl UnsafeArrayHandle { + /// Used to drive creation of a new UnsafeArray + /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic).block(); + pub fn block(mut self) -> UnsafeArray { + self.launched = true; + RuntimeWarning::BlockingCall( + "UnsafeArrayHandle::block", + ".spawn() or.await", + ) + .print(); + self.team.clone().block_on(self) + } + + /// This method will spawn the creation of the UnsafeArray on the work queue + /// + /// This function returns a handle that can be used to wait for the operation to complete + /// /// # Examples + /// + ///``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array_task: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic).spawn(); + /// // do some other work + /// let array = array_task.block(); + #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] + pub fn spawn(mut self) -> LamellarTask> { + self.launched = true; + self.team.clone().spawn(self) + } +} + +impl Future for UnsafeArrayHandle { + type Output = UnsafeArray; + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + self.launched = true; + let mut this = self.project(); + let array = ready!(this.creation_future.as_mut().poll(cx)); + Poll::Ready(array) + } +} diff --git a/src/array/unsafe/iteration.rs b/src/array/unsafe/iteration.rs index 05011453..d3f9e4ad 100644 --- a/src/array/unsafe/iteration.rs +++ b/src/array/unsafe/iteration.rs @@ -26,7 +26,7 @@ impl UnsafeArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic); + /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic).block(); /// /// unsafe { /// world.block_on( @@ -53,7 +53,7 @@ impl UnsafeArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic); + /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic).block(); /// /// unsafe { /// world.block_on( @@ -81,7 +81,7 @@ impl UnsafeArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic); + /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic).block(); /// /// unsafe { /// world.block_on( @@ -109,7 +109,7 
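As with the read-only handle, dropping an `UnsafeArrayHandle` without driving it triggers a `RuntimeWarning`; `.spawn()` instead hands the collective creation to the work queue and returns a `LamellarTask` that can be blocked on later, as in this sketch:

```rust
use lamellar::array::prelude::*;

let world = LamellarWorldBuilder::new().build();

// enqueue the collective creation and keep the returned task
let array_task = UnsafeArray::<usize>::new(&world, 100, Distribution::Cyclic).spawn();

// ...do some other, non-collective work here...

// retrieve the finished array; this blocks until creation has completed on this PE
let array = array_task.block();
println!("array len = {}", array.len());
```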
@@ impl UnsafeArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic); + /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic).block(); /// /// unsafe { /// world.block_on( @@ -136,7 +136,7 @@ impl UnsafeArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic); + /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic).block(); /// /// unsafe { /// if my_pe == 0 { @@ -171,7 +171,7 @@ impl UnsafeArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic); + /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic).block(); /// /// unsafe { /// if my_pe == 0 { diff --git a/src/array/unsafe/local_chunks.rs b/src/array/unsafe/local_chunks.rs index eb2760aa..b7350334 100644 --- a/src/array/unsafe/local_chunks.rs +++ b/src/array/unsafe/local_chunks.rs @@ -189,7 +189,7 @@ impl UnsafeArray { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: UnsafeArray = UnsafeArray::new(&world,40,Distribution::Block); + /// let array: UnsafeArray = UnsafeArray::new(&world,40,Distribution::Block).block(); /// let my_pe = world.my_pe(); /// /// let _ = unsafe{array.local_chunks(5).enumerate().for_each(move|(i,chunk)| { @@ -215,7 +215,7 @@ impl UnsafeArray { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: UnsafeArray = UnsafeArray::new(&world,40,Distribution::Block); + /// let array: UnsafeArray = UnsafeArray::new(&world,40,Distribution::Block).block(); /// let my_pe = world.my_pe(); /// /// unsafe{ diff --git a/src/array/unsafe/rdma.rs b/src/array/unsafe/rdma.rs index fcaf8dea..8d62931c 100644 --- a/src/array/unsafe/rdma.rs +++ b/src/array/unsafe/rdma.rs @@ -173,7 +173,7 @@ impl UnsafeArray { .inner .data .team - .alloc_one_sided_mem_region::(num_elems_pe); + .alloc_one_sided_mem_region_or_panic::(num_elems_pe); unsafe { for i in 0..std::cmp::min(buf.len(), num_pes) { let mut k = 0; @@ -200,7 +200,7 @@ impl UnsafeArray { .inner .data .team - .alloc_one_sided_mem_region::(num_elems_pe); + .alloc_one_sided_mem_region_or_panic::(num_elems_pe); let mut k = 0; let pe = (start_pe + i) % num_pes; // let offset = global_index / num_pes + overflow; @@ -248,7 +248,7 @@ impl UnsafeArray { .inner .data .team - .alloc_one_sided_mem_region::(num_elems_pe); + .alloc_one_sided_mem_region_or_panic::(num_elems_pe); let rem = buf.len() % num_pes; // let temp_buf: LamellarMemoryRegion = buf.team_into(&self.inner.data.team); for i in 0..std::cmp::min(buf.len(), num_pes) { @@ -290,7 +290,7 @@ impl UnsafeArray { .inner .data .team - .alloc_one_sided_mem_region::(num_elems_pe); + .alloc_one_sided_mem_region_or_panic::(num_elems_pe); let pe = (start_pe + i) % num_pes; let offset = global_index / num_pes + overflow; let num_elems = (num_elems_pe - 1) + if i < rem { 1 } else { 0 }; @@ -371,7 +371,7 @@ impl UnsafeArray { /// /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array = UnsafeArray::::new(&world,12,Distribution::Block); + /// let array = 
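The local-chunk iterators keep their shape as well; a sketch for the read-only case, driving the iterator with `.block()` as in the other updated examples (chunk size 5 over a 40-element array, matching the doc snippet above):

```rust
use lamellar::array::prelude::*;

let world = LamellarWorldBuilder::new().build();
let my_pe = world.my_pe();
let array: ReadOnlyArray<usize> = ReadOnlyArray::new(&world, 40, Distribution::Block).block();

// iterate over this PE's local data in chunks of 5 elements
let _ = array
    .local_chunks(5)
    .enumerate()
    .for_each(move |(i, chunk)| {
        println!("PE{my_pe} chunk {i}: {chunk:?}");
    })
    .block();
```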
UnsafeArray::::new(&world,12,Distribution::Block).block(); /// let buf = world.alloc_one_sided_mem_region::(12); /// let buf_len = buf.len(); /// unsafe { @@ -451,7 +451,7 @@ impl UnsafeArray { /// /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array = UnsafeArray::::new(&world,12,Distribution::Block); + /// let array = UnsafeArray::::new(&world,12,Distribution::Block).block(); /// let buf = world.alloc_one_sided_mem_region::(12); /// unsafe { /// let _ = array.dist_iter_mut().enumerate().for_each(|(i,elem)| *elem = i).spawn(); @@ -527,7 +527,7 @@ impl UnsafeArray { /// /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array = UnsafeArray::::new(&world,12,Distribution::Block); + /// let array = UnsafeArray::::new(&world,12,Distribution::Block).block(); /// let buf = world.alloc_one_sided_mem_region::(12); /// unsafe { /// let _ =array.dist_iter_mut().enumerate().for_each(|(i,elem)| *elem = i).spawn(); //we will used this val as completion detection @@ -593,7 +593,7 @@ impl UnsafeArray { /// /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array = UnsafeArray::::new(&world,12,Distribution::Block); + /// let array = UnsafeArray::::new(&world,12,Distribution::Block).block(); /// let buf = world.alloc_one_sided_mem_region::(12); /// unsafe { /// let _ = array.dist_iter_mut().enumerate().for_each(|(i,elem)| *elem = i).spawn(); //we will used this val as completion detection @@ -639,7 +639,7 @@ impl UnsafeArray { } pub(crate) unsafe fn internal_at(&self, index: usize) -> ArrayRdmaAtHandle { - let buf: OneSidedMemoryRegion = self.team_rt().alloc_one_sided_mem_region(1); + let buf: OneSidedMemoryRegion = self.team_rt().alloc_one_sided_mem_region_or_panic(1); self.blocking_get(index, &buf); ArrayRdmaAtHandle { array: self.as_lamellar_byte_array(), @@ -669,7 +669,7 @@ impl UnsafeArray { /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// let num_pes = world.num_pes(); - /// let array = UnsafeArray::::new(&world,12,Distribution::Block); + /// let array = UnsafeArray::::new(&world,12,Distribution::Block).block(); /// unsafe { /// let _ = array.dist_iter_mut().enumerate().for_each(move|(i,elem)| *elem = my_pe).spawn(); //we will used this val as completion detection /// array.wait_all(); diff --git a/src/barrier.rs b/src/barrier.rs index b418c42f..2d8faa40 100644 --- a/src/barrier.rs +++ b/src/barrier.rs @@ -136,25 +136,25 @@ impl Barrier { RuntimeWarning::BarrierTimeout(s.elapsed().as_secs_f64()).print(); if s.elapsed().as_secs_f64() > config().deadlock_timeout { - // println!( - // "[{:?}][{:?}, {:?}] round: {:?} i: {:?} teamsend_pe: {:?} team_recv_pe: {:?} recv_pe: {:?} id: {:?} buf {:?}", - // std::thread::current().id(), - // self.my_pe, - // my_index, - // round, - // i, - // (my_index + i * (self.n + 1).pow(round as u32)) - // % self.num_pes, - // team_recv_pe, - // recv_pe, - // send_buf_slice, - // unsafe { - // self.barrier_buf[i - 1] - // .as_mut_slice() - // .expect("Data should exist on PE") - // } - // ); - // self.print_bar(); + println!( + "[{:?}][{:?}, {:?}] round: {:?} i: {:?} teamsend_pe: {:?} team_recv_pe: {:?} recv_pe: {:?} id: {:?} buf {:?}", + std::thread::current().id(), + self.my_pe, + my_index, + round, + i, + (my_index + i * (self.n + 1).pow(round as u32)) + % self.num_pes, + team_recv_pe, + recv_pe, + send_buf_slice, + unsafe { + self.barrier_buf[i - 1] + .as_mut_slice() + .expect("Data should exist on 
PE") + } + ); + self.print_bar(); *s = Instant::now(); } } diff --git a/src/darc.rs b/src/darc.rs index 8ce00a11..797291c3 100644 --- a/src/darc.rs +++ b/src/darc.rs @@ -36,7 +36,7 @@ //! let world = LamellarWorldBuilder::new().build(); //! let my_pe = world.my_pe(); //! let num_pes = world.num_pes(); -//! let darc_counter = Darc::new(&world, AtomicUsize::new(0)).unwrap(); +//! let darc_counter = Darc::new(&world, AtomicUsize::new(0)).block().unwrap(); //! let _ = world.exec_am_all(DarcAm {counter: darc_counter.clone()}).spawn(); //! darc_counter.fetch_add(my_pe, Ordering::SeqCst); //! world.wait_all(); // wait for my active message to return @@ -68,7 +68,7 @@ use crate::lamellar_team::{IntoLamellarTeam, LamellarTeamRT}; use crate::lamellar_world::LAMELLAES; use crate::scheduler::LamellarTask; use crate::warnings::RuntimeWarning; -use crate::{IdError, LamellarEnv, LamellarTeam, TypedAmGroupResult}; +use crate::{IdError, LamellarEnv, LamellarTeam}; /// prelude for the darc module pub mod prelude; @@ -79,7 +79,7 @@ pub use local_rw_darc::LocalRwDarc; pub(crate) mod global_rw_darc; pub use global_rw_darc::GlobalRwDarc; -use self::handle::{IntoGlobalRwDarcHandle, IntoLocalRwDarcHandle}; +use self::handle::{DarcHandle, IntoGlobalRwDarcHandle, IntoLocalRwDarcHandle}; pub(crate) mod handle; @@ -188,7 +188,7 @@ unsafe impl Sync for DarcInner {} //we cant create DarcInners without goin /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// let num_pes = world.num_pes(); -/// let darc_counter = Darc::new(&world, AtomicUsize::new(0)).unwrap(); +/// let darc_counter = Darc::new(&world, AtomicUsize::new(0)).block().unwrap(); /// let _ = world.exec_am_all(DarcAm {counter: darc_counter.clone()}).spawn(); /// darc_counter.fetch_add(my_pe, Ordering::SeqCst); /// world.wait_all(); // wait for my active message to return @@ -276,7 +276,7 @@ impl<'de, T: 'static> Deserialize<'de> for Darc { /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// let num_pes = world.num_pes(); -/// let darc_counter = Darc::new(&world, AtomicUsize::new(0)).unwrap(); +/// let darc_counter = Darc::new(&world, AtomicUsize::new(0)).block().unwrap(); /// let weak = Darc::downgrade(&darc_counter); /// match weak.upgrade(){ /// Some(counter) => { @@ -1120,7 +1120,7 @@ fn calc_padding(addr: usize, align: usize) -> usize { } } -impl Darc { +impl Darc { #[doc(alias = "Collective")] /// Constructs a new `Darc` on the PEs specified by team. 
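The hunks that follow replace the old `Darc::new(...) -> Result<Darc<T>, IdError>` with a constructor that returns a `DarcHandle`, which must be driven explicitly. A minimal sketch of the three ways to drive it (block, spawn, or await), assuming the usual SPMD launch and the `darc` prelude; the payload values and `expect` messages are illustrative only:

```rust
use lamellar::darc::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();

    // Drive the handle immediately: a collective call that returns once every PE joins.
    let a = Darc::new(&world, 0usize).block().expect("PE in world team");

    // Or spawn the creation onto the work queue and collect it later.
    let b_task = Darc::new(&world, 1usize).spawn();
    // ... other work ...
    let b = b_task.block().expect("PE in world team");

    // Or await the handle from an async context.
    let c_handle = Darc::new(&world, 2usize);
    let c = world
        .block_on(async move { c_handle.await })
        .expect("PE in world team");

    println!("{} {} {}", *a, *b, *c);
}
```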
/// @@ -1138,27 +1138,40 @@ impl Darc { /// /// let world = LamellarWorldBuilder::new().build(); /// - /// let five = Darc::new(&world,5).expect("PE in world team"); + /// let five = Darc::new(&world,5).block().expect("PE in world team"); /// ``` - pub fn new>(team: U, item: T) -> Result, IdError> { - Darc::try_new_with_drop(team, item, DarcMode::Darc, None) + pub fn new>(team: U, item: T) -> DarcHandle { + let team = team.into().team.clone(); + DarcHandle { + team: team.clone(), + launched: false, + creation_future: Box::pin(Darc::async_try_new_with_drop( + team, + item, + DarcMode::Darc, + None, + )), + } } + // pub fn new>(team: U, item: T) -> Result, IdError> { + // Darc::try_new_with_drop(team, item, DarcMode::Darc, None) + // } - pub(crate) async fn async_try_new>( - team: U, - item: T, - state: DarcMode, - ) -> Result, IdError> { - Darc::async_try_new_with_drop(team, item, state, None).await - } + // pub(crate) async fn async_try_new>( + // team: U, + // item: T, + // state: DarcMode, + // ) -> Result, IdError> { + // Darc::async_try_new_with_drop(team, item, state, None).await + // } - pub(crate) fn try_new>( - team: U, - item: T, - state: DarcMode, - ) -> Result, IdError> { - Darc::try_new_with_drop(team, item, state, None) - } + // pub(crate) fn try_new>( + // team: U, + // item: T, + // state: DarcMode, + // ) -> Result, IdError> { + // Darc::try_new_with_drop(team, item, state, None) + // } pub(crate) async fn async_try_new_with_drop>( team: U, @@ -1302,146 +1315,146 @@ impl Darc { Ok(d) } - pub(crate) fn try_new_with_drop>( - team: U, - item: T, - state: DarcMode, - drop: Option bool>, - ) -> Result, IdError> { - let team_rt = team.into().team.clone(); - let my_pe = team_rt.team_pe?; - - let alloc = if team_rt.num_pes == team_rt.num_world_pes { - AllocationType::Global - } else { - AllocationType::Sub(team_rt.get_pes()) - }; - - //The DarcInner data structure - let mut size = std::mem::size_of::>(); - - // Ref Cnt Array - let padding = calc_padding(size, std::mem::align_of::()); - let ref_cnt_offset = size + padding; - size += padding + team_rt.num_pes * std::mem::size_of::(); - - // total ref cnt array - let padding = calc_padding(size, std::mem::align_of::()); - let total_ref_cnt_offset = size + padding; - size += padding + team_rt.num_pes * std::mem::size_of::(); - - // mode array - let padding = calc_padding(size, std::mem::align_of::()); - let mode_offset = size + padding; - size += padding + team_rt.num_pes * std::mem::size_of::(); - - //mode ref cnt array - let padding = calc_padding(size, std::mem::align_of::()); - let mode_ref_cnt_offset = size + padding; - size += padding + team_rt.num_pes * std::mem::size_of::(); - - //mode_barrier array - let padding = calc_padding(size, std::mem::align_of::()); - let mode_barrier_offset = size + padding; - size += padding + team_rt.num_pes * std::mem::size_of::(); - // println!("creating new darc"); - - team_rt.tasking_barrier(); - // println!("creating new darc after barrier"); - let addr = team_rt - .lamellae - .alloc(size, alloc, std::mem::align_of::>()) - .expect("out of memory"); - // let temp_team = team_rt.clone(); - // team_rt.print_cnt(); - let team_ptr = unsafe { - let pinned_team = Pin::into_inner_unchecked(team_rt.clone()); - Arc::into_raw(pinned_team) - }; - // team_rt.print_cnt(); - let am_counters = Arc::new(AMCounters::new()); - let am_counters_ptr = Arc::into_raw(am_counters); - let barrier = Box::new(Barrier::new( - team_rt.world_pe, - team_rt.num_world_pes, - team_rt.lamellae.clone(), - team_rt.arch.clone(), - 
team_rt.scheduler.clone(), - team_rt.panic.clone(), - )); - let barrier_ptr = Box::into_raw(barrier); - let darc_temp = DarcInner { - id: DARC_ID.fetch_add(1, Ordering::Relaxed), - my_pe: my_pe, - num_pes: team_rt.num_pes, - local_cnt: AtomicUsize::new(1), - total_local_cnt: AtomicUsize::new(1), - weak_local_cnt: AtomicUsize::new(0), - dist_cnt: AtomicUsize::new(0), - total_dist_cnt: AtomicUsize::new(0), - // ref_cnt_addr: addr + std::mem::size_of::>(), - // total_ref_cnt_addr: addr - // + std::mem::size_of::>() - // + team_rt.num_pes * std::mem::size_of::(), - // mode_addr: addr - // + std::mem::size_of::>() - // + team_rt.num_pes * std::mem::size_of::() - // + team_rt.num_pes * std::mem::size_of::(), - // mode_ref_cnt_addr: addr - // + std::mem::size_of::>() - // + team_rt.num_pes * std::mem::size_of::() - // + team_rt.num_pes * std::mem::size_of::() - // + team_rt.num_pes * std::mem::size_of::(), - // mode_barrier_addr: addr - // + std::mem::size_of::>() - // + team_rt.num_pes * std::mem::size_of::() - // + team_rt.num_pes * std::mem::size_of::() - // + team_rt.num_pes * std::mem::size_of::() - // + team_rt.num_pes * std::mem::size_of::(), - ref_cnt_addr: addr + ref_cnt_offset, - total_ref_cnt_addr: addr + total_ref_cnt_offset, - mode_addr: addr + mode_offset, - mode_ref_cnt_addr: addr + mode_ref_cnt_offset, - mode_barrier_addr: addr + mode_barrier_offset, - barrier: barrier_ptr, - // mode_barrier_rounds: num_rounds, - am_counters: am_counters_ptr, - team: team_ptr, //&team_rt, //Arc::into_raw(temp_team), - item: Box::into_raw(Box::new(item)), - drop: drop, - valid: AtomicBool::new(true), - }; - unsafe { - std::ptr::copy_nonoverlapping(&darc_temp, addr as *mut DarcInner, 1); - } - // println!("Darc Inner Item Addr: {:?}", darc_temp.item); + // pub(crate) fn try_new_with_drop>( + // team: U, + // item: T, + // state: DarcMode, + // drop: Option bool>, + // ) -> Result, IdError> { + // let team_rt = team.into().team.clone(); + // let my_pe = team_rt.team_pe?; + + // let alloc = if team_rt.num_pes == team_rt.num_world_pes { + // AllocationType::Global + // } else { + // AllocationType::Sub(team_rt.get_pes()) + // }; - let d = Darc { - inner: addr as *mut DarcInner, - src_pe: my_pe, - }; - for elem in d.ref_cnts_as_mut_slice() { - *elem = 0; - } - for elem in d.mode_as_mut_slice() { - *elem = state; - } - for elem in d.mode_barrier_as_mut_slice() { - *elem = 0; - } - for elem in d.mode_ref_cnt_as_mut_slice() { - *elem = 0; - } - // println!( - // " [{:?}] created new darc , next_id: {:?}", - // std::thread::current().id(), - // DARC_ID.load(Ordering::Relaxed) - // ); - // d.print(); - team_rt.tasking_barrier(); - // team_rt.print_cnt(); - Ok(d) - } + // //The DarcInner data structure + // let mut size = std::mem::size_of::>(); + + // // Ref Cnt Array + // let padding = calc_padding(size, std::mem::align_of::()); + // let ref_cnt_offset = size + padding; + // size += padding + team_rt.num_pes * std::mem::size_of::(); + + // // total ref cnt array + // let padding = calc_padding(size, std::mem::align_of::()); + // let total_ref_cnt_offset = size + padding; + // size += padding + team_rt.num_pes * std::mem::size_of::(); + + // // mode array + // let padding = calc_padding(size, std::mem::align_of::()); + // let mode_offset = size + padding; + // size += padding + team_rt.num_pes * std::mem::size_of::(); + + // //mode ref cnt array + // let padding = calc_padding(size, std::mem::align_of::()); + // let mode_ref_cnt_offset = size + padding; + // size += padding + team_rt.num_pes * 
std::mem::size_of::(); + + // //mode_barrier array + // let padding = calc_padding(size, std::mem::align_of::()); + // let mode_barrier_offset = size + padding; + // size += padding + team_rt.num_pes * std::mem::size_of::(); + // // println!("creating new darc"); + + // team_rt.tasking_barrier(); + // // println!("creating new darc after barrier"); + // let addr = team_rt + // .lamellae + // .alloc(size, alloc, std::mem::align_of::>()) + // .expect("out of memory"); + // // let temp_team = team_rt.clone(); + // // team_rt.print_cnt(); + // let team_ptr = unsafe { + // let pinned_team = Pin::into_inner_unchecked(team_rt.clone()); + // Arc::into_raw(pinned_team) + // }; + // // team_rt.print_cnt(); + // let am_counters = Arc::new(AMCounters::new()); + // let am_counters_ptr = Arc::into_raw(am_counters); + // let barrier = Box::new(Barrier::new( + // team_rt.world_pe, + // team_rt.num_world_pes, + // team_rt.lamellae.clone(), + // team_rt.arch.clone(), + // team_rt.scheduler.clone(), + // team_rt.panic.clone(), + // )); + // let barrier_ptr = Box::into_raw(barrier); + // let darc_temp = DarcInner { + // id: DARC_ID.fetch_add(1, Ordering::Relaxed), + // my_pe: my_pe, + // num_pes: team_rt.num_pes, + // local_cnt: AtomicUsize::new(1), + // total_local_cnt: AtomicUsize::new(1), + // weak_local_cnt: AtomicUsize::new(0), + // dist_cnt: AtomicUsize::new(0), + // total_dist_cnt: AtomicUsize::new(0), + // // ref_cnt_addr: addr + std::mem::size_of::>(), + // // total_ref_cnt_addr: addr + // // + std::mem::size_of::>() + // // + team_rt.num_pes * std::mem::size_of::(), + // // mode_addr: addr + // // + std::mem::size_of::>() + // // + team_rt.num_pes * std::mem::size_of::() + // // + team_rt.num_pes * std::mem::size_of::(), + // // mode_ref_cnt_addr: addr + // // + std::mem::size_of::>() + // // + team_rt.num_pes * std::mem::size_of::() + // // + team_rt.num_pes * std::mem::size_of::() + // // + team_rt.num_pes * std::mem::size_of::(), + // // mode_barrier_addr: addr + // // + std::mem::size_of::>() + // // + team_rt.num_pes * std::mem::size_of::() + // // + team_rt.num_pes * std::mem::size_of::() + // // + team_rt.num_pes * std::mem::size_of::() + // // + team_rt.num_pes * std::mem::size_of::(), + // ref_cnt_addr: addr + ref_cnt_offset, + // total_ref_cnt_addr: addr + total_ref_cnt_offset, + // mode_addr: addr + mode_offset, + // mode_ref_cnt_addr: addr + mode_ref_cnt_offset, + // mode_barrier_addr: addr + mode_barrier_offset, + // barrier: barrier_ptr, + // // mode_barrier_rounds: num_rounds, + // am_counters: am_counters_ptr, + // team: team_ptr, //&team_rt, //Arc::into_raw(temp_team), + // item: Box::into_raw(Box::new(item)), + // drop: drop, + // valid: AtomicBool::new(true), + // }; + // unsafe { + // std::ptr::copy_nonoverlapping(&darc_temp, addr as *mut DarcInner, 1); + // } + // // println!("Darc Inner Item Addr: {:?}", darc_temp.item); + + // let d = Darc { + // inner: addr as *mut DarcInner, + // src_pe: my_pe, + // }; + // for elem in d.ref_cnts_as_mut_slice() { + // *elem = 0; + // } + // for elem in d.mode_as_mut_slice() { + // *elem = state; + // } + // for elem in d.mode_barrier_as_mut_slice() { + // *elem = 0; + // } + // for elem in d.mode_ref_cnt_as_mut_slice() { + // *elem = 0; + // } + // // println!( + // // " [{:?}] created new darc , next_id: {:?}", + // // std::thread::current().id(), + // // DARC_ID.load(Ordering::Relaxed) + // // ); + // // d.print(); + // team_rt.tasking_barrier(); + // // team_rt.print_cnt(); + // Ok(d) + // } pub(crate) async fn block_on_outstanding(self, 
state: DarcMode, extra_cnt: usize) { let wrapped = WrappedInner { @@ -1468,8 +1481,11 @@ impl Darc { /// /// let world = LamellarWorldBuilder::new().build(); /// - /// let five = Darc::new(&world,5).expect("PE in world team"); - /// let five_as_localdarc = world.block_on(async move {five.into_localrw().await}); + /// let five_handle = Darc::new(&world,5); + /// let five_as_localdarc = world.block_on(async move { + /// let five = five_handle.await; + /// five.into_localrw().await + /// }); /// ``` pub fn into_localrw(self) -> IntoLocalRwDarcHandle { let wrapped_inner = WrappedInner { @@ -1504,7 +1520,7 @@ impl Darc { /// /// let world = LamellarWorldBuilder::new().build(); /// - /// let five = Darc::new(&world,5).expect("PE in world team"); + /// let five = Darc::new(&world,5).block().expect("PE in world team"); /// let five_as_globaldarc = five.into_globalrw().block(); /// ``` pub fn into_globalrw(self) -> IntoGlobalRwDarcHandle { diff --git a/src/darc/global_rw_darc.rs b/src/darc/global_rw_darc.rs index e39d28da..ae228c5c 100644 --- a/src/darc/global_rw_darc.rs +++ b/src/darc/global_rw_darc.rs @@ -9,13 +9,12 @@ use std::sync::Arc; use crate::active_messaging::RemotePtr; use crate::darc::{Darc, DarcInner, DarcMode, WrappedInner, __NetworkDarc}; use crate::lamellae::LamellaeRDMA; -use crate::lamellar_request::LamellarRequest; use crate::lamellar_team::{IntoLamellarTeam, LamellarTeamRT}; use crate::{IdError, LamellarEnv, LamellarTeam}; use super::handle::{ - GlobalRwDarcCollectiveWriteHandle, GlobalRwDarcReadHandle, GlobalRwDarcWriteHandle, - IntoDarcHandle, IntoLocalRwDarcHandle, + GlobalRwDarcCollectiveWriteHandle, GlobalRwDarcHandle, GlobalRwDarcReadHandle, + GlobalRwDarcWriteHandle, IntoDarcHandle, IntoLocalRwDarcHandle, }; #[derive(serde::Serialize, serde::Deserialize, Debug)] @@ -39,7 +38,7 @@ pub(crate) struct DistRwLock { } unsafe impl Send for DistRwLock {} -unsafe impl Sync for DistRwLock {} +unsafe impl Sync for DistRwLock {} /// # Safety /// @@ -318,7 +317,7 @@ impl Clone for GlobalRwDarcReadGuard { } } -impl Drop for GlobalRwDarcReadGuard { +impl Drop for GlobalRwDarcReadGuard { fn drop(&mut self) { // println!("dropping global rwdarc read guard"); if self.local_cnt.fetch_sub(1, Ordering::SeqCst) == 1 { @@ -328,7 +327,7 @@ impl Drop for GlobalRwDarcReadGuard { 0, inner as *const DarcInner> as *const () as usize, ); - let mut am = team.spawn_am_pe_tg( + let _am = team.spawn_am_pe_tg( 0, UnlockAm { rwlock_addr: remote_rwlock_addr, @@ -370,7 +369,7 @@ impl DerefMut for GlobalRwDarcWriteGuard { } } -impl Drop for GlobalRwDarcWriteGuard { +impl Drop for GlobalRwDarcWriteGuard { fn drop(&mut self) { // println!("dropping write guard"); let inner = self.darc.inner(); @@ -379,7 +378,7 @@ impl Drop for GlobalRwDarcWriteGuard { 0, inner as *const DarcInner> as *const () as usize, ); - let mut am = team.spawn_am_pe_tg( + let _am = team.spawn_am_pe_tg( 0, UnlockAm { rwlock_addr: remote_rwlock_addr, @@ -420,7 +419,7 @@ impl DerefMut for GlobalRwDarcCollectiveWriteGuard { } } -impl Drop for GlobalRwDarcCollectiveWriteGuard { +impl Drop for GlobalRwDarcCollectiveWriteGuard { fn drop(&mut self) { // println!("dropping collective write guard"); let inner = self.darc.inner(); @@ -429,7 +428,7 @@ impl Drop for GlobalRwDarcCollectiveWriteGuard { 0, inner as *const DarcInner> as *const () as usize, ); - let mut am = team.spawn_am_pe_tg( + let _am = team.spawn_am_pe_tg( 0, UnlockAm { rwlock_addr: remote_rwlock_addr, @@ -592,7 +591,7 @@ impl GlobalRwDarc { /// /// let world = 
LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); + /// let counter = GlobalRwDarc::new(&world, 0).block().unwrap(); /// let _ = world.exec_am_all(DarcAm {counter: counter.clone()}).spawn(); /// let guard = counter.read().block(); /// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); @@ -658,7 +657,7 @@ impl GlobalRwDarc { /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// - /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); + /// let counter = GlobalRwDarc::new(&world, 0).block().unwrap(); /// let _ = world.exec_am_all(DarcAm {counter: counter.clone()}).spawn(); /// let mut guard = counter.write().block(); //block until we get the write lock /// *guard += my_pe; @@ -708,7 +707,7 @@ impl GlobalRwDarc { /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// - /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); + /// let counter = GlobalRwDarc::new(&world, 0).block().unwrap(); /// let mut guard = counter.collective_write().block(); // this will block until all PEs have acquired the lock /// *guard += my_pe; ///``` @@ -738,7 +737,7 @@ impl GlobalRwDarc { } } -impl GlobalRwDarc { +impl GlobalRwDarc { #[doc(alias = "Collective")] /// Constructs a new `GlobalRwDarc` on the PEs specified by team. /// @@ -756,20 +755,29 @@ impl GlobalRwDarc { /// /// let world = LamellarWorldBuilder::new().build(); /// - /// let five = GlobalRwDarc::new(&world,5).expect("PE in world team"); + /// let five = GlobalRwDarc::new(&world,5).block().expect("PE in world team"); /// ``` - pub fn new>( - team: U, - item: T, - ) -> Result, IdError> { - Ok(GlobalRwDarc { - darc: Darc::try_new_with_drop( + pub fn new>(team: U, item: T) -> GlobalRwDarcHandle { + // Ok(GlobalRwDarc { + // darc: Darc::try_new_with_drop( + // team.clone(), + // DistRwLock::new(item, team), + // DarcMode::GlobalRw, + // Some(GlobalRwDarc::drop), + // )?, + // }) + let team = team.into().team.clone(); + let locked_item = DistRwLock::new(item, team.clone()); + GlobalRwDarcHandle { + team: team.clone(), + launched: false, + creation_future: Box::pin(Darc::async_try_new_with_drop( team.clone(), - DistRwLock::new(item, team), + locked_item, DarcMode::GlobalRw, Some(GlobalRwDarc::drop), - )?, - }) + )), + } } pub(crate) fn drop(lock: &mut DistRwLock) -> bool { lock.dirty_num_locks() == 0 @@ -806,7 +814,7 @@ impl GlobalRwDarc { /// /// let world = LamellarWorldBuilder::new().build(); /// - /// let five = GlobalRwDarc::new(&world,5).expect("PE in world team"); + /// let five = GlobalRwDarc::new(&world,5).block().expect("PE in world team"); /// let five_as_darc = five.into_darc().block(); /// ``` pub fn into_darc(self) -> IntoDarcHandle { @@ -850,7 +858,7 @@ impl GlobalRwDarc { /// /// let world = LamellarWorldBuilder::new().build(); /// - /// let five = GlobalRwDarc::new(&world,5).expect("PE in world team"); + /// let five = GlobalRwDarc::new(&world,5).block().expect("PE in world team"); /// let five_as_localdarc = world.block_on(async move {five.into_localrw().await}); /// ``` pub fn into_localrw(self) -> IntoLocalRwDarcHandle { @@ -886,7 +894,7 @@ impl Clone for GlobalRwDarc { } } -impl fmt::Display for GlobalRwDarc { +impl fmt::Display for GlobalRwDarc { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { unsafe { fmt::Display::fmt(&self.inner().item().data.get().as_ref().unwrap(), f) } } diff --git a/src/darc/handle.rs b/src/darc/handle.rs index 
292a6dc2..822b7415 100644 --- a/src/darc/handle.rs +++ b/src/darc/handle.rs @@ -8,7 +8,7 @@ use crate::darc::local_rw_darc::{LocalRwDarc, LocalRwDarcReadGuard}; use crate::lamellar_request::LamellarRequest; use crate::scheduler::LamellarTask; use crate::warnings::RuntimeWarning; -use crate::{AmHandle, Darc}; +use crate::{AmHandle, Darc, IdError}; use crate::{GlobalRwDarc, LamellarTeamRT}; use async_lock::{RwLock, RwLockReadGuardArc, RwLockWriteGuardArc}; @@ -61,7 +61,7 @@ enum State { /// /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); -/// let counter = LocalRwDarc::new(&world, 0).unwrap(); +/// let counter = LocalRwDarc::new(&world, 0).block().unwrap(); /// let _ = world.exec_am_all(DarcAm {counter: counter.clone()}).spawn(); /// let handle = counter.read(); /// let guard = handle.block(); //block until we get the read lock @@ -109,7 +109,7 @@ impl LocalRwDarcReadHandle { /// /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let counter = LocalRwDarc::new(&world, 0).unwrap(); + /// let counter = LocalRwDarc::new(&world, 0).block().unwrap(); /// let handle = counter.read(); /// let guard = handle.block(); //block until we get the read lock /// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); @@ -132,7 +132,7 @@ impl LocalRwDarcReadHandle { .clone() .block_on(async move { inner_darc.read_arc().await }); LocalRwDarcReadGuard { - darc: self.darc.clone(), + _darc: self.darc.clone(), lock: guard, } } @@ -148,7 +148,7 @@ impl LocalRwDarcReadHandle { /// /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let counter = LocalRwDarc::new(&world, 0).unwrap(); + /// let counter = LocalRwDarc::new(&world, 0).block().unwrap(); /// let handle = counter.read(); /// let task = handle.spawn(); //initiate the operation /// // do other work @@ -179,7 +179,7 @@ impl Future for LocalRwDarcReadHandle { StateProj::TryingRead(lock) => { let guard = ready!(lock.poll(cx)); Poll::Ready(LocalRwDarcReadGuard { - darc: this.darc.clone(), + _darc: this.darc.clone(), lock: guard, }) } @@ -220,7 +220,7 @@ impl Future for LocalRwDarcReadHandle { /// /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); -/// let counter = LocalRwDarc::new(&world, 0).unwrap(); +/// let counter = LocalRwDarc::new(&world, 0).block().unwrap(); /// let _ = world.exec_am_all(DarcAm {counter: counter.clone()}).spawn(); /// let handle = counter.write(); /// let mut guard = handle.block(); //block until we get the write lock @@ -267,7 +267,7 @@ impl LocalRwDarcWriteHandle { /// /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let counter = LocalRwDarc::new(&world, 0).unwrap(); + /// let counter = LocalRwDarc::new(&world, 0).block().unwrap(); /// let handle = counter.write(); /// let mut guard = handle.block(); //block until we get the write lock /// *guard += my_pe; @@ -289,7 +289,7 @@ impl LocalRwDarcWriteHandle { .clone() .block_on(async move { inner_darc.write_arc().await }); LocalRwDarcWriteGuard { - darc: self.darc.clone(), + _darc: self.darc.clone(), lock: guard, } } @@ -305,7 +305,7 @@ impl LocalRwDarcWriteHandle { /// /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let counter = LocalRwDarc::new(&world, 0).unwrap(); + /// let counter = LocalRwDarc::new(&world, 0).block().unwrap(); /// let handle = counter.write(); /// let task = handle.spawn(); //initiate the operation /// // do other work 
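The read/write lock handles documented in this file follow the same pattern as the creation handles: block for the guard, or spawn the acquisition and collect it later. A small usage sketch under the same assumptions as the surrounding doc examples (a built world, a `usize` counter; values are illustrative):

```rust
use lamellar::darc::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();
    let my_pe = world.my_pe();

    let counter = LocalRwDarc::new(&world, 0usize)
        .block()
        .expect("PE in world team");

    // Acquire the local write lock; the guard releases it when dropped.
    {
        let mut guard = counter.write().block();
        *guard += my_pe;
    }

    // Lock acquisition can also be spawned and the guard collected later.
    let read_task = counter.read().spawn();
    // ... other work ...
    let guard = read_task.block();
    println!("counter on PE {} = {}", my_pe, *guard);
}
```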
@@ -335,7 +335,7 @@ impl Future for LocalRwDarcWriteHandle { StateProj::TryingWrite(lock) => { let guard = ready!(lock.poll(cx)); Poll::Ready(LocalRwDarcWriteGuard { - darc: this.darc.clone(), + _darc: this.darc.clone(), lock: guard, }) } @@ -376,7 +376,7 @@ impl Future for LocalRwDarcWriteHandle { /// /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); -/// let counter = GlobalRwDarc::new(&world, 0).unwrap(); +/// let counter = GlobalRwDarc::new(&world, 0).block().unwrap(); /// let _ = world.exec_am_all(DarcAm {counter: counter.clone()}).spawn(); /// let handle = counter.read(); /// let guard = handle.block(); //block until we get the write lock @@ -403,7 +403,7 @@ impl GlobalRwDarcReadHandle { /// /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); + /// let counter = GlobalRwDarc::new(&world, 0).block().unwrap(); /// let handle = counter.read(); /// let guard = handle.block(); //block until we get the write lock /// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); @@ -434,7 +434,7 @@ impl GlobalRwDarcReadHandle { /// /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); + /// let counter = GlobalRwDarc::new(&world, 0).block().unwrap(); /// let handle = counter.read(); /// let task = handle.spawn(); //initiate the operation /// // do other work @@ -492,7 +492,7 @@ impl Future for GlobalRwDarcReadHandle { /// /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); -/// let counter = GlobalRwDarc::new(&world, 0).unwrap(); +/// let counter = GlobalRwDarc::new(&world, 0).block().unwrap(); /// let _ = world.exec_am_all(DarcAm {counter: counter.clone()}).spawn(); /// let handle = counter.write(); /// let mut guard = handle.block(); //block until we get the write lock @@ -519,7 +519,7 @@ impl GlobalRwDarcWriteHandle { /// /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); + /// let counter = GlobalRwDarc::new(&world, 0).block().unwrap(); /// let handle = counter.write(); /// let mut guard = handle.block(); //block until we get the write lock /// *guard += my_pe; @@ -549,7 +549,7 @@ impl GlobalRwDarcWriteHandle { /// /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); + /// let counter = GlobalRwDarc::new(&world, 0).block().unwrap(); /// let handle = counter.write(); /// let task = handle.spawn(); //initiate the operation /// // do other work @@ -592,7 +592,7 @@ impl Future for GlobalRwDarcWriteHandle { /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); /// -/// let counter = GlobalRwDarc::new(&world, 0).unwrap(); +/// let counter = GlobalRwDarc::new(&world, 0).block().unwrap(); /// let handle = counter.collective_write(); /// let mut guard = handle.block(); // this will block until all PEs have acquired the lock /// *guard += my_pe; @@ -615,7 +615,7 @@ impl GlobalRwDarcCollectiveWriteHandle { /// /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); + /// let counter = GlobalRwDarc::new(&world, 0).block().unwrap(); /// let handle = counter.collective_write(); /// let mut guard = handle.block(); //block until we get the write lock 
/// *guard += my_pe; @@ -645,7 +645,7 @@ impl GlobalRwDarcCollectiveWriteHandle { /// /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let counter = GlobalRwDarc::new(&world, 0).unwrap(); + /// let counter = GlobalRwDarc::new(&world, 0).block().unwrap(); /// let handle = counter.collective_write(); /// let task = handle.spawn();//initiate the operation /// // do other work @@ -757,7 +757,7 @@ impl OrigDarc { /// /// let world = LamellarWorldBuilder::new().build(); /// -/// let five = LocalRwDarc::new(&world,5).expect("PE in world team"); +/// let five = LocalRwDarc::new(&world,5).block().expect("PE in world team"); /// let five_as_darc = five.into_darc().block(); /// /* alternatively something like the following is valid as well /// let five_as_darc = world.block_on(async move{ @@ -790,7 +790,7 @@ impl IntoDarcHandle { /// use lamellar::darc::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let five = LocalRwDarc::new(&world,5).expect("PE in world team"); + /// let five = LocalRwDarc::new(&world,5).block().expect("PE in world team"); /// let five_as_darc = five.into_darc().block(); pub fn block(mut self) -> Darc { self.launched = true; @@ -802,8 +802,7 @@ impl IntoDarcHandle { self.team.clone().block_on(self) } - /// This method will spawn the associated active message to capture the lock on the work queue, - /// initiating the operation. + /// This method will spawn the conversion into the Darc on the work queue. /// /// This function returns a handle that can be used to wait for the operation to complete /// # Examples @@ -811,7 +810,7 @@ impl IntoDarcHandle { /// use lamellar::darc::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let five = LocalRwDarc::new(&world,5).expect("PE in world team"); + /// let five = LocalRwDarc::new(&world,5).block().expect("PE in world team"); /// let five_as_darc_task = five.into_darc().spawn(); /// let five_as_darc = five_as_darc_task.block(); /// ``` @@ -859,7 +858,7 @@ impl Future for IntoDarcHandle { /// /// let world = LamellarWorldBuilder::new().build(); /// -/// let five = GlobalRwDarc::new(&world,5).expect("PE in world team"); +/// let five = GlobalRwDarc::new(&world,5).block().expect("PE in world team"); /// let five_as_localrw = five.into_localrw().block(); /// /* alternatively something like the following is valid as well /// let five_as_localrw = world.block_on(async move{ @@ -892,7 +891,7 @@ impl IntoLocalRwDarcHandle { /// use lamellar::darc::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let five = GlobalRwDarc::new(&world,5).expect("PE in world team"); + /// let five = GlobalRwDarc::new(&world,5).block().expect("PE in world team"); /// let five_as_localrw = five.into_localrw().block(); pub fn block(mut self) -> LocalRwDarc { self.launched = true; @@ -905,8 +904,7 @@ impl IntoLocalRwDarcHandle { self.team.clone().block_on(self) } - /// This method will spawn the associated active message to capture the lock on the work queue, - /// initiating the operation. + /// /// This method will spawn the conversion into the LocalRwDarc on the work queue. 
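The `into_darc`, `into_localrw`, and `into_globalrw` conversion handles updated in this file can be chained in the same drive-the-handle style. A brief sketch under the same assumptions as the doc examples (a built world, a `usize` payload); each conversion is a collective operation:

```rust
use lamellar::darc::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();

    let local = LocalRwDarc::new(&world, 5usize)
        .block()
        .expect("PE in world team");

    // Each conversion returns a handle that must be driven.
    let plain: Darc<usize> = local.into_darc().block();
    let global: GlobalRwDarc<usize> = plain.into_globalrw().block();

    // spawn() queues the conversion so other work can overlap with it.
    let back_task = global.into_localrw().spawn();
    // ... other work ...
    let _local_again: LocalRwDarc<usize> = back_task.block();
}
```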
/// /// This function returns a handle that can be used to wait for the operation to complete /// # Examples @@ -914,7 +912,7 @@ impl IntoLocalRwDarcHandle { /// use lamellar::darc::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let five = GlobalRwDarc::new(&world,5).expect("PE in world team"); + /// let five = GlobalRwDarc::new(&world,5).block().expect("PE in world team"); /// let five_as_localrw_task = five.into_localrw().spawn(); /// let five_as_localrw = five_as_localrw_task.block(); /// ``` @@ -963,7 +961,7 @@ impl Future for IntoLocalRwDarcHandle { /// /// let world = LamellarWorldBuilder::new().build(); /// -/// let five = LocalRwDarc::new(&world,5).expect("PE in world team"); +/// let five = LocalRwDarc::new(&world,5).block().expect("PE in world team"); /// let five_as_globalrw = five.into_globalrw().block(); /// /* alternatively something like the following is valid as well /// let five_as_globalrw = world.block_on(async move{ @@ -996,7 +994,7 @@ impl IntoGlobalRwDarcHandle { /// use lamellar::darc::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let five = LocalRwDarc::new(&world,5).expect("PE in world team"); + /// let five = LocalRwDarc::new(&world,5).block().expect("PE in world team"); /// let five_as_globalrw = five.into_globalrw().block(); pub fn block(mut self) -> GlobalRwDarc { self.launched = true; @@ -1008,8 +1006,7 @@ impl IntoGlobalRwDarcHandle { self.team.clone().block_on(self) } - /// This method will spawn the associated active message to capture the lock on the work queue, - /// initiating the operation. + /// This method will spawn the conversion into the GlobalRwDarc on the work queue. /// /// This function returns a handle that can be used to wait for the operation to complete /// /// # Examples @@ -1018,7 +1015,7 @@ impl IntoGlobalRwDarcHandle { /// use lamellar::darc::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let five = LocalRwDarc::new(&world,5).expect("PE in world team"); + /// let five = LocalRwDarc::new(&world,5).block().expect("PE in world team"); /// let five_as_globalrw_task = five.into_globalrw().spawn(); /// let five_as_globalrw = five_as_globalrw_task.block(); #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] @@ -1049,3 +1046,242 @@ impl Future for IntoGlobalRwDarcHandle { Poll::Ready(GlobalRwDarc { darc }) } } + +#[must_use = " Darc 'new' handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] +#[pin_project(PinnedDrop)] +#[doc(alias = "Collective")] +/// This is a handle representing the operation of creating a new [Darc]. +/// This handled must either be awaited in an async context or blocked on in a non-async context for the operation to be performed. +/// Awaiting/blocking on the handle is a blocking collective call amongst all PEs in the Darc's team, only returning once every PE in the team has completed the call. +/// +/// # Collective Operation +/// Requires all PEs associated with the `darc` to await/block the handle otherwise deadlock will occur (i.e. 
team barriers are being called internally)
+///
+/// # Examples
+/// ```
+/// use lamellar::darc::prelude::*;
+///
+/// let world = LamellarWorldBuilder::new().build();
+///
+/// let five = Darc::new(&world,5).block().expect("PE in world team");
+/// ```
+pub struct DarcHandle<T: 'static> {
+    pub(crate) team: Pin<Arc<LamellarTeamRT>>,
+    pub(crate) launched: bool,
+    #[pin]
+    pub(crate) creation_future: Pin<Box<dyn Future<Output = Result<Darc<T>, IdError>> + Send>>,
+}
+
+#[pinned_drop]
+impl<T: 'static> PinnedDrop for DarcHandle<T> {
+    fn drop(self: Pin<&mut Self>) {
+        if !self.launched {
+            RuntimeWarning::DroppedHandle("a DarcHandle").print();
+        }
+    }
+}
+
+impl<T: 'static> DarcHandle<T> {
+    /// Used to drive creation of a new Darc
+    /// # Examples
+    ///
+    ///```
+    /// use lamellar::darc::prelude::*;
+    ///
+    /// let world = LamellarWorldBuilder::new().build();
+    /// let five = Darc::new(&world,5).block().expect("PE in world team");
+    ///```
+    pub fn block(mut self) -> Result<Darc<T>, IdError> {
+        self.launched = true;
+        RuntimeWarning::BlockingCall("DarcHandle::block", ".spawn() or .await")
+            .print();
+        self.team.clone().block_on(self)
+    }
+
+    /// This method will spawn the creation of the Darc on the work queue.
+    ///
+    /// This function returns a handle that can be used to wait for the operation to complete.
+    /// # Examples
+    ///
+    ///```
+    /// use lamellar::darc::prelude::*;
+    ///
+    /// let world = LamellarWorldBuilder::new().build();
+    /// let five_task = Darc::new(&world,5).spawn();
+    /// // do some other work
+    /// let five = five_task.block().expect("PE in world team");
+    ///```
+    #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"]
+    pub fn spawn(mut self) -> LamellarTask<Result<Darc<T>, IdError>> {
+        self.launched = true;
+        self.team.clone().spawn(self)
+    }
+}
+
+impl<T: 'static> Future for DarcHandle<T> {
+    type Output = Result<Darc<T>, IdError>;
+    fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
+        self.launched = true;
+        let mut this = self.project();
+        let darc = ready!(this.creation_future.as_mut().poll(cx));
+        Poll::Ready(darc)
+    }
+}
+
+#[must_use = " LocalRwDarc 'new' handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"]
+#[pin_project(PinnedDrop)]
+#[doc(alias = "Collective")]
+/// This is a handle representing the operation of creating a new [LocalRwDarc].
+/// This handle must either be awaited in an async context or blocked on in a non-async context for the operation to be performed.
+/// Awaiting/blocking on the handle is a blocking collective call amongst all PEs in the LocalRwDarc's team, only returning once every PE in the team has completed the call.
+///
+/// # Collective Operation
+/// Requires all PEs associated with the `LocalRwDarc` to await/block the handle, otherwise deadlock will occur (i.e.
team barriers are being called internally)
+///
+/// # Examples
+/// ```
+/// use lamellar::darc::prelude::*;
+///
+/// let world = LamellarWorldBuilder::new().build();
+///
+/// let five = LocalRwDarc::new(&world,5).block().expect("PE in world team");
+/// ```
+pub struct LocalRwDarcHandle<T: 'static> {
+    pub(crate) team: Pin<Arc<LamellarTeamRT>>,
+    pub(crate) launched: bool,
+    #[pin]
+    pub(crate) creation_future:
+        Pin<Box<dyn Future<Output = Result<Darc<Arc<RwLock<T>>>, IdError>> + Send>>,
+}
+
+#[pinned_drop]
+impl<T: 'static> PinnedDrop for LocalRwDarcHandle<T> {
+    fn drop(self: Pin<&mut Self>) {
+        if !self.launched {
+            RuntimeWarning::DroppedHandle("a LocalRwDarc").print();
+        }
+    }
+}
+
+impl<T: 'static> LocalRwDarcHandle<T> {
+    /// Used to drive creation of a new LocalRwDarc
+    /// # Examples
+    ///
+    ///```
+    /// use lamellar::darc::prelude::*;
+    ///
+    /// let world = LamellarWorldBuilder::new().build();
+    /// let five = LocalRwDarc::new(&world,5).block().expect("PE in world team");
+    ///```
+    pub fn block(mut self) -> Result<LocalRwDarc<T>, IdError> {
+        self.launched = true;
+        RuntimeWarning::BlockingCall("LocalRwDarcHandle::block", ".spawn() or .await")
+            .print();
+        self.team.clone().block_on(self)
+    }
+
+    /// This method will spawn the creation of the LocalRwDarc on the work queue.
+    ///
+    /// This function returns a handle that can be used to wait for the operation to complete.
+    /// # Examples
+    ///
+    ///```
+    /// use lamellar::darc::prelude::*;
+    ///
+    /// let world = LamellarWorldBuilder::new().build();
+    /// let five_task = LocalRwDarc::new(&world,5).spawn();
+    /// // do some other work
+    /// let five = five_task.block().expect("PE in world team");
+    ///```
+    #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"]
+    pub fn spawn(mut self) -> LamellarTask<Result<LocalRwDarc<T>, IdError>> {
+        self.launched = true;
+        self.team.clone().spawn(self)
+    }
+}
+
+impl<T: 'static> Future for LocalRwDarcHandle<T> {
+    type Output = Result<LocalRwDarc<T>, IdError>;
+    fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
+        self.launched = true;
+        let mut this = self.project();
+        let darc = ready!(this.creation_future.as_mut().poll(cx))?;
+        Poll::Ready(Ok(LocalRwDarc { darc }))
+    }
+}
+
+#[must_use = " GlobalRwDarc 'new' handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"]
+#[pin_project(PinnedDrop)]
+#[doc(alias = "Collective")]
+/// This is a handle representing the operation of creating a new [GlobalRwDarc].
+/// This handle must either be awaited in an async context or blocked on in a non-async context for the operation to be performed.
+/// Awaiting/blocking on the handle is a blocking collective call amongst all PEs in the GlobalRwDarc's team, only returning once every PE in the team has completed the call.
+///
+/// # Collective Operation
+/// Requires all PEs associated with the `GlobalRwDarc` to await/block the handle, otherwise deadlock will occur (i.e.
team barriers are being called internally) +/// +/// # Examples +/// ``` +/// use lamellar::darc::prelude::*; +/// +/// let world = LamellarWorldBuilder::new().build(); +/// +/// let five = GlobalRwDarc::new(&world,5).block().expect("PE in world team"); +/// ``` +pub struct GlobalRwDarcHandle { + pub(crate) team: Pin>, + pub(crate) launched: bool, + #[pin] + pub(crate) creation_future: + Pin>, IdError>> + Send>>, +} + +#[pinned_drop] +impl PinnedDrop for GlobalRwDarcHandle { + fn drop(self: Pin<&mut Self>) { + if !self.launched { + RuntimeWarning::DroppedHandle("a GlobalRwDarc").print(); + } + } +} + +impl GlobalRwDarcHandle { + /// Used to drive creation of a new darc + /// # Examples + /// + ///``` + /// use lamellar::darc::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let five = GlobalRwDarc::new(&world,5).block().expect("PE in world team"); + pub fn block(mut self) -> Result, IdError> { + self.launched = true; + RuntimeWarning::BlockingCall("DarcHandle::block", ".spawn() or.await") + .print(); + self.team.clone().block_on(self) + } + + /// This method will spawn the creation of the GlobalRwDarc on the work queue. + /// + /// This function returns a handle that can be used to wait for the operation to complete + /// /// # Examples + /// + ///``` + /// use lamellar::darc::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let five_task = GlobalRwDarc::new(&world,5).spawn(); + /// // do some other work + /// let five = five_task.block().expect("PE in world team"); + #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] + pub fn spawn(mut self) -> LamellarTask, IdError>> { + self.launched = true; + self.team.clone().spawn(self) + } +} + +impl Future for GlobalRwDarcHandle { + type Output = Result, IdError>; + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + self.launched = true; + let mut this = self.project(); + let darc = ready!(this.creation_future.as_mut().poll(cx))?; + Poll::Ready(Ok(GlobalRwDarc { darc })) + } +} diff --git a/src/darc/local_rw_darc.rs b/src/darc/local_rw_darc.rs index 6d4689f8..ec5666a3 100644 --- a/src/darc/local_rw_darc.rs +++ b/src/darc/local_rw_darc.rs @@ -11,13 +11,14 @@ use crate::lamellae::LamellaeRDMA; use crate::lamellar_team::IntoLamellarTeam; use crate::{IdError, LamellarEnv, LamellarTeam}; +use super::handle::LocalRwDarcHandle; pub(crate) use super::handle::{ IntoDarcHandle, IntoGlobalRwDarcHandle, LocalRwDarcReadHandle, LocalRwDarcWriteHandle, }; #[derive(Debug)] pub struct LocalRwDarcReadGuard { - pub(crate) darc: LocalRwDarc, + pub(crate) _darc: LocalRwDarc, pub(crate) lock: RwLockReadGuardArc, } @@ -46,7 +47,7 @@ impl std::ops::Deref for LocalRwDarcReadGuard { #[derive(Debug)] pub struct LocalRwDarcWriteGuard { - pub(crate) darc: LocalRwDarc, + pub(crate) _darc: LocalRwDarc, pub(crate) lock: RwLockWriteGuardArc, } @@ -221,7 +222,7 @@ impl LocalRwDarc { /// //------------- /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let counter = LocalRwDarc::new(&world, 0).unwrap(); + /// let counter = LocalRwDarc::new(&world, 0).block().unwrap(); /// let _ = world.exec_am_all(DarcAm {counter: counter.clone()}).spawn(); /// let guard = counter.read().block(); //we can also explicitly block on the lock in a non async context 
/// println!("the current counter value on pe {} main thread = {}",my_pe,*guard); @@ -263,7 +264,7 @@ impl LocalRwDarc { /// //------------- /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let counter = LocalRwDarc::new(&world, 0).unwrap(); + /// let counter = LocalRwDarc::new(&world, 0).block().unwrap(); /// let _ = world.exec_am_all(DarcAm {counter: counter.clone()}).spawn(); /// let mut guard = counter.write().block(); //we can also explicitly block on the lock in a non async context /// *guard += my_pe; @@ -272,9 +273,7 @@ impl LocalRwDarc { pub fn write(&self) -> LocalRwDarcWriteHandle { LocalRwDarcWriteHandle::new(self.clone()) } -} -impl LocalRwDarc { #[doc(alias = "Collective")] /// Constructs a new `LocalRwDarc` on the PEs specified by team. /// @@ -292,12 +291,23 @@ impl LocalRwDarc { /// /// let world = LamellarWorldBuilder::new().build(); /// - /// let five = LocalRwDarc::new(&world,5).expect("PE in world team"); + /// let five = LocalRwDarc::new(&world,5).block().expect("PE in world team"); /// ``` - pub fn new>(team: U, item: T) -> Result, IdError> { - Ok(LocalRwDarc { - darc: Darc::try_new(team, Arc::new(RwLock::new(item)), DarcMode::LocalRw)?, - }) + pub fn new>(team: U, item: T) -> LocalRwDarcHandle { + // Ok(LocalRwDarc { + // darc: Darc::try_new(team, Arc::new(RwLock::new(item)), DarcMode::LocalRw)?, + // }) + let team = team.into().team.clone(); + LocalRwDarcHandle { + team: team.clone(), + launched: false, + creation_future: Box::pin(Darc::async_try_new_with_drop( + team, + Arc::new(RwLock::new(item)), + DarcMode::LocalRw, + None, + )), + } } // pub(crate) fn try_new>(team: U, item: T) -> Result, IdError> { @@ -328,7 +338,7 @@ impl LocalRwDarc { /// /// let world = LamellarWorldBuilder::new().build(); /// - /// let five = LocalRwDarc::new(&world,5).expect("PE in world team"); + /// let five = LocalRwDarc::new(&world,5).block().expect("PE in world team"); /// let five_as_globaldarc = world.block_on(async move {five.into_globalrw().await}); /// ``` pub fn into_globalrw(self) -> IntoGlobalRwDarcHandle { @@ -369,7 +379,7 @@ impl LocalRwDarc { /// /// let world = LamellarWorldBuilder::new().build(); /// - /// let five = LocalRwDarc::new(&world,5).expect("PE in world team"); + /// let five = LocalRwDarc::new(&world,5).block().expect("PE in world team"); /// let five_as_darc = five.into_darc().block(); /// ``` pub fn into_darc(self) -> IntoDarcHandle { diff --git a/src/lamellar_team.rs b/src/lamellar_team.rs index 9b24a24b..e1446d75 100644 --- a/src/lamellar_team.rs +++ b/src/lamellar_team.rs @@ -7,6 +7,7 @@ use crate::lamellar_arch::{GlobalArch, IdError, LamellarArch, LamellarArchEnum, use crate::lamellar_env::LamellarEnv; use crate::lamellar_request::*; use crate::lamellar_world::LamellarWorld; +use crate::memregion::handle::SharedMemoryRegionHandle; use crate::memregion::{ one_sided::OneSidedMemoryRegion, shared::SharedMemoryRegion, Dist, LamellarMemoryRegion, MemoryRegion, RemoteMemoryRegion, @@ -82,7 +83,7 @@ use std::marker::PhantomData; /// let req = even_pes.exec_am_all(MyAm{world_pe,team_pe}); /// let result = even_pes.block_on(req); /// // we can also create a distributed array so that its data only resides on the members of the team. 
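Array constructors change in the same way as the Darc constructors: `new(...)` now yields a handle and `.block()` performs the collective creation, as the doc line updated just below shows. A minimal sketch using the world team (a sub-team handle such as the example's `even_pes` is passed in exactly the same way):

```rust
use lamellar::array::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();

    // Driving the handle with .block() performs the collective array creation.
    let array: AtomicArray<usize> = AtomicArray::new(&world, 100, Distribution::Block).block();

    // Initialize each PE's local portion in parallel, then synchronize.
    let _ = array
        .dist_iter_mut()
        .enumerate()
        .for_each(move |(i, elem)| elem.store(i))
        .block();
    array.barrier();
}
```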
-/// let array: AtomicArray = AtomicArray::new(&even_pes, 100,Distribution::Block); +/// let array: AtomicArray = AtomicArray::new(&even_pes, 100,Distribution::Block).block(); /// ``` pub struct LamellarTeam { pub(crate) world: Option>, @@ -572,41 +573,45 @@ impl ActiveMessaging for Arc { impl RemoteMemoryRegion for Arc { //#[tracing::instrument(skip_all)] - fn alloc_shared_mem_region(&self, size: usize) -> SharedMemoryRegion { + fn alloc_shared_mem_region(&self, size: usize) -> SharedMemoryRegionHandle { assert!(self.panic.load(Ordering::SeqCst) == 0); - self.team.barrier.barrier(); - let mr: SharedMemoryRegion = if self.team.num_world_pes == self.team.num_pes { - SharedMemoryRegion::new(size, self.team.clone(), AllocationType::Global) + // self.team.barrier.barrier(); + let mr = if self.team.num_world_pes == self.team.num_pes { + SharedMemoryRegion::try_new(size, self.team.clone(), AllocationType::Global) } else { - SharedMemoryRegion::new( + SharedMemoryRegion::try_new( size, self.team.clone(), AllocationType::Sub(self.team.arch.team_iter().collect::>()), ) }; - self.team.barrier.barrier(); + // self.team.barrier.barrier(); mr } //#[tracing::instrument(skip_all)] - fn alloc_one_sided_mem_region(&self, size: usize) -> OneSidedMemoryRegion { + fn alloc_one_sided_mem_region( + &self, + size: usize, + ) -> Result, anyhow::Error> { assert!(self.panic.load(Ordering::SeqCst) == 0); - let mut lmr = OneSidedMemoryRegion::try_new(size, &self.team, self.team.lamellae.clone()); - while let Err(_err) = lmr { - std::thread::yield_now(); - // println!( - // "out of Lamellar mem trying to alloc new pool {:?} {:?}", - // size, - // std::mem::size_of::() - // ); - self.team - .lamellae - .alloc_pool(size * std::mem::size_of::()); - lmr = OneSidedMemoryRegion::try_new(size, &self.team, self.team.lamellae.clone()); - } - lmr.expect("out of memory") + let lmr = OneSidedMemoryRegion::try_new(size, &self.team, self.team.lamellae.clone()); + // while let Err(_err) = lmr { + // std::thread::yield_now(); + // // println!( + // // "out of Lamellar mem trying to alloc new pool {:?} {:?}", + // // size, + // // std::mem::size_of::() + // // ); + // self.team + // .lamellae + // .alloc_pool(size * std::mem::size_of::()); + // lmr = OneSidedMemoryRegion::try_new(size, &self.team, self.team.lamellae.clone()); + // } + // lmr.expect("out of memory") + lmr } } @@ -2187,7 +2192,36 @@ impl LamellarTeamRT { pub fn alloc_one_sided_mem_region( self: &Pin>, size: usize, - ) -> OneSidedMemoryRegion { + ) -> Result, anyhow::Error> { + // let lmr: OneSidedMemoryRegion = + // OneSidedMemoryRegion::new(size, self, self.lamellae.clone()).into(); + // lmr + let lmr = OneSidedMemoryRegion::try_new(size, self, self.lamellae.clone()); + // while let Err(_err) = lmr { + // std::thread::yield_now(); + // // println!( + // // "out of Lamellar mem trying to alloc new pool {:?} {:?}", + // // size, + // // std::mem::size_of::() + // // ); + // self.lamellae.alloc_pool(size * std::mem::size_of::()); + // lmr = OneSidedMemoryRegion::try_new(size, self, self.lamellae.clone()); + // } + // lmr.expect("out of memory") + lmr + } + + /// allocate a local memory region from the asymmetric heap + /// + /// # Arguments + /// + /// * `size` - number of elements of T to allocate a memory region for -- (not size in bytes) + /// + //#[tracing::instrument(skip_all)] + pub(crate) fn alloc_one_sided_mem_region_or_panic( + self: &Pin>, + size: usize, + ) -> OneSidedMemoryRegion{ // let lmr: OneSidedMemoryRegion = // OneSidedMemoryRegion::new(size, self, 
self.lamellae.clone()).into(); // lmr diff --git a/src/lamellar_world.rs b/src/lamellar_world.rs index 6e27f809..c9aea9ff 100644 --- a/src/lamellar_world.rs +++ b/src/lamellar_world.rs @@ -3,8 +3,9 @@ use crate::lamellae::{create_lamellae, Backend, Lamellae, LamellaeComm, Lamellae use crate::lamellar_arch::LamellarArch; use crate::lamellar_env::LamellarEnv; use crate::lamellar_team::{LamellarTeam, LamellarTeamRT}; +use crate::memregion::handle::SharedMemoryRegionHandle; use crate::memregion::{ - one_sided::OneSidedMemoryRegion, shared::SharedMemoryRegion, Dist, RemoteMemoryRegion, + one_sided::OneSidedMemoryRegion, Dist, RemoteMemoryRegion, }; use crate::scheduler::{create_scheduler, ExecutorType, LamellarTask}; use crate::{active_messaging::*, config}; @@ -133,13 +134,16 @@ impl ActiveMessaging for LamellarWorld { impl RemoteMemoryRegion for LamellarWorld { //#[tracing::instrument(skip_all)] - fn alloc_shared_mem_region(&self, size: usize) -> SharedMemoryRegion { + fn alloc_shared_mem_region(&self, size: usize) -> SharedMemoryRegionHandle { self.barrier(); self.team.alloc_shared_mem_region::(size) } //#[tracing::instrument(skip_all)] - fn alloc_one_sided_mem_region(&self, size: usize) -> OneSidedMemoryRegion { + fn alloc_one_sided_mem_region( + &self, + size: usize, + ) -> Result, anyhow::Error> { self.team.alloc_one_sided_mem_region::(size) } } diff --git a/src/lib.rs b/src/lib.rs index 96a9f9f1..8b76141a 100755 --- a/src/lib.rs +++ b/src/lib.rs @@ -134,7 +134,7 @@ //! fn main(){ //! let world = lamellar::LamellarWorldBuilder::new().build(); //! let my_pe = world.my_pe(); -//! let block_array = AtomicArray::::new(&world, 1000, Distribution::Block); //we also support Cyclic distribution. +//! let block_array = AtomicArray::::new(&world, 1000, Distribution::Block).block(); //we also support Cyclic distribution. //! let _ =block_array.dist_iter_mut().enumerate().for_each(move |(i,elem)| elem.store(i)).block(); //simultaneosuly initialize array accross all PEs, each pe only updates its local data //! block_array.barrier(); //! if my_pe == 0{ @@ -168,7 +168,7 @@ //! let mut world = lamellar::LamellarWorldBuilder::new().build(); //! let my_pe = world.my_pe(); //! let num_pes = world.num_pes(); -//! let cnt = Darc::new(&world, AtomicUsize::new(0)).expect("Current PE is in world team"); +//! let cnt = Darc::new(&world, AtomicUsize::new(0)).block().expect("Current PE is in world team"); //! for pe in 0..num_pes{ //! let _ = world.exec_am_pe(pe,DarcAm{cnt: cnt.clone()}).spawn(); // explicitly launch on each PE //! } diff --git a/src/memregion.rs b/src/memregion.rs index af499e5e..05d47700 100644 --- a/src/memregion.rs +++ b/src/memregion.rs @@ -27,6 +27,9 @@ pub use shared::SharedMemoryRegion; pub(crate) mod one_sided; pub use one_sided::OneSidedMemoryRegion; +pub(crate) mod handle; +use handle::SharedMemoryRegionHandle; + use enum_dispatch::enum_dispatch; /// This error occurs when you are trying to directly access data locally on a PE through a memregion handle, @@ -1218,7 +1221,7 @@ pub trait RemoteMemoryRegion { fn alloc_shared_mem_region( &self, size: usize, - ) -> SharedMemoryRegion; + ) -> SharedMemoryRegionHandle; #[doc(alias("One-sided", "onesided"))] /// Allocate a one-sided memory region from the internal lamellar heap. 
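With this change `alloc_one_sided_mem_region` reports allocation failure to the caller instead of retrying and growing the pool internally, and `alloc_shared_mem_region` hands back a `SharedMemoryRegionHandle`. A sketch of handling the new `Result`; the request sizes and the fallback strategy are illustrative, not part of the patch:

```rust
use lamellar::memregion::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();

    // Allocation failure is now surfaced as an error rather than handled by an internal retry loop.
    let region = match world.alloc_one_sided_mem_region::<usize>(1_000_000) {
        Ok(region) => region,
        Err(err) => {
            // The caller decides how to react, e.g. retry with a smaller request.
            eprintln!("large allocation failed ({}), falling back", err);
            world
                .alloc_one_sided_mem_region::<usize>(1_000)
                .expect("asymmetric heap exhausted")
        }
    };
    println!("allocated {} elements", region.len());
}
```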
@@ -1232,7 +1235,7 @@ pub trait RemoteMemoryRegion { fn alloc_one_sided_mem_region( &self, size: usize, - ) -> OneSidedMemoryRegion; + ) -> Result, anyhow::Error>; } impl Drop for MemoryRegion { diff --git a/src/memregion/shared.rs b/src/memregion/shared.rs index 39d975a7..da2662ff 100644 --- a/src/memregion/shared.rs +++ b/src/memregion/shared.rs @@ -85,29 +85,53 @@ impl crate::active_messaging::DarcSerde for SharedMemoryRegion { } impl SharedMemoryRegion { - pub(crate) fn new( - size: usize, - team: Pin>, - alloc: AllocationType, - ) -> SharedMemoryRegion { - SharedMemoryRegion::try_new(size, team, alloc).expect("Out of memory") - } + // pub(crate) fn new( + // size: usize, + // team: Pin>, + // alloc: AllocationType, + // ) -> SharedMemoryRegionHandle { + // SharedMemoryRegion::try_new(size, team, alloc).expect("Out of memory") + // } pub(crate) fn try_new( size: usize, team: Pin>, alloc: AllocationType, - ) -> Result, anyhow::Error> { + ) -> SharedMemoryRegionHandle { // println!("creating new shared mem region {:?} {:?}",size,alloc); - let mr_t: MemoryRegion = MemoryRegion::try_new(size, team.lamellae.clone(), alloc)?; - let mr = unsafe { mr_t.to_base::() }; - Ok(SharedMemoryRegion { - mr: Darc::try_new(team.clone(), mr, crate::darc::DarcMode::Darc) - .expect("memregions can only be created on a member of the team"), - sub_region_offset: 0, - sub_region_size: size, - phantom: PhantomData, - }) + + // Ok(SharedMemoryRegion { + // mr: Darc::try_new(team.clone(), mr, crate::darc::DarcMode::Darc) + // .expect("memregions can only be created on a member of the team"), + // sub_region_offset: 0, + // sub_region_size: size, + // phantom: PhantomData, + // }) + + SharedMemoryRegionHandle { + team: team.clone(), + launched: false, + creation_future: Box::pin(async move { + team.async_barrier().await; + let mr_t: MemoryRegion = + MemoryRegion::try_new(size, team.lamellae.clone(), alloc)?; + let mr = unsafe { mr_t.to_base::() }; + let res: Result, anyhow::Error> = Ok(SharedMemoryRegion { + mr: Darc::async_try_new_with_drop( + team.clone(), + mr, + crate::darc::DarcMode::Darc, + None, + ) + .await + .expect("memregions can only be created on a member of the team"), + sub_region_offset: 0, + sub_region_size: size, + phantom: PhantomData, + }); + res + }), + } } } diff --git a/tests/add.rs b/tests/add.rs index 598b916b..15995de0 100644 --- a/tests/add.rs +++ b/tests/add.rs @@ -35,7 +35,7 @@ macro_rules! create_test { .arg("-T=4") .arg("./target/release/examples/add_test") .arg(stringify!($array)) - .arg($dist) + .arg($dist).block(); .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); diff --git a/tests/and.rs b/tests/and.rs index 33f1fc9b..7ff22aba 100644 --- a/tests/and.rs +++ b/tests/and.rs @@ -16,7 +16,7 @@ macro_rules! create_test { .arg("-T=4") .arg("./target/release/examples/and_test") .arg(stringify!($array)) - .arg($dist) + .arg($dist).block(); .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); diff --git a/tests/array/arithmetic_ops/add_test.rs b/tests/array/arithmetic_ops/add_test.rs index 387f44fa..4312e379 100644 --- a/tests/array/arithmetic_ops/add_test.rs +++ b/tests/array/arithmetic_ops/add_test.rs @@ -83,7 +83,7 @@ macro_rules! 
add_test{ let _rand_idx = Uniform::from(0..array_total_len); #[allow(unused_mut)] let mut success = true; - let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).into(); //convert into abstract LamellarArray, distributed len is total_len + let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).block().into(); //convert into abstract LamellarArray, distributed len is total_len let pe_max_val: $t = if std::any::TypeId::of::<$t>() == std::any::TypeId::of::(){ 9 as $t @@ -323,173 +323,194 @@ macro_rules! check_results { }; } -macro_rules! input_test{ - ($array:ident, $len:expr, $dist:ident) =>{ - { - let world = lamellar::LamellarWorldBuilder::new().build(); - let num_pes = world.num_pes(); - let _my_pe = world.my_pe(); - let array_total_len = $len; +macro_rules! input_test { + ($array:ident, $len:expr, $dist:ident) => {{ + let world = lamellar::LamellarWorldBuilder::new().build(); + let num_pes = world.num_pes(); + let _my_pe = world.my_pe(); + let array_total_len = $len; - // let mut success = true; - let array: $array:: = $array::::new(world.team(), array_total_len, $dist).into(); //convert into abstract LamellarArray, distributed len is total_len - let input_array: UnsafeArray:: = UnsafeArray::::new(world.team(), array_total_len*num_pes, $dist).into(); //convert into abstract LamellarArray, distributed len is total_len - let init_val=0; - initialize_array!($array, array, init_val); - #[allow(unused_unsafe)] - unsafe { - if $dist == lamellar::array::Distribution::Block{ - let _ = input_array.dist_iter_mut().enumerate().for_each(move |(i,x)| { + // let mut success = true; + let array: $array = $array::::new(world.team(), array_total_len, $dist) + .block() + .into(); //convert into abstract LamellarArray, distributed len is total_len + let input_array: UnsafeArray = + UnsafeArray::::new(world.team(), array_total_len * num_pes, $dist) + .block() + .into(); //convert into abstract LamellarArray, distributed len is total_len + let init_val = 0; + initialize_array!($array, array, init_val); + #[allow(unused_unsafe)] + unsafe { + if $dist == lamellar::array::Distribution::Block { + let _ = input_array + .dist_iter_mut() + .enumerate() + .for_each(move |(i, x)| { // println!("i: {:?}",i); - *x = i%array_total_len} - ).block(); - } - else{ - let _ = input_array.dist_iter_mut().enumerate().for_each(move |(i,x)| { + *x = i % array_total_len + }) + .block(); + } else { + let _ = input_array + .dist_iter_mut() + .enumerate() + .for_each(move |(i, x)| { //println!("i: {:?}",i); - *x = i/num_pes} - ).block(); - } - } - input_array.barrier(); - input_array.print(); - //individual T------------------------------ - for i in 0..array.len(){ - #[allow(unused_unsafe)] - let _ = unsafe{ array.batch_add(i,1).spawn()}; + *x = i / num_pes + }) + .block(); } - check_results!($array,array,num_pes,"T"); - println!("passed T"); - //individual T------------------------------ - for i in 0..array.len(){ - #[allow(unused_unsafe)] - let _ = unsafe{ array.batch_add(&i,1).spawn()}; - } - check_results!($array,array,num_pes,"&T"); - println!("passed &T"); - //&[T]------------------------------ - let vec=(0..array.len()).collect::>(); + } + input_array.barrier(); + input_array.print(); + //individual T------------------------------ + for i in 0..array.len() { + #[allow(unused_unsafe)] + let _ = unsafe { array.batch_add(i, 1).spawn() }; + } + check_results!($array, array, num_pes, "T"); + println!("passed T"); + //individual T------------------------------ + for i in 
0..array.len() { + #[allow(unused_unsafe)] + let _ = unsafe { array.batch_add(&i, 1).spawn() }; + } + check_results!($array, array, num_pes, "&T"); + println!("passed &T"); + //&[T]------------------------------ + let vec = (0..array.len()).collect::>(); + let slice = &vec[..]; + #[allow(unused_unsafe)] + let _ = unsafe { array.batch_add(slice, 1).spawn() }; + check_results!($array, array, num_pes, "&[T]"); + println!("passed &[T]"); + //scoped &[T]------------------------------ + { + let vec = (0..array.len()).collect::>(); let slice = &vec[..]; #[allow(unused_unsafe)] - let _ = unsafe{ array.batch_add(slice,1).spawn()}; - check_results!($array,array,num_pes,"&[T]"); - println!("passed &[T]"); - //scoped &[T]------------------------------ - { - let vec=(0..array.len()).collect::>(); - let slice = &vec[..]; - #[allow(unused_unsafe)] - let _ = unsafe{ array.batch_add(slice,1).spawn()}; - } - check_results!($array,array,num_pes,"scoped &[T]"); - println!("passed scoped &[T]"); - // Vec------------------------------ - let vec=(0..array.len()).collect::>(); + let _ = unsafe { array.batch_add(slice, 1).spawn() }; + } + check_results!($array, array, num_pes, "scoped &[T]"); + println!("passed scoped &[T]"); + // Vec------------------------------ + let vec = (0..array.len()).collect::>(); + #[allow(unused_unsafe)] + let _ = unsafe { array.batch_add(vec, 1).spawn() }; + check_results!($array, array, num_pes, "Vec"); + println!("passed Vec"); + // &Vec------------------------------ + let vec = (0..array.len()).collect::>(); + #[allow(unused_unsafe)] + let _ = unsafe { array.batch_add(&vec, 1).spawn() }; + check_results!($array, array, num_pes, "&Vec"); + println!("passed &Vec"); + // Scoped Vec------------------------------ + { + let vec = (0..array.len()).collect::>(); #[allow(unused_unsafe)] - let _ = unsafe{ array.batch_add(vec,1).spawn()}; - check_results!($array,array,num_pes,"Vec"); - println!("passed Vec"); - // &Vec------------------------------ - let vec=(0..array.len()).collect::>(); + let _ = unsafe { array.batch_add(vec, 1).spawn() }; + } + check_results!($array, array, num_pes, "scoped Vec"); + println!("passed scoped Vec"); + // Scoped &Vec------------------------------ + { + let vec = (0..array.len()).collect::>(); #[allow(unused_unsafe)] - let _ = unsafe{ array.batch_add(&vec,1).spawn()}; - check_results!($array,array,num_pes,"&Vec"); - println!("passed &Vec"); - // Scoped Vec------------------------------ - { - let vec=(0..array.len()).collect::>(); - #[allow(unused_unsafe)] - let _ = unsafe{ array.batch_add(vec,1).spawn()}; - } - check_results!($array,array,num_pes,"scoped Vec"); - println!("passed scoped Vec"); - // Scoped &Vec------------------------------ - { - let vec=(0..array.len()).collect::>(); - #[allow(unused_unsafe)] - let _ = unsafe{ array.batch_add(&vec,1).spawn()}; - } - check_results!($array,array,num_pes,"scoped &Vec"); - println!("passed scoped &Vec"); + let _ = unsafe { array.batch_add(&vec, 1).spawn() }; + } + check_results!($array, array, num_pes, "scoped &Vec"); + println!("passed scoped &Vec"); - // LMR------------------------------ + // LMR------------------------------ - unsafe{ - let lmr=world.alloc_one_sided_mem_region(array.len()); - let slice = lmr.as_mut_slice().unwrap(); - for i in 0..array.len(){ - slice[i]=i; - } - let _ = array.batch_add(slice,1).spawn(); - check_results!($array,array,num_pes,"LMR"); - println!("passed LMR"); + unsafe { + let lmr = world.alloc_one_sided_mem_region(array.len()).unwrap(); + let slice = lmr.as_mut_slice().unwrap(); + 
for i in 0..array.len() { + slice[i] = i; } + let _ = array.batch_add(slice, 1).spawn(); + check_results!($array, array, num_pes, "LMR"); + println!("passed LMR"); + } + // SMR------------------------------ + unsafe { + let smr = world.alloc_shared_mem_region(array.len()).block().unwrap(); - // SMR------------------------------ - unsafe{ - let smr=world.alloc_shared_mem_region(array.len()); - - let slice = smr.as_mut_slice().unwrap(); - for i in 0..array.len(){ - slice[i]=i; - } - - let _ = array.batch_add(slice,1).spawn(); - check_results!($array,array,num_pes,"SMR"); - println!("passed SMR"); + let slice = smr.as_mut_slice().unwrap(); + for i in 0..array.len() { + slice[i] = i; } - // UnsafeArray------------------------------ - // array.add(input_array.clone(),1); - // check_results!($array,array,num_pes,"UnsafeArray"); - // UnsafeArray------------------------------ - #[allow(unused_unsafe)] - let _ = unsafe{ array.batch_add(unsafe{input_array.local_data()},1).spawn()}; - check_results!($array,array,num_pes,"&UnsafeArray"); - println!("passed &UnsafeArray"); + let _ = array.batch_add(slice, 1).spawn(); + check_results!($array, array, num_pes, "SMR"); + println!("passed SMR"); + } - // ReadOnlyArray------------------------------ - let input_array = input_array.into_read_only(); - // array.add(input_array.clone(),1); - // check_results!($array,array,num_pes,"ReadOnlyArray"); - // ReadOnlyArray------------------------------ - #[allow(unused_unsafe)] - let _ = unsafe{ array.batch_add(input_array.local_data(),1).spawn()}; - check_results!($array,array,num_pes,"&ReadOnlyArray"); - println!("passed &ReadOnlyArray"); + // UnsafeArray------------------------------ + // array.add(input_array.clone(),1); + // check_results!($array,array,num_pes,"UnsafeArray"); + // UnsafeArray------------------------------ + #[allow(unused_unsafe)] + let _ = unsafe { + array + .batch_add(unsafe { input_array.local_data() }, 1) + .spawn() + }; + check_results!($array, array, num_pes, "&UnsafeArray"); + println!("passed &UnsafeArray"); - // AtomicArray------------------------------ - let input_array = input_array.into_atomic(); - // array.add(input_array.clone(),1); - // check_results!($array,array,num_pes,"AtomicArray"); - // AtomicArray------------------------------ - #[allow(unused_unsafe)] - let _ = unsafe{ array.batch_add(&input_array.local_data(),1).spawn()}; - check_results!($array,array,num_pes,"&AtomicArray"); - println!("passed &AtomicArray"); + // ReadOnlyArray------------------------------ + let input_array = input_array.into_read_only(); + // array.add(input_array.clone(),1); + // check_results!($array,array,num_pes,"ReadOnlyArray"); + // ReadOnlyArray------------------------------ + #[allow(unused_unsafe)] + let _ = unsafe { array.batch_add(input_array.local_data(), 1).spawn() }; + check_results!($array, array, num_pes, "&ReadOnlyArray"); + println!("passed &ReadOnlyArray"); - // LocalLockArray------------------------------ - let input_array = input_array.into_local_lock(); - // array.add(input_array.clone(),1); - // check_results!($array,array,num_pes,"LocalLockArray"); - // LocalLockArray------------------------------ - #[allow(unused_unsafe)] - let _ = unsafe{ array.batch_add(&input_array.read_local_data().block(),1).spawn()}; - check_results!($array,array,num_pes,"&LocalLockArray"); - println!("passed &LocalLockArray"); + // AtomicArray------------------------------ + let input_array = input_array.into_atomic(); + // array.add(input_array.clone(),1); + // 
check_results!($array,array,num_pes,"AtomicArray"); + // AtomicArray------------------------------ + #[allow(unused_unsafe)] + let _ = unsafe { array.batch_add(&input_array.local_data(), 1).spawn() }; + check_results!($array, array, num_pes, "&AtomicArray"); + println!("passed &AtomicArray"); - // GlobalLockArray------------------------------ - let input_array = input_array.into_global_lock(); - // array.add(input_array.clone(),1); - // check_results!($array,array,num_pes,"GlobalLockArray"); - // GlobalLockArray------------------------------ - #[allow(unused_unsafe)] - let _ = unsafe{ array.batch_add(&input_array.read_local_data().block(),1).spawn()}; - check_results!($array,array,num_pes,"&GlobalLockArray"); - println!("passed &GlobalLockArray"); - } - } + // LocalLockArray------------------------------ + let input_array = input_array.into_local_lock(); + // array.add(input_array.clone(),1); + // check_results!($array,array,num_pes,"LocalLockArray"); + // LocalLockArray------------------------------ + #[allow(unused_unsafe)] + let _ = unsafe { + array + .batch_add(&input_array.read_local_data().block(), 1) + .spawn() + }; + check_results!($array, array, num_pes, "&LocalLockArray"); + println!("passed &LocalLockArray"); + + // GlobalLockArray------------------------------ + let input_array = input_array.into_global_lock(); + // array.add(input_array.clone(),1); + // check_results!($array,array,num_pes,"GlobalLockArray"); + // GlobalLockArray------------------------------ + #[allow(unused_unsafe)] + let _ = unsafe { + array + .batch_add(&input_array.read_local_data().block(), 1) + .spawn() + }; + check_results!($array, array, num_pes, "&GlobalLockArray"); + println!("passed &GlobalLockArray"); + }}; } fn main() { diff --git a/tests/array/arithmetic_ops/div_test.rs b/tests/array/arithmetic_ops/div_test.rs index 29d88632..66b28316 100644 --- a/tests/array/arithmetic_ops/div_test.rs +++ b/tests/array/arithmetic_ops/div_test.rs @@ -84,7 +84,7 @@ macro_rules! div_test{ let array_total_len = $len; #[allow(unused_mut)] let mut success = true; - let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).into(); //convert into abstract LamellarArray, distributed len is total_len + let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).block().into(); //convert into abstract LamellarArray, distributed len is total_len let max_updates = max_updates!($t,num_pes); let max_val = 2u128.pow((max_updates*num_pes) as u32) as $t; diff --git a/tests/array/arithmetic_ops/fetch_add_test.rs b/tests/array/arithmetic_ops/fetch_add_test.rs index b2048841..8a5c80cd 100644 --- a/tests/array/arithmetic_ops/fetch_add_test.rs +++ b/tests/array/arithmetic_ops/fetch_add_test.rs @@ -123,7 +123,7 @@ macro_rules! fetch_add_test{ let rand_idx = Uniform::from(0..array_total_len); #[allow(unused_mut)] let mut success = true; - let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).into(); //convert into abstract LamellarArray, distributed len is total_len + let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).block().into(); //convert into abstract LamellarArray, distributed len is total_len let pe_max_val: $t = 10 as $t; let max_val = pe_max_val * num_pes as $t; @@ -435,8 +435,8 @@ macro_rules! 
input_test{ let array_total_len = $len; // let mut success = true; - let array: $array:: = $array::::new(world.team(), array_total_len, $dist).into(); //convert into abstract LamellarArray, distributed len is total_len - let input_array: UnsafeArray:: = UnsafeArray::::new(world.team(), array_total_len*num_pes, $dist).into(); //convert into abstract LamellarArray, distributed len is total_len + let array: $array:: = $array::::new(world.team(), array_total_len, $dist).block().into(); //convert into abstract LamellarArray, distributed len is total_len + let input_array: UnsafeArray:: = UnsafeArray::::new(world.team(), array_total_len*num_pes, $dist).block().into(); //convert into abstract LamellarArray, distributed len is total_len // let init_val=0; initialize_array2!($array, array, init_val); if $dist == lamellar::array::Distribution::Block{ @@ -524,7 +524,7 @@ macro_rules! input_test{ // scoped &LMR------------------------------ let mut reqs = vec![]; unsafe { - let lmr=world.alloc_one_sided_mem_region(array.len()); + let lmr=world.alloc_one_sided_mem_region(array.len()).unwrap(); let slice = lmr.as_mut_slice().unwrap(); for i in 0..array.len(){ slice[i]=i; @@ -536,7 +536,7 @@ macro_rules! input_test{ // scoped SMR------------------------------ let mut reqs = vec![]; unsafe { - let smr=world.alloc_shared_mem_region(array.len()); + let smr=world.alloc_shared_mem_region(array.len()).block().unwrap(); let slice = smr.as_mut_slice().unwrap(); for i in 0..array.len(){ slice[i]=i; diff --git a/tests/array/arithmetic_ops/fetch_div_test.rs b/tests/array/arithmetic_ops/fetch_div_test.rs index 427915a9..dd82db60 100644 --- a/tests/array/arithmetic_ops/fetch_div_test.rs +++ b/tests/array/arithmetic_ops/fetch_div_test.rs @@ -114,7 +114,7 @@ macro_rules! fetch_div_test{ let array_total_len = $len; #[allow(unused_mut)] let mut success = true; - let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).into(); //convert into abstract LamellarArray, distributed len is total_len + let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).block().into(); //convert into abstract LamellarArray, distributed len is total_len let max_updates = max_updates!($t,num_pes); let max_val = 2u128.pow((max_updates*num_pes) as u32) as $t; diff --git a/tests/array/arithmetic_ops/fetch_mul_test.rs b/tests/array/arithmetic_ops/fetch_mul_test.rs index 985954e9..76f901d9 100644 --- a/tests/array/arithmetic_ops/fetch_mul_test.rs +++ b/tests/array/arithmetic_ops/fetch_mul_test.rs @@ -109,7 +109,7 @@ macro_rules! fetch_mul_test{ let array_total_len = $len; #[allow(unused_mut)] let mut success = true; - let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).into(); //convert into abstract LamellarArray, distributed len is total_len + let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).block().into(); //convert into abstract LamellarArray, distributed len is total_len let max_updates = max_updates!($t,num_pes); let max_val = 2u128.pow((max_updates*num_pes) as u32) as $t; diff --git a/tests/array/arithmetic_ops/fetch_rem_test.rs b/tests/array/arithmetic_ops/fetch_rem_test.rs index 83b47839..4dddb209 100644 --- a/tests/array/arithmetic_ops/fetch_rem_test.rs +++ b/tests/array/arithmetic_ops/fetch_rem_test.rs @@ -114,7 +114,7 @@ macro_rules! 
fetch_rem_test{ let array_total_len = $len; #[allow(unused_mut)] let mut success = true; - let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).into(); //convert into abstract LamellarArray, distributed len is total_len + let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).block().into(); //convert into abstract LamellarArray, distributed len is total_len let max_updates = max_updates!($t,num_pes); let max_val = 2u128.pow((max_updates*num_pes) as u32) as $t; diff --git a/tests/array/arithmetic_ops/fetch_sub_test.rs b/tests/array/arithmetic_ops/fetch_sub_test.rs index 68ea6d04..eec3d49a 100644 --- a/tests/array/arithmetic_ops/fetch_sub_test.rs +++ b/tests/array/arithmetic_ops/fetch_sub_test.rs @@ -106,7 +106,7 @@ macro_rules! fetch_sub_test{ let mut rng = rand::thread_rng(); let rand_idx = Uniform::from(0..array_total_len); let mut success = true; - let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).into(); //convert into abstract LamellarArray, distributed len is total_len + let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).block().into(); //convert into abstract LamellarArray, distributed len is total_len let pe_max_val: $t = 10 as $t; let max_val = pe_max_val * num_pes as $t; diff --git a/tests/array/arithmetic_ops/mul_test.rs b/tests/array/arithmetic_ops/mul_test.rs index 4f76ddb9..76a7d57f 100644 --- a/tests/array/arithmetic_ops/mul_test.rs +++ b/tests/array/arithmetic_ops/mul_test.rs @@ -92,7 +92,7 @@ macro_rules! mul_test{ let array_total_len = $len; #[allow(unused_mut)] let mut success = true; - let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).into(); //convert into abstract LamellarArray, distributed len is total_len + let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).block().into(); //convert into abstract LamellarArray, distributed len is total_len let max_updates = max_updates!($t,num_pes); let max_val = 2u128.pow((max_updates*num_pes) as u32) as $t; diff --git a/tests/array/arithmetic_ops/rem_test.rs b/tests/array/arithmetic_ops/rem_test.rs index daf07dde..562a9005 100644 --- a/tests/array/arithmetic_ops/rem_test.rs +++ b/tests/array/arithmetic_ops/rem_test.rs @@ -84,7 +84,7 @@ macro_rules! rem_test{ let array_total_len = $len; #[allow(unused_mut)] let mut success = true; - let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).into(); //convert into abstract LamellarArray, distributed len is total_len + let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).block().into(); //convert into abstract LamellarArray, distributed len is total_len let max_updates = max_updates!($t,num_pes); let max_val = 2u128.pow((max_updates*num_pes) as u32) as $t; diff --git a/tests/array/arithmetic_ops/sub_test.rs b/tests/array/arithmetic_ops/sub_test.rs index 2cd8382b..84d89aed 100644 --- a/tests/array/arithmetic_ops/sub_test.rs +++ b/tests/array/arithmetic_ops/sub_test.rs @@ -91,7 +91,7 @@ macro_rules! 
sub_test{ let rand_idx = Uniform::from(0..array_total_len); #[allow(unused_mut)] let mut success = true; - let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).into(); //convert into abstract LamellarArray, distributed len is total_len + let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).block().into(); //convert into abstract LamellarArray, distributed len is total_len let pe_max_val: $t = 100 as $t; let max_val = pe_max_val * num_pes as $t; diff --git a/tests/array/array_into_test.rs b/tests/array/array_into_test.rs index fc7e40b8..cb27c5b4 100644 --- a/tests/array/array_into_test.rs +++ b/tests/array/array_into_test.rs @@ -5,7 +5,7 @@ macro_rules! into_test { let _num_pes = world.num_pes(); let _my_pe = world.my_pe(); - let array = $array1::::new(world.clone(), 1000, Distribution::Block); + let array = $array1::::new(world.clone(), 1000, Distribution::Block).block(); let _array2: $array2 = array.into(); }}; } diff --git a/tests/array/atomic_ops/compare_exchange_test.rs b/tests/array/atomic_ops/compare_exchange_test.rs index 46d9f906..ddc36ed7 100644 --- a/tests/array/atomic_ops/compare_exchange_test.rs +++ b/tests/array/atomic_ops/compare_exchange_test.rs @@ -55,341 +55,365 @@ macro_rules! check_val { }; } -macro_rules! compare_exchange_test{ - ($array:ident, $t:ty, $len:expr, $dist:ident) =>{ - { - let world = lamellar::LamellarWorldBuilder::new().build(); - let num_pes = world.num_pes(); - let my_pe = world.my_pe(); - let array_total_len = $len; - #[allow(unused_mut)] - let mut success = true; - let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).into(); //convert into abstract LamellarArray, distributed len is total_len - - let init_val =(num_pes as $t); - initialize_array!($array, array, init_val); - array.wait_all(); - array.barrier(); - - let mut reqs = vec![]; - for idx in 0..array.len(){ - if idx%num_pes == my_pe{ - #[allow(unused_unsafe)] - reqs.push((unsafe{array.compare_exchange(idx,init_val, my_pe as $t)},idx)); - } +macro_rules! 
compare_exchange_test { + ($array:ident, $t:ty, $len:expr, $dist:ident) => {{ + let world = lamellar::LamellarWorldBuilder::new().build(); + let num_pes = world.num_pes(); + let my_pe = world.my_pe(); + let array_total_len = $len; + #[allow(unused_mut)] + let mut success = true; + let array: $array<$t> = $array::<$t>::new(world.team(), array_total_len, $dist) + .block() + .into(); //convert into abstract LamellarArray, distributed len is total_len + + let init_val = (num_pes as $t); + initialize_array!($array, array, init_val); + array.wait_all(); + array.barrier(); + + let mut reqs = vec![]; + for idx in 0..array.len() { + if idx % num_pes == my_pe { + #[allow(unused_unsafe)] + reqs.push(( + unsafe { array.compare_exchange(idx, init_val, my_pe as $t) }, + idx, + )); } - for (req,idx) in reqs{ - match world.block_on(req){ - Ok(val) => { - check_val!($array,val,init_val,success); - if !success{ - eprintln!("{:?} {:?} {:?}",idx,val,init_val); - } - } - Err(val) => { - println!("returned error {:?} {:?} {:?}",idx,val,init_val); + } + for (req, idx) in reqs { + match world.block_on(req) { + Ok(val) => { + check_val!($array, val, init_val, success); + if !success { + eprintln!("{:?} {:?} {:?}", idx, val, init_val); } } + Err(val) => { + println!("returned error {:?} {:?} {:?}", idx, val, init_val); + } } - array.wait_all(); - array.barrier(); - let mut reqs = vec![]; - for idx in 0..array.len(){ //these should all fail + } + array.wait_all(); + array.barrier(); + let mut reqs = vec![]; + for idx in 0..array.len() { + //these should all fail + #[allow(unused_unsafe)] + reqs.push(( + unsafe { array.compare_exchange(idx, init_val, my_pe as $t) }, + idx, + )); + } + for (req, idx) in reqs { + match world.block_on(req) { + Ok(val) => { + println!("returned ok {:?} {:?} {:?}", idx, val, init_val); + } + Err(_) => {} + } + } + array.barrier(); + initialize_array!($array, array, init_val); + array.wait_all(); + array.barrier(); + + let half_len = array_total_len / 2; + let start_i = half_len / 2; + let end_i = start_i + half_len; + let sub_array = array.sub_array(start_i..end_i); + sub_array.barrier(); + let mut reqs = vec![]; + for idx in 0..sub_array.len() { + if idx % num_pes == my_pe { #[allow(unused_unsafe)] - reqs.push((unsafe{array.compare_exchange(idx,init_val,my_pe as $t)},idx)); + reqs.push(( + unsafe { sub_array.compare_exchange(idx, init_val, my_pe as $t) }, + idx, + )); } - for (req,idx) in reqs{ - match world.block_on(req){ - Ok(val) => { - println!("returned ok {:?} {:?} {:?}",idx,val,init_val); - } - Err(_) => { - + } + for (req, idx) in reqs { + match world.block_on(req) { + Ok(val) => { + check_val!($array, val, init_val, success); + if !success { + eprintln!("{:?} {:?} {:?}", idx, val, init_val); } } + Err(val) => { + println!("returned error {:?} {:?} {:?}", idx, val, init_val); + } } - array.barrier(); - initialize_array!($array, array, init_val); - array.wait_all(); - array.barrier(); - - - - let half_len = array_total_len/2; - let start_i = half_len/2; - let end_i = start_i + half_len; + } + sub_array.wait_all(); + sub_array.barrier(); + let mut reqs = vec![]; + for idx in 0..sub_array.len() { + #[allow(unused_unsafe)] + reqs.push(( + unsafe { sub_array.compare_exchange(idx, init_val, my_pe as $t) }, + idx, + )); + } + for (req, idx) in reqs { + match world.block_on(req) { + Ok(val) => { + println!("returned ok {:?} {:?} {:?}", idx, val, init_val); + } + Err(_) => {} + } + } + sub_array.barrier(); + initialize_array!($array, array, init_val); + sub_array.wait_all(); + 
sub_array.barrier(); + + let pe_len = array_total_len / num_pes; + for pe in 0..num_pes { + let len = std::cmp::max(pe_len / 2, 1); + let start_i = (pe * pe_len) + len / 2; + let end_i = start_i + len; let sub_array = array.sub_array(start_i..end_i); sub_array.barrier(); let mut reqs = vec![]; - for idx in 0..sub_array.len(){ - if idx%num_pes == my_pe{ + for idx in 0..sub_array.len() { + if idx % num_pes == my_pe { #[allow(unused_unsafe)] - reqs.push((unsafe{sub_array.compare_exchange(idx,init_val,my_pe as $t)},idx)); + reqs.push(( + unsafe { sub_array.compare_exchange(idx, init_val, my_pe as $t) }, + idx, + )); } } - for (req,idx) in reqs{ - match world.block_on(req){ + for (req, idx) in reqs { + match world.block_on(req) { Ok(val) => { - check_val!($array,val,init_val,success); - if !success{ - eprintln!("{:?} {:?} {:?}",idx,val,init_val); + check_val!($array, val, init_val, success); + if !success { + eprintln!("{:?} {:?} {:?}", idx, val, init_val); } } Err(val) => { - println!("returned error {:?} {:?} {:?}",idx,val,init_val); + println!("returned error {:?} {:?} {:?}", idx, val, init_val); } } } sub_array.wait_all(); sub_array.barrier(); let mut reqs = vec![]; - for idx in 0..sub_array.len(){ - #[allow(unused_unsafe)] - reqs.push((unsafe{sub_array.compare_exchange(idx,init_val,my_pe as $t)},idx)); + for idx in 0..sub_array.len() { + if idx % num_pes == my_pe { + #[allow(unused_unsafe)] + reqs.push(( + unsafe { sub_array.compare_exchange(idx, init_val, my_pe as $t) }, + idx, + )); + } } - for (req,idx) in reqs{ - match world.block_on(req){ + for (req, idx) in reqs { + match world.block_on(req) { Ok(val) => { - println!("returned ok {:?} {:?} {:?}",idx,val,init_val); - } - Err(_) => { - + println!("returned ok {:?} {:?} {:?}", idx, val, init_val); } + Err(_) => {} } } sub_array.barrier(); initialize_array!($array, array, init_val); sub_array.wait_all(); sub_array.barrier(); + } + if !success { + eprintln!("failed"); + } + }}; +} - - let pe_len = array_total_len/num_pes; - for pe in 0..num_pes{ - let len = std::cmp::max(pe_len/2,1); - let start_i = (pe*pe_len)+ len/2; - let end_i = start_i+len; - let sub_array = array.sub_array(start_i..end_i); - sub_array.barrier(); - let mut reqs = vec![]; - for idx in 0..sub_array.len(){ - if idx%num_pes == my_pe{ - #[allow(unused_unsafe)] - reqs.push((unsafe{sub_array.compare_exchange(idx,init_val,my_pe as $t)},idx)); - } - } - for (req,idx) in reqs{ - match world.block_on(req){ - Ok(val) => { - check_val!($array,val,init_val,success); - if !success{ - eprintln!("{:?} {:?} {:?}",idx,val,init_val); - } - } - Err(val) => { - println!("returned error {:?} {:?} {:?}",idx,val,init_val); - } - } - } - sub_array.wait_all(); - sub_array.barrier(); - let mut reqs = vec![]; - for idx in 0..sub_array.len(){ - if idx%num_pes == my_pe{ - #[allow(unused_unsafe)] - reqs.push((unsafe{sub_array.compare_exchange(idx,init_val,my_pe as $t)},idx)); +macro_rules! 
compare_exchange_epsilon_test { + ($array:ident, $t:ty, $len:expr, $dist:ident) => {{ + let world = lamellar::LamellarWorldBuilder::new().build(); + let num_pes = world.num_pes(); + let my_pe = world.my_pe(); + let array_total_len = $len; + #[allow(unused_mut)] + let mut success = true; + let array: $array<$t> = $array::<$t>::new(world.team(), array_total_len, $dist) + .block() + .into(); //convert into abstract LamellarArray, distributed len is total_len + + let init_val = (num_pes as $t); + let epsilon = 0.0001 as $t; + initialize_array!($array, array, init_val); + array.wait_all(); + array.barrier(); + + let mut reqs = vec![]; + for idx in 0..array.len() { + if idx % num_pes == my_pe { + #[allow(unused_unsafe)] + reqs.push(( + unsafe { array.compare_exchange_epsilon(idx, init_val, my_pe as $t, epsilon) }, + idx, + )); + } + } + for (req, idx) in reqs { + match world.block_on(req) { + Ok(val) => { + check_val!($array, val, init_val, success); + if !success { + eprintln!("{:?} {:?} {:?}", idx, val, init_val); } } - for (req,idx) in reqs{ - match world.block_on(req){ - Ok(val) => { - println!("returned ok {:?} {:?} {:?}",idx,val,init_val); - } - Err(_) => { - - } - } + Err(val) => { + println!("returned error {:?} {:?} {:?}", idx, val, init_val); } - sub_array.barrier(); - initialize_array!($array, array, init_val); - sub_array.wait_all(); - sub_array.barrier(); - } - - if !success{ - eprintln!("failed"); } } - } -} - -macro_rules! compare_exchange_epsilon_test{ - ($array:ident, $t:ty, $len:expr, $dist:ident) =>{ - { - let world = lamellar::LamellarWorldBuilder::new().build(); - let num_pes = world.num_pes(); - let my_pe = world.my_pe(); - let array_total_len = $len; - #[allow(unused_mut)] - let mut success = true; - let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).into(); //convert into abstract LamellarArray, distributed len is total_len - - let init_val =(num_pes as $t); - let epsilon = 0.0001 as $t; - initialize_array!($array, array, init_val); - array.wait_all(); - array.barrier(); - - let mut reqs = vec![]; - for idx in 0..array.len(){ - if idx%num_pes == my_pe{ - #[allow(unused_unsafe)] - reqs.push((unsafe{array.compare_exchange_epsilon(idx,init_val, my_pe as $t,epsilon)},idx)); - } - } - for (req,idx) in reqs{ - match world.block_on(req){ - Ok(val) => { - check_val!($array,val,init_val,success); - if !success{ - eprintln!("{:?} {:?} {:?}",idx,val,init_val); - } - } - Err(val) => { - println!("returned error {:?} {:?} {:?}",idx,val,init_val); - } + array.wait_all(); + array.barrier(); + let mut reqs = vec![]; + for idx in 0..array.len() { + //these should all fail + #[allow(unused_unsafe)] + reqs.push(( + unsafe { array.compare_exchange_epsilon(idx, init_val, my_pe as $t, epsilon) }, + idx, + )); + } + for (req, idx) in reqs { + match world.block_on(req) { + Ok(val) => { + println!("returned ok {:?} {:?} {:?}", idx, val, init_val); } + Err(_) => {} } - array.wait_all(); - array.barrier(); - let mut reqs = vec![]; - for idx in 0..array.len(){ //these should all fail + } + array.barrier(); + initialize_array!($array, array, init_val); + array.wait_all(); + array.barrier(); + + let half_len = array_total_len / 2; + let start_i = half_len / 2; + let end_i = start_i + half_len; + let sub_array = array.sub_array(start_i..end_i); + sub_array.barrier(); + let mut reqs = vec![]; + for idx in 0..sub_array.len() { + if idx % num_pes == my_pe { #[allow(unused_unsafe)] - reqs.push((unsafe{array.compare_exchange_epsilon(idx,init_val,my_pe as $t,epsilon)},idx)); + 
reqs.push(( + unsafe { + sub_array.compare_exchange_epsilon(idx, init_val, my_pe as $t, epsilon) + }, + idx, + )); } - for (req,idx) in reqs{ - match world.block_on(req){ - Ok(val) => { - println!("returned ok {:?} {:?} {:?}",idx,val,init_val); - } - Err(_) => { - + } + for (req, idx) in reqs { + match world.block_on(req) { + Ok(val) => { + check_val!($array, val, init_val, success); + if !success { + eprintln!("{:?} {:?} {:?}", idx, val, init_val); } } + Err(val) => { + println!("returned error {:?} {:?} {:?}", idx, val, init_val); + } } - array.barrier(); - initialize_array!($array, array, init_val); - array.wait_all(); - array.barrier(); - - - - let half_len = array_total_len/2; - let start_i = half_len/2; - let end_i = start_i + half_len; + } + sub_array.wait_all(); + sub_array.barrier(); + let mut reqs = vec![]; + for idx in 0..sub_array.len() { + #[allow(unused_unsafe)] + reqs.push(( + unsafe { sub_array.compare_exchange_epsilon(idx, init_val, my_pe as $t, epsilon) }, + idx, + )); + } + for (req, idx) in reqs { + match world.block_on(req) { + Ok(val) => { + println!("returned ok {:?} {:?} {:?}", idx, val, init_val); + } + Err(_) => {} + } + } + sub_array.barrier(); + initialize_array!($array, array, init_val); + sub_array.wait_all(); + sub_array.barrier(); + + let pe_len = array_total_len / num_pes; + for pe in 0..num_pes { + let len = std::cmp::max(pe_len / 2, 1); + let start_i = (pe * pe_len) + len / 2; + let end_i = start_i + len; let sub_array = array.sub_array(start_i..end_i); sub_array.barrier(); let mut reqs = vec![]; - for idx in 0..sub_array.len(){ - if idx%num_pes == my_pe{ + for idx in 0..sub_array.len() { + if idx % num_pes == my_pe { #[allow(unused_unsafe)] - reqs.push((unsafe{sub_array.compare_exchange_epsilon(idx,init_val,my_pe as $t,epsilon)},idx)); + reqs.push(( + unsafe { + sub_array.compare_exchange_epsilon(idx, init_val, my_pe as $t, epsilon) + }, + idx, + )); } } - for (req,idx) in reqs{ - match world.block_on(req){ + for (req, idx) in reqs { + match world.block_on(req) { Ok(val) => { - check_val!($array,val,init_val,success); - if !success{ - eprintln!("{:?} {:?} {:?}",idx,val,init_val); + check_val!($array, val, init_val, success); + if !success { + eprintln!("{:?} {:?} {:?}", idx, val, init_val); } } Err(val) => { - println!("returned error {:?} {:?} {:?}",idx,val,init_val); + println!("returned error {:?} {:?} {:?}", idx, val, init_val); } } } sub_array.wait_all(); sub_array.barrier(); let mut reqs = vec![]; - for idx in 0..sub_array.len(){ - #[allow(unused_unsafe)] - reqs.push((unsafe{sub_array.compare_exchange_epsilon(idx,init_val,my_pe as $t,epsilon)},idx)); + for idx in 0..sub_array.len() { + if idx % num_pes == my_pe { + #[allow(unused_unsafe)] + reqs.push(( + unsafe { + sub_array.compare_exchange_epsilon(idx, init_val, my_pe as $t, epsilon) + }, + idx, + )); + } } - for (req,idx) in reqs{ - match world.block_on(req){ + for (req, idx) in reqs { + match world.block_on(req) { Ok(val) => { - println!("returned ok {:?} {:?} {:?}",idx,val,init_val); - } - Err(_) => { - + println!("returned ok {:?} {:?} {:?}", idx, val, init_val); } + Err(_) => {} } } sub_array.barrier(); initialize_array!($array, array, init_val); sub_array.wait_all(); sub_array.barrier(); + } - - - let pe_len = array_total_len/num_pes; - for pe in 0..num_pes{ - let len = std::cmp::max(pe_len/2,1); - let start_i = (pe*pe_len)+ len/2; - let end_i = start_i+len; - let sub_array = array.sub_array(start_i..end_i); - sub_array.barrier(); - let mut reqs = vec![]; - for idx in 0..sub_array.len(){ - 
if idx%num_pes == my_pe{ - #[allow(unused_unsafe)] - reqs.push((unsafe{sub_array.compare_exchange_epsilon(idx,init_val,my_pe as $t,epsilon)},idx)); - } - } - for (req,idx) in reqs{ - match world.block_on(req){ - Ok(val) => { - check_val!($array,val,init_val,success); - if !success{ - eprintln!("{:?} {:?} {:?}",idx,val,init_val); - } - } - Err(val) => { - println!("returned error {:?} {:?} {:?}",idx,val,init_val); - } - } - } - sub_array.wait_all(); - sub_array.barrier(); - let mut reqs = vec![]; - for idx in 0..sub_array.len(){ - if idx%num_pes == my_pe{ - #[allow(unused_unsafe)] - reqs.push((unsafe{sub_array.compare_exchange_epsilon(idx,init_val,my_pe as $t,epsilon)},idx)); - } - } - for (req,idx) in reqs{ - match world.block_on(req){ - Ok(val) => { - println!("returned ok {:?} {:?} {:?}",idx,val,init_val); - } - Err(_) => { - - } - } - } - sub_array.barrier(); - initialize_array!($array, array, init_val); - sub_array.wait_all(); - sub_array.barrier(); - } - - if !success{ - eprintln!("failed"); - } + if !success { + eprintln!("failed"); } - } + }}; } macro_rules! check_input { @@ -422,28 +446,28 @@ macro_rules! check_input { }; } -macro_rules! input_test{ - ($array:ident, $len:expr, $dist:ident) =>{ - { - std::env::set_var("LAMELLAR_BATCH_OP_SIZE","10"); - let world = lamellar::LamellarWorldBuilder::new().build(); - let num_pes = world.num_pes(); - let my_pe = world.my_pe(); - let array_total_len = $len; - - // let mut success = true; - let array: $array:: = $array::::new(world.team(), array_total_len, $dist).into(); //convert into abstract LamellarArray, distributed len is total_len - let init_val = num_pes; - initialize_array!($array, array, init_val); - let idxs = (my_pe..array.len()).step_by(num_pes).collect::>(); - let full_idxs = (0..array.len()).collect::>(); - let req = array.batch_compare_exchange(idxs,num_pes,my_pe); - check_input!(array,req); - let req = array.batch_compare_exchange(full_idxs,my_pe,my_pe); - check_input!(array,req,$array,num_pes,my_pe); - initialize_array!($array, array, init_val); - } - } +macro_rules! input_test { + ($array:ident, $len:expr, $dist:ident) => {{ + std::env::set_var("LAMELLAR_BATCH_OP_SIZE", "10"); + let world = lamellar::LamellarWorldBuilder::new().build(); + let num_pes = world.num_pes(); + let my_pe = world.my_pe(); + let array_total_len = $len; + + // let mut success = true; + let array: $array = $array::::new(world.team(), array_total_len, $dist) + .block() + .into(); //convert into abstract LamellarArray, distributed len is total_len + let init_val = num_pes; + initialize_array!($array, array, init_val); + let idxs = (my_pe..array.len()).step_by(num_pes).collect::>(); + let full_idxs = (0..array.len()).collect::>(); + let req = array.batch_compare_exchange(idxs, num_pes, my_pe); + check_input!(array, req); + let req = array.batch_compare_exchange(full_idxs, my_pe, my_pe); + check_input!(array, req, $array, num_pes, my_pe); + initialize_array!($array, array, init_val); + }}; } fn main() { diff --git a/tests/array/atomic_ops/load_store_test.rs b/tests/array/atomic_ops/load_store_test.rs index de4993eb..25a6960b 100644 --- a/tests/array/atomic_ops/load_store_test.rs +++ b/tests/array/atomic_ops/load_store_test.rs @@ -64,77 +64,112 @@ macro_rules! check_val { // }; // } -macro_rules! 
load_store_test{ - ($array:ident, $t:ty, $len:expr, $dist:ident) =>{ - { - let world = lamellar::LamellarWorldBuilder::new().build(); - let num_pes = world.num_pes(); - let my_pe = world.my_pe(); - let array_total_len = $len; - #[allow(unused_mut)] - let mut success = true; - let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).into(); //convert into abstract LamellarArray, distributed len is total_len - - let init_val =(num_pes as $t); - initialize_array!($array, array, init_val); - array.wait_all(); - array.barrier(); - for idx in 0..array.len(){ - if idx%num_pes == my_pe{ - #[allow(unused_unsafe)] - let _ = unsafe{array.store(idx,my_pe as $t).spawn()}; - } - } - array.wait_all(); - array.barrier(); - let mut reqs = vec![]; - for idx in 0..array.len(){ +macro_rules! load_store_test { + ($array:ident, $t:ty, $len:expr, $dist:ident) => {{ + let world = lamellar::LamellarWorldBuilder::new().build(); + let num_pes = world.num_pes(); + let my_pe = world.my_pe(); + let array_total_len = $len; + #[allow(unused_mut)] + let mut success = true; + let array: $array<$t> = $array::<$t>::new(world.team(), array_total_len, $dist) + .block() + .into(); //convert into abstract LamellarArray, distributed len is total_len + + let init_val = (num_pes as $t); + initialize_array!($array, array, init_val); + array.wait_all(); + array.barrier(); + for idx in 0..array.len() { + if idx % num_pes == my_pe { #[allow(unused_unsafe)] - reqs.push((unsafe{array.load(idx)},idx)); + let _ = unsafe { array.store(idx, my_pe as $t).spawn() }; } - for (req,idx) in reqs{ - let val = world.block_on(req); - let check_val = (idx%num_pes) as $t; - let val = val; - check_val!($array,val,check_val,success); - if !success{ - eprintln!("{:?} {:?} {:?}",idx,val,check_val); - } + } + array.wait_all(); + array.barrier(); + let mut reqs = vec![]; + for idx in 0..array.len() { + #[allow(unused_unsafe)] + reqs.push((unsafe { array.load(idx) }, idx)); + } + for (req, idx) in reqs { + let val = world.block_on(req); + let check_val = (idx % num_pes) as $t; + let val = val; + check_val!($array, val, check_val, success); + if !success { + eprintln!("{:?} {:?} {:?}", idx, val, check_val); } + } - array.barrier(); - initialize_array!($array, array, init_val); - array.wait_all(); - array.barrier(); + array.barrier(); + initialize_array!($array, array, init_val); + array.wait_all(); + array.barrier(); + + let half_len = array_total_len / 2; + let start_i = half_len / 2; + let end_i = start_i + half_len; + let sub_array = array.sub_array(start_i..end_i); + sub_array.barrier(); + for idx in 0..sub_array.len() { + if idx % num_pes == my_pe { + #[allow(unused_unsafe)] + let _ = unsafe { sub_array.store(idx, my_pe as $t).spawn() }; + } + } + sub_array.wait_all(); + sub_array.barrier(); + let mut reqs = vec![]; + for idx in 0..sub_array.len() { + #[allow(unused_unsafe)] + reqs.push((unsafe { sub_array.load(idx) }, idx)); + } + for (req, idx) in reqs { + let val = world.block_on(req); + let check_val = (idx % num_pes) as $t; + let val = val; + check_val!($array, val, check_val, success); + if !success { + eprintln!("{:?} {:?} {:?}", idx, val, check_val); + } + } + sub_array.barrier(); + initialize_array!($array, array, init_val); + sub_array.wait_all(); + sub_array.barrier(); - let half_len = array_total_len/2; - let start_i = half_len/2; - let end_i = start_i + half_len; + let pe_len = array_total_len / num_pes; + for pe in 0..num_pes { + let len = std::cmp::max(pe_len / 2, 1); + let start_i = (pe * pe_len) + len / 2; + let 
end_i = start_i + len; let sub_array = array.sub_array(start_i..end_i); sub_array.barrier(); - for idx in 0..sub_array.len(){ - if idx%num_pes == my_pe{ + for idx in 0..sub_array.len() { + if idx % num_pes == my_pe { #[allow(unused_unsafe)] - let _ = unsafe{sub_array.store(idx,my_pe as $t).spawn()}; + let _ = unsafe { sub_array.store(idx, my_pe as $t).spawn() }; } } sub_array.wait_all(); sub_array.barrier(); let mut reqs = vec![]; - for idx in 0..sub_array.len(){ + for idx in 0..sub_array.len() { #[allow(unused_unsafe)] - reqs.push((unsafe{sub_array.load(idx)},idx)); + reqs.push((unsafe { sub_array.load(idx) }, idx)); } - for (req,idx) in reqs{ - let val = world.block_on(req); - let check_val = (idx%num_pes) as $t; + for (req, idx) in reqs { + let val = world.block_on(req); + let check_val = (idx % num_pes) as $t; let val = val; - check_val!($array,val,check_val,success); - if !success{ - eprintln!("{:?} {:?} {:?}",idx,val,check_val); + check_val!($array, val, check_val, success); + if !success { + eprintln!("{:?} {:?} {:?}", idx, val, check_val); } } @@ -142,51 +177,12 @@ macro_rules! load_store_test{ initialize_array!($array, array, init_val); sub_array.wait_all(); sub_array.barrier(); + } - - - let pe_len = array_total_len/num_pes; - for pe in 0..num_pes{ - let len = std::cmp::max(pe_len/2,1); - let start_i = (pe*pe_len)+ len/2; - let end_i = start_i+len; - let sub_array = array.sub_array(start_i..end_i); - sub_array.barrier(); - for idx in 0..sub_array.len(){ - if idx%num_pes == my_pe{ - #[allow(unused_unsafe)] - let _ = unsafe{sub_array.store(idx,my_pe as $t).spawn()}; - } - } - sub_array.wait_all(); - sub_array.barrier(); - - let mut reqs = vec![]; - for idx in 0..sub_array.len(){ - #[allow(unused_unsafe)] - reqs.push((unsafe{sub_array.load(idx)},idx)); - } - for (req,idx) in reqs{ - let val = world.block_on(req); - let check_val = (idx%num_pes) as $t; - let val = val; - check_val!($array,val,check_val,success); - if !success{ - eprintln!("{:?} {:?} {:?}",idx,val,check_val); - } - } - - sub_array.barrier(); - initialize_array!($array, array, init_val); - sub_array.wait_all(); - sub_array.barrier(); - } - - if !success{ - eprintln!("failed"); - } + if !success { + eprintln!("failed"); } - } + }}; } fn main() { diff --git a/tests/array/atomic_ops/swap_test.rs b/tests/array/atomic_ops/swap_test.rs index 2e71e926..f1139fb0 100644 --- a/tests/array/atomic_ops/swap_test.rs +++ b/tests/array/atomic_ops/swap_test.rs @@ -64,7 +64,7 @@ macro_rules! swap{ let array_total_len = $len; #[allow(unused_mut)] let mut success = true; - let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).into(); //convert into abstract LamellarArray, distributed len is total_len + let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).block().into(); //convert into abstract LamellarArray, distributed len is total_len let init_val =(num_pes as $t); initialize_array!($array, array, init_val); diff --git a/tests/array/bitwise_ops/and_test.rs b/tests/array/bitwise_ops/and_test.rs index 82105b15..9ea07985 100644 --- a/tests/array/bitwise_ops/and_test.rs +++ b/tests/array/bitwise_ops/and_test.rs @@ -75,7 +75,7 @@ macro_rules! 
and_test{ let array_total_len = $len; #[allow(unused_mut)] let mut success = true; - let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).into(); //convert into abstract LamellarArray, distributed len is total_len + let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).block().into(); //convert into abstract LamellarArray, distributed len is total_len let init_val =!(0 as $t); let final_val = init_val << num_pes; diff --git a/tests/array/bitwise_ops/fetch_and_test.rs b/tests/array/bitwise_ops/fetch_and_test.rs index 25fcdce1..b4b7c42c 100644 --- a/tests/array/bitwise_ops/fetch_and_test.rs +++ b/tests/array/bitwise_ops/fetch_and_test.rs @@ -75,7 +75,7 @@ macro_rules! fetch_and_test{ let array_total_len = $len; #[allow(unused_mut)] let mut success = true; - let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).into(); //convert into abstract LamellarArray, distributed len is total_len + let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).block().into(); //convert into abstract LamellarArray, distributed len is total_len let init_val =!(0 as $t); let final_val = init_val << num_pes; diff --git a/tests/array/bitwise_ops/fetch_or_test.rs b/tests/array/bitwise_ops/fetch_or_test.rs index 22739189..9bd0f4ed 100644 --- a/tests/array/bitwise_ops/fetch_or_test.rs +++ b/tests/array/bitwise_ops/fetch_or_test.rs @@ -75,7 +75,7 @@ macro_rules! fetch_or_test{ let array_total_len = $len; #[allow(unused_mut)] let mut success = true; - let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).into(); //convert into abstract LamellarArray, distributed len is total_len + let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).block().into(); //convert into abstract LamellarArray, distributed len is total_len let init_val =0 as $t; let final_val = !(!init_val << num_pes); diff --git a/tests/array/bitwise_ops/fetch_xor_test.rs b/tests/array/bitwise_ops/fetch_xor_test.rs index 99e4a45f..5b8a44a4 100644 --- a/tests/array/bitwise_ops/fetch_xor_test.rs +++ b/tests/array/bitwise_ops/fetch_xor_test.rs @@ -75,7 +75,7 @@ macro_rules! fetch_xor_test{ let array_total_len = $len; #[allow(unused_mut)] let mut success = true; - let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).into(); //convert into abstract LamellarArray, distributed len is total_len + let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).block().into(); //convert into abstract LamellarArray, distributed len is total_len let init_val =0 as $t; let final_val = !(!init_val << num_pes); diff --git a/tests/array/bitwise_ops/or_test.rs b/tests/array/bitwise_ops/or_test.rs index 43c7bc01..bee5f8db 100644 --- a/tests/array/bitwise_ops/or_test.rs +++ b/tests/array/bitwise_ops/or_test.rs @@ -75,7 +75,7 @@ macro_rules! 
or_test{ let array_total_len = $len; #[allow(unused_mut)] let mut success = true; - let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).into(); //convert into abstract LamellarArray, distributed len is total_len + let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).block().into(); //convert into abstract LamellarArray, distributed len is total_len let init_val =0 as $t; let final_val = !(!init_val << num_pes); diff --git a/tests/array/bitwise_ops/xor_test.rs b/tests/array/bitwise_ops/xor_test.rs index 24754011..fb0c976e 100644 --- a/tests/array/bitwise_ops/xor_test.rs +++ b/tests/array/bitwise_ops/xor_test.rs @@ -73,7 +73,7 @@ macro_rules! xor_test{ let array_total_len = $len; #[allow(unused_mut)] let mut success = true; - let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).into(); //convert into abstract LamellarArray, distributed len is total_len + let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).block().into(); //convert into abstract LamellarArray, distributed len is total_len let init_val =0 as $t; let final_val = !(!init_val << num_pes); diff --git a/tests/array/local_only/clone.rs b/tests/array/local_only/clone.rs index d81115e2..ce8b8dc3 100644 --- a/tests/array/local_only/clone.rs +++ b/tests/array/local_only/clone.rs @@ -2,6 +2,6 @@ use lamellar::array::prelude::*; const ARRAY_LEN: usize = 100; fn main() { let world = lamellar::LamellarWorldBuilder::new().build(); - let mut block_array = LocalOnlyArray::::new(world.team(), ARRAY_LEN, Distribution::Block); + let mut block_array = LocalOnlyArray::::new(world.team(), ARRAY_LEN, Distribution::Block).block(); let mut cloned_block_array = block_array.clone(); } \ No newline at end of file diff --git a/tests/array/local_only/immutable_borrow.rs b/tests/array/local_only/immutable_borrow.rs index d76bc8d7..a4c53b33 100644 --- a/tests/array/local_only/immutable_borrow.rs +++ b/tests/array/local_only/immutable_borrow.rs @@ -2,7 +2,7 @@ use lamellar::array::prelude::*; const ARRAY_LEN: usize = 100; fn main() { let world = lamellar::LamellarWorldBuilder::new().build(); - let mut block_array = LocalOnlyArray::::new(world.team(), ARRAY_LEN, Distribution::Block); + let mut block_array = LocalOnlyArray::::new(world.team(), ARRAY_LEN, Distribution::Block).block(); let block_slice = block_array.as_slice(); let _mut_block_slice = block_array.as_mut_slice(); diff --git a/tests/array/rdma/blocking_get_test.rs b/tests/array/rdma/blocking_get_test.rs index 7d41d5e9..9fc83a93 100644 --- a/tests/array/rdma/blocking_get_test.rs +++ b/tests/array/rdma/blocking_get_test.rs @@ -125,9 +125,9 @@ macro_rules! 
blocking_get_test{ let mem_seg_len = array_total_len; let mut success = true; #[allow(unused_mut)] - let mut array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).into(); //convert into abstract LamellarArray, distributed len is total_len + let mut array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).block().into(); //convert into abstract LamellarArray, distributed len is total_len - let shared_mem_region: LamellarMemoryRegion<$t> = world.alloc_shared_mem_region(mem_seg_len).into(); //Convert into abstract LamellarMemoryRegion, each local segment is total_len + let shared_mem_region: LamellarMemoryRegion<$t> = world.alloc_shared_mem_region(mem_seg_len).block().unwrap().into(); //Convert into abstract LamellarMemoryRegion, each local segment is total_len //initialize array initialize_array!($array, array, $t); array.wait_all(); diff --git a/tests/array/rdma/get_test.rs b/tests/array/rdma/get_test.rs index e14dc414..d3608ec0 100644 --- a/tests/array/rdma/get_test.rs +++ b/tests/array/rdma/get_test.rs @@ -125,10 +125,10 @@ macro_rules! get_test{ #[allow(unused_mut)] let mut success = true; #[allow(unused_mut)] - let mut array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).into(); //convert into abstract LamellarArray, distributed len is total_len + let mut array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).block().into(); //convert into abstract LamellarArray, distributed len is total_len // println!("bout to initialize"); initialize_array!($array, array, $t); - let shared_mem_region: LamellarMemoryRegion<$t> = world.alloc_shared_mem_region(mem_seg_len).into(); //Convert into abstract LamellarMemoryRegion, each local segment is total_len + let shared_mem_region: LamellarMemoryRegion<$t> = world.alloc_shared_mem_region(mem_seg_len).block().unwrap().into(); //Convert into abstract LamellarMemoryRegion, each local segment is total_len //initialize array array.wait_all(); diff --git a/tests/array/rdma/put_test.rs b/tests/array/rdma/put_test.rs index c7c62e90..3aefb3bd 100644 --- a/tests/array/rdma/put_test.rs +++ b/tests/array/rdma/put_test.rs @@ -62,9 +62,9 @@ macro_rules! put_test{ let array_total_len = $len; let mem_seg_len = array_total_len; let mut success = true; - let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).into(); //convert into abstract LamellarArray, distributed len is total_len + let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).block().into(); //convert into abstract LamellarArray, distributed len is total_len - let shared_mem_region: LamellarMemoryRegion<$t> = world.alloc_shared_mem_region(mem_seg_len).into(); //Convert into abstract LamellarMemoryRegion, each local segment is total_len + let shared_mem_region: LamellarMemoryRegion<$t> = world.alloc_shared_mem_region(mem_seg_len).block().unwrap().into(); //Convert into abstract LamellarMemoryRegion, each local segment is total_len //initialize array let init_val = my_pe as $t; initialize_array!($array, array, init_val); diff --git a/tests/blocking_get.rs b/tests/blocking_get.rs index 63a04e75..60636b46 100644 --- a/tests/blocking_get.rs +++ b/tests/blocking_get.rs @@ -35,7 +35,7 @@ macro_rules! 
create_test { .arg("-T=4") .arg("./target/release/examples/blocking_get_test") .arg(stringify!($array)) - .arg($dist) + .arg($dist).block(); .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); diff --git a/tests/compare_exchange.rs b/tests/compare_exchange.rs index e8bbaf27..bed1c09e 100644 --- a/tests/compare_exchange.rs +++ b/tests/compare_exchange.rs @@ -16,7 +16,7 @@ macro_rules! create_test { .arg("-T=4") .arg("./target/release/examples/compare_exchange_test") .arg(stringify!($array)) - .arg($dist) + .arg($dist).block(); .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); diff --git a/tests/div.rs b/tests/div.rs index e436a16f..f397d026 100644 --- a/tests/div.rs +++ b/tests/div.rs @@ -35,7 +35,7 @@ macro_rules! create_test { .arg("-T=4") .arg("./target/release/examples/div_test") .arg(stringify!($array)) - .arg($dist) + .arg($dist).block(); .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); diff --git a/tests/fetch_add.rs b/tests/fetch_add.rs index 9bf729ff..678571c1 100644 --- a/tests/fetch_add.rs +++ b/tests/fetch_add.rs @@ -17,7 +17,7 @@ macro_rules! create_test { .arg("-T=4") .arg("./target/release/examples/fetch_add_test") .arg(stringify!($array)) - .arg($dist) + .arg($dist).block(); .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); @@ -39,7 +39,7 @@ macro_rules! create_test { .arg("--mpi=pmi2") .arg("./target/release/examples/fetch_add_test") .arg(stringify!($array)) - .arg($dist) + .arg($dist).block(); .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); diff --git a/tests/fetch_and.rs b/tests/fetch_and.rs index ac4ec00e..e773e402 100644 --- a/tests/fetch_and.rs +++ b/tests/fetch_and.rs @@ -16,7 +16,7 @@ macro_rules! create_test { .arg("-T=4") .arg("./target/release/examples/fetch_and_test") .arg(stringify!($array)) - .arg($dist) + .arg($dist).block(); .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); diff --git a/tests/fetch_div.rs b/tests/fetch_div.rs index 2aaaea76..8634d580 100644 --- a/tests/fetch_div.rs +++ b/tests/fetch_div.rs @@ -35,7 +35,7 @@ macro_rules! create_test { .arg("-T=4") .arg("./target/release/examples/fetch_div_test") .arg(stringify!($array)) - .arg($dist) + .arg($dist).block(); .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); diff --git a/tests/fetch_mul.rs b/tests/fetch_mul.rs index 3414d4b0..1922185b 100644 --- a/tests/fetch_mul.rs +++ b/tests/fetch_mul.rs @@ -35,7 +35,7 @@ macro_rules! create_test { .arg("-T=4") .arg("./target/release/examples/fetch_mul_test") .arg(stringify!($array)) - .arg($dist) + .arg($dist).block(); .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); diff --git a/tests/fetch_or.rs b/tests/fetch_or.rs index 61990a7f..c39ce267 100644 --- a/tests/fetch_or.rs +++ b/tests/fetch_or.rs @@ -16,7 +16,7 @@ macro_rules! create_test { .arg("-T=4") .arg("./target/release/examples/fetch_or_test") .arg(stringify!($array)) - .arg($dist) + .arg($dist).block(); .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); diff --git a/tests/fetch_rem.rs b/tests/fetch_rem.rs index 6b73b1a1..5d6794a4 100644 --- a/tests/fetch_rem.rs +++ b/tests/fetch_rem.rs @@ -35,7 +35,7 @@ macro_rules! create_test { .arg("-T=4") .arg("./target/release/examples/fetch_rem_test") .arg(stringify!($array)) - .arg($dist) + .arg($dist).block(); .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); diff --git a/tests/fetch_sub.rs b/tests/fetch_sub.rs index 72b8754f..814369ad 100644 --- a/tests/fetch_sub.rs +++ b/tests/fetch_sub.rs @@ -35,7 +35,7 @@ macro_rules! 
create_test { .arg("-T=4") .arg("./target/release/examples/fetch_sub_test") .arg(stringify!($array)) - .arg($dist) + .arg($dist).block(); .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); diff --git a/tests/fetch_xor.rs b/tests/fetch_xor.rs index bc935831..d4c40471 100644 --- a/tests/fetch_xor.rs +++ b/tests/fetch_xor.rs @@ -16,7 +16,7 @@ macro_rules! create_test { .arg("-T=4") .arg("./target/release/examples/fetch_xor_test") .arg(stringify!($array)) - .arg($dist) + .arg($dist).block(); .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); diff --git a/tests/get.rs b/tests/get.rs index 42042f22..6b96dd8d 100644 --- a/tests/get.rs +++ b/tests/get.rs @@ -16,7 +16,7 @@ macro_rules! create_test { .arg("-T=4") .arg("./target/release/examples/get_test") .arg(stringify!($array)) - .arg($dist) + .arg($dist).block(); .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); diff --git a/tests/load_store.rs b/tests/load_store.rs index 6ec8e0d6..252f65c6 100644 --- a/tests/load_store.rs +++ b/tests/load_store.rs @@ -16,7 +16,7 @@ macro_rules! create_test { .arg("-T=4") .arg("./target/release/examples/load_store_test") .arg(stringify!($array)) - .arg($dist) + .arg($dist).block(); .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); diff --git a/tests/mul.rs b/tests/mul.rs index 4a8bd5e7..a16af4fa 100644 --- a/tests/mul.rs +++ b/tests/mul.rs @@ -35,7 +35,7 @@ macro_rules! create_test { .arg("-T=4") .arg("./target/release/examples/mul_test") .arg(stringify!($array)) - .arg($dist) + .arg($dist).block(); .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); diff --git a/tests/or.rs b/tests/or.rs index 59226374..796ee59b 100644 --- a/tests/or.rs +++ b/tests/or.rs @@ -16,7 +16,7 @@ macro_rules! create_test { .arg("-T=4") .arg("./target/release/examples/or_test") .arg(stringify!($array)) - .arg($dist) + .arg($dist).block(); .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); diff --git a/tests/put.rs b/tests/put.rs index 5408561d..b67e4d99 100644 --- a/tests/put.rs +++ b/tests/put.rs @@ -35,7 +35,7 @@ macro_rules! create_test { .arg("-T=4") .arg("./target/release/examples/put_test") .arg(stringify!($array)) - .arg($dist) + .arg($dist).block(); .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); diff --git a/tests/rem.rs b/tests/rem.rs index 3867436d..9644494b 100644 --- a/tests/rem.rs +++ b/tests/rem.rs @@ -35,7 +35,7 @@ macro_rules! create_test { .arg("-T=4") .arg("./target/release/examples/rem_test") .arg(stringify!($array)) - .arg($dist) + .arg($dist).block(); .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); diff --git a/tests/sub.rs b/tests/sub.rs index 509f76a5..71777929 100644 --- a/tests/sub.rs +++ b/tests/sub.rs @@ -35,7 +35,7 @@ macro_rules! create_test { .arg("-T=4") .arg("./target/release/examples/sub_test") .arg(stringify!($array)) - .arg($dist) + .arg($dist).block(); .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); diff --git a/tests/swap.rs b/tests/swap.rs index e77160c6..e9db89d5 100644 --- a/tests/swap.rs +++ b/tests/swap.rs @@ -16,7 +16,7 @@ macro_rules! create_test { .arg("-T=4") .arg("./target/release/examples/swap_test") .arg(stringify!($array)) - .arg($dist) + .arg($dist).block(); .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); diff --git a/tests/xor.rs b/tests/xor.rs index 25bb66cc..758bae6c 100644 --- a/tests/xor.rs +++ b/tests/xor.rs @@ -16,7 +16,7 @@ macro_rules! 
create_test { .arg("-T=4") .arg("./target/release/examples/xor_test") .arg(stringify!($array)) - .arg($dist) + .arg($dist).block(); .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); From 387270039c4289b763246f27f0028044acf9f804 Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Thu, 7 Nov 2024 21:16:22 -0800 Subject: [PATCH 104/116] add a try_alloc api for memregions, fix examples and docs --- .../am_local_memregions.rs | 2 +- examples/array_examples/array_am.rs | 4 +- examples/array_examples/array_put_get.rs | 4 +- examples/array_examples/dist_array_reduce.rs | 2 +- examples/bandwidths/am_bw_get.rs | 6 +- examples/bandwidths/am_group_bw_get.rs | 6 +- examples/bandwidths/atomic_array_get_bw.rs | 2 +- examples/bandwidths/atomic_array_put_bw.rs | 2 +- examples/bandwidths/get_bw.rs | 5 +- .../global_lock_atomic_array_get_bw.rs | 2 +- .../global_lock_atomic_array_put_bw.rs | 2 +- .../local_lock_atomic_array_get_bw.rs | 2 +- .../local_lock_atomic_array_put_bw.rs | 2 +- examples/bandwidths/put_bw.rs | 5 +- examples/bandwidths/readonly_array_get_bw.rs | 2 +- .../readonly_array_get_unchecked_bw.rs | 2 +- examples/bandwidths/unsafe_array_get_bw.rs | 2 +- .../unsafe_array_get_unchecked_bw.rs | 2 +- examples/bandwidths/unsafe_array_put_bw.rs | 2 +- .../unsafe_array_put_unchecked_bw.rs | 2 +- examples/bandwidths/unsafe_array_store_bw.rs | 2 +- examples/kernels/am_gemm.rs | 13 +- examples/kernels/cached_am_gemm.rs | 10 +- examples/kernels/dft_proxy.rs | 15 +- examples/misc/ping_pong.rs | 15 +- examples/rdma_examples/rdma_am.rs | 8 +- examples/rdma_examples/rdma_get.rs | 4 +- examples/rdma_examples/rdma_put.rs | 4 +- src/array.rs | 143 +--------- src/array/atomic.rs | 6 +- src/array/atomic/handle.rs | 4 +- src/array/generic_atomic/handle.rs | 39 +-- src/array/generic_atomic/rdma.rs | 2 +- src/array/global_lock_atomic.rs | 4 +- src/array/global_lock_atomic/handle.rs | 2 +- src/array/global_lock_atomic/rdma.rs | 2 +- src/array/iterator/distributed_iterator.rs | 16 +- src/array/iterator/local_iterator.rs | 16 +- src/array/iterator/one_sided_iterator.rs | 2 +- .../iterator/one_sided_iterator/chunks.rs | 2 +- src/array/local_lock_atomic.rs | 4 +- src/array/local_lock_atomic/handle.rs | 2 +- src/array/local_lock_atomic/rdma.rs | 2 +- src/array/native_atomic/handle.rs | 40 +-- src/array/native_atomic/rdma.rs | 2 +- src/array/read_only.rs | 2 +- src/array/read_only/handle.rs | 2 +- src/array/unsafe.rs | 2 +- src/array/unsafe/handle.rs | 2 +- src/array/unsafe/rdma.rs | 10 +- src/darc.rs | 2 +- src/lamellar_team.rs | 85 +++--- src/lamellar_world.rs | 18 +- src/memregion.rs | 79 ++++-- src/memregion/handle.rs | 257 ++++++++++++++++++ src/memregion/shared.rs | 50 +++- tests/array/arithmetic_ops/add_test.rs | 4 +- tests/array/arithmetic_ops/fetch_add_test.rs | 4 +- tests/array/rdma/blocking_get_test.rs | 2 +- tests/array/rdma/get_test.rs | 2 +- tests/array/rdma/put_test.rs | 2 +- 61 files changed, 542 insertions(+), 398 deletions(-) create mode 100644 src/memregion/handle.rs diff --git a/examples/active_message_examples/am_local_memregions.rs b/examples/active_message_examples/am_local_memregions.rs index 1a2fa8cc..36e1a2bf 100644 --- a/examples/active_message_examples/am_local_memregions.rs +++ b/examples/active_message_examples/am_local_memregions.rs @@ -50,7 +50,7 @@ fn main() { let world = lamellar::LamellarWorldBuilder::new().build(); let my_pe = world.my_pe(); let num_pes = world.num_pes(); - let array = world.alloc_one_sided_mem_region::(10).expect("Enough memory should exist"); + let array = 
world.alloc_one_sided_mem_region::(10); let mut rng = rand::thread_rng(); let pes = Uniform::from(0..num_pes); diff --git a/examples/array_examples/array_am.rs b/examples/array_examples/array_am.rs index a9a0ceb1..ce9fec87 100644 --- a/examples/array_examples/array_am.rs +++ b/examples/array_examples/array_am.rs @@ -32,7 +32,7 @@ impl LamellarAM for RdmaAM { }); //get the original nodes data - let local = lamellar::world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); + let local = lamellar::world.alloc_one_sided_mem_region::(ARRAY_LEN); let local_slice = unsafe { local.as_mut_slice().unwrap() }; local_slice[ARRAY_LEN - 1] = num_pes as u8; unsafe { @@ -67,7 +67,7 @@ fn main() { println!("creating array"); let array = UnsafeArray::::new(world.team(), ARRAY_LEN, Distribution::Block).block(); println!("creating memregion"); - let local_mem_region = world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); + let local_mem_region = world.alloc_one_sided_mem_region::(ARRAY_LEN); println!("about to initialize array"); array.print(); if my_pe == 0 { diff --git a/examples/array_examples/array_put_get.rs b/examples/array_examples/array_put_get.rs index 44c62f05..b76ff8b2 100644 --- a/examples/array_examples/array_put_get.rs +++ b/examples/array_examples/array_put_get.rs @@ -34,11 +34,11 @@ fn main() { let shared_mem_region = world .alloc_shared_mem_region(total_len) .await - .expect("Enough memory should exist") + .into(); //Convert into abstract LamellarMemoryRegion let local_mem_region = world .alloc_one_sided_mem_region(total_len) - .expect("Enough memory should exist") + .into(); initialize_array(&block_array).await; initialize_array(&cyclic_array).await; diff --git a/examples/array_examples/dist_array_reduce.rs b/examples/array_examples/dist_array_reduce.rs index 277d8ceb..d028ba85 100644 --- a/examples/array_examples/dist_array_reduce.rs +++ b/examples/array_examples/dist_array_reduce.rs @@ -40,7 +40,7 @@ fn main() { UnsafeArray::::new(world.team(), total_len, Distribution::Block).block(); let cyclic_array = UnsafeArray::::new(world.team(), total_len, Distribution::Cyclic).block(); - let local_mem_region = world.alloc_one_sided_mem_region(total_len).expect("Enough memory should exist"); + let local_mem_region = world.alloc_one_sided_mem_region(total_len); world.barrier(); if my_pe == 0 { unsafe { diff --git a/examples/bandwidths/am_bw_get.rs b/examples/bandwidths/am_bw_get.rs index 5b3e599c..aa26857e 100644 --- a/examples/bandwidths/am_bw_get.rs +++ b/examples/bandwidths/am_bw_get.rs @@ -25,7 +25,7 @@ impl LamellarAM for DataAM { async fn exec(&self) { unsafe { // let local = lamellar::team.local_array::(self.length, 255u8); - let local = lamellar::team.alloc_one_sided_mem_region::(self.length).expect("Enough memory should exist"); + let local = lamellar::team.alloc_one_sided_mem_region::(self.length); let local_slice = local.as_mut_slice().unwrap(); local_slice[self.length - 1] = 255u8; self.array.get_unchecked(self.index, local.clone()); @@ -42,8 +42,8 @@ fn main() { let world = lamellar::LamellarWorldBuilder::new().build(); let my_pe = world.my_pe(); let num_pes = world.num_pes(); - let array = world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); - let data = world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); + let array = world.alloc_one_sided_mem_region::(ARRAY_LEN); + let data = world.alloc_one_sided_mem_region::(ARRAY_LEN); unsafe { for i in 
data.as_mut_slice().unwrap() { *i = my_pe as u8; diff --git a/examples/bandwidths/am_group_bw_get.rs b/examples/bandwidths/am_group_bw_get.rs index 3bc2e768..2bdc358e 100644 --- a/examples/bandwidths/am_group_bw_get.rs +++ b/examples/bandwidths/am_group_bw_get.rs @@ -25,7 +25,7 @@ impl LamellarAM for DataAM { async fn exec(&self) { unsafe { // let local = lamellar::team.local_array::(self.length, 255u8); - let local = lamellar::team.alloc_one_sided_mem_region::(self.length).expect("Enough memory should exist"); + let local = lamellar::team.alloc_one_sided_mem_region::(self.length); let local_slice = local.as_mut_slice().unwrap(); local_slice[self.length - 1] = 255u8; self.array.get_unchecked(self.index, local.clone()); @@ -42,8 +42,8 @@ fn main() { let world = lamellar::LamellarWorldBuilder::new().build(); let my_pe = world.my_pe(); let num_pes = world.num_pes(); - let array = world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); - let data = world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); + let array = world.alloc_one_sided_mem_region::(ARRAY_LEN); + let data = world.alloc_one_sided_mem_region::(ARRAY_LEN); unsafe { for i in data.as_mut_slice().unwrap() { *i = my_pe as u8; diff --git a/examples/bandwidths/atomic_array_get_bw.rs b/examples/bandwidths/atomic_array_get_bw.rs index 8b81978d..7120c97d 100644 --- a/examples/bandwidths/atomic_array_get_bw.rs +++ b/examples/bandwidths/atomic_array_get_bw.rs @@ -14,7 +14,7 @@ fn main() { let num_pes = world.num_pes(); let array: LocalLockArray = LocalLockArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block).block(); - let data = world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); + let data = world.alloc_one_sided_mem_region::(ARRAY_LEN); unsafe { for i in data.as_mut_slice().unwrap() { *i = my_pe as u8; diff --git a/examples/bandwidths/atomic_array_put_bw.rs b/examples/bandwidths/atomic_array_put_bw.rs index f43c539a..393dbee4 100644 --- a/examples/bandwidths/atomic_array_put_bw.rs +++ b/examples/bandwidths/atomic_array_put_bw.rs @@ -14,7 +14,7 @@ fn main() { let num_pes = world.num_pes(); let array: LocalLockArray = LocalLockArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block).block(); - let data = world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); + let data = world.alloc_one_sided_mem_region::(ARRAY_LEN); unsafe { for i in data.as_mut_slice().unwrap() { *i = my_pe as u8; diff --git a/examples/bandwidths/get_bw.rs b/examples/bandwidths/get_bw.rs index 50f08725..2cd427b7 100644 --- a/examples/bandwidths/get_bw.rs +++ b/examples/bandwidths/get_bw.rs @@ -14,9 +14,8 @@ fn main() { let num_pes = world.num_pes(); let mem_reg = world .alloc_shared_mem_region::(MEMREG_LEN) - .block() - .unwrap(); - let data = world.alloc_one_sided_mem_region::(MEMREG_LEN).expect("Enough memory should exist"); + .block(); + let data = world.alloc_one_sided_mem_region::(MEMREG_LEN); for j in 0..MEMREG_LEN as usize { unsafe { data.as_mut_slice().unwrap()[j] = my_pe as u8; diff --git a/examples/bandwidths/global_lock_atomic_array_get_bw.rs b/examples/bandwidths/global_lock_atomic_array_get_bw.rs index c4825af5..dde20842 100644 --- a/examples/bandwidths/global_lock_atomic_array_get_bw.rs +++ b/examples/bandwidths/global_lock_atomic_array_get_bw.rs @@ -15,7 +15,7 @@ fn main() { let array: GlobalLockArray = GlobalLockArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block).block(); - let data = 
world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); + let data = world.alloc_one_sided_mem_region::(ARRAY_LEN); unsafe { for i in data.as_mut_slice().unwrap() { *i = my_pe as u8; diff --git a/examples/bandwidths/global_lock_atomic_array_put_bw.rs b/examples/bandwidths/global_lock_atomic_array_put_bw.rs index d62cc46f..6f5d9718 100644 --- a/examples/bandwidths/global_lock_atomic_array_put_bw.rs +++ b/examples/bandwidths/global_lock_atomic_array_put_bw.rs @@ -15,7 +15,7 @@ fn main() { let num_pes = world.num_pes(); let array: GlobalLockArray = GlobalLockArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block).block(); - let data = world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); + let data = world.alloc_one_sided_mem_region::(ARRAY_LEN); unsafe { for i in data.as_mut_slice().unwrap() { *i = my_pe as u8; diff --git a/examples/bandwidths/local_lock_atomic_array_get_bw.rs b/examples/bandwidths/local_lock_atomic_array_get_bw.rs index 0616cd25..f7984d3c 100644 --- a/examples/bandwidths/local_lock_atomic_array_get_bw.rs +++ b/examples/bandwidths/local_lock_atomic_array_get_bw.rs @@ -15,7 +15,7 @@ fn main() { let array: LocalLockArray = LocalLockArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block).block(); - let data = world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); + let data = world.alloc_one_sided_mem_region::(ARRAY_LEN); unsafe { for i in data.as_mut_slice().unwrap() { *i = my_pe as u8; diff --git a/examples/bandwidths/local_lock_atomic_array_put_bw.rs b/examples/bandwidths/local_lock_atomic_array_put_bw.rs index 83463cc7..4c98d164 100644 --- a/examples/bandwidths/local_lock_atomic_array_put_bw.rs +++ b/examples/bandwidths/local_lock_atomic_array_put_bw.rs @@ -15,7 +15,7 @@ fn main() { let num_pes = world.num_pes(); let array: LocalLockArray = LocalLockArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block).block(); - let data = world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); + let data = world.alloc_one_sided_mem_region::(ARRAY_LEN); unsafe { for i in data.as_mut_slice().unwrap() { *i = my_pe as u8; diff --git a/examples/bandwidths/put_bw.rs b/examples/bandwidths/put_bw.rs index 4fc58b65..8cc2bed2 100644 --- a/examples/bandwidths/put_bw.rs +++ b/examples/bandwidths/put_bw.rs @@ -13,11 +13,10 @@ fn main() { let num_pes = world.num_pes(); let array = world .alloc_shared_mem_region::(ARRAY_LEN) - .block() - .unwrap(); + .block(); let data = world .alloc_one_sided_mem_region::(ARRAY_LEN) - .expect("Enough memory should exist"); + ; unsafe { for i in data.as_mut_slice().unwrap() { *i = my_pe as u8; diff --git a/examples/bandwidths/readonly_array_get_bw.rs b/examples/bandwidths/readonly_array_get_bw.rs index fc200b05..5055b9ed 100644 --- a/examples/bandwidths/readonly_array_get_bw.rs +++ b/examples/bandwidths/readonly_array_get_bw.rs @@ -14,7 +14,7 @@ fn main() { let my_pe = world.my_pe(); let num_pes = world.num_pes(); let array: UnsafeArray = UnsafeArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block).block(); - let data = world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); + let data = world.alloc_one_sided_mem_region::(ARRAY_LEN); unsafe { for i in data.as_mut_slice().unwrap() { *i = my_pe as u8; diff --git a/examples/bandwidths/readonly_array_get_unchecked_bw.rs b/examples/bandwidths/readonly_array_get_unchecked_bw.rs index f8a39053..76c27105 100644 --- 
a/examples/bandwidths/readonly_array_get_unchecked_bw.rs +++ b/examples/bandwidths/readonly_array_get_unchecked_bw.rs @@ -13,7 +13,7 @@ fn main() { let my_pe = world.my_pe(); let num_pes = world.num_pes(); let array: UnsafeArray = UnsafeArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block).block(); - let data = world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); + let data = world.alloc_one_sided_mem_region::(ARRAY_LEN); unsafe { for i in data.as_mut_slice().unwrap() { *i = my_pe as u8; diff --git a/examples/bandwidths/unsafe_array_get_bw.rs b/examples/bandwidths/unsafe_array_get_bw.rs index 7b99bb43..e925dcc0 100644 --- a/examples/bandwidths/unsafe_array_get_bw.rs +++ b/examples/bandwidths/unsafe_array_get_bw.rs @@ -14,7 +14,7 @@ fn main() { let my_pe = world.my_pe(); let num_pes = world.num_pes(); let array: UnsafeArray = UnsafeArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block).block(); - let data = world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); + let data = world.alloc_one_sided_mem_region::(ARRAY_LEN); unsafe { for i in data.as_mut_slice().unwrap() { *i = my_pe as u8; diff --git a/examples/bandwidths/unsafe_array_get_unchecked_bw.rs b/examples/bandwidths/unsafe_array_get_unchecked_bw.rs index 448160d0..cacf9910 100644 --- a/examples/bandwidths/unsafe_array_get_unchecked_bw.rs +++ b/examples/bandwidths/unsafe_array_get_unchecked_bw.rs @@ -13,7 +13,7 @@ fn main() { let my_pe = world.my_pe(); let num_pes = world.num_pes(); let array: UnsafeArray = UnsafeArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block).block(); - let data = world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); + let data = world.alloc_one_sided_mem_region::(ARRAY_LEN); unsafe { for i in data.as_mut_slice().unwrap() { *i = my_pe as u8; diff --git a/examples/bandwidths/unsafe_array_put_bw.rs b/examples/bandwidths/unsafe_array_put_bw.rs index b4cc0212..97fc60b3 100644 --- a/examples/bandwidths/unsafe_array_put_bw.rs +++ b/examples/bandwidths/unsafe_array_put_bw.rs @@ -13,7 +13,7 @@ fn main() { let my_pe = world.my_pe(); let num_pes = world.num_pes(); let array: UnsafeArray = UnsafeArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block).block(); - let data = world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); + let data = world.alloc_one_sided_mem_region::(ARRAY_LEN); unsafe { for i in data.as_mut_slice().unwrap() { *i = my_pe as u8; diff --git a/examples/bandwidths/unsafe_array_put_unchecked_bw.rs b/examples/bandwidths/unsafe_array_put_unchecked_bw.rs index c8d425f8..20735c8b 100644 --- a/examples/bandwidths/unsafe_array_put_unchecked_bw.rs +++ b/examples/bandwidths/unsafe_array_put_unchecked_bw.rs @@ -13,7 +13,7 @@ fn main() { let my_pe = world.my_pe(); let num_pes = world.num_pes(); let array: UnsafeArray = UnsafeArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block).block(); - let data = world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); + let data = world.alloc_one_sided_mem_region::(ARRAY_LEN); unsafe { for i in data.as_mut_slice().unwrap() { *i = my_pe as u8; diff --git a/examples/bandwidths/unsafe_array_store_bw.rs b/examples/bandwidths/unsafe_array_store_bw.rs index c6466855..0316c4fc 100644 --- a/examples/bandwidths/unsafe_array_store_bw.rs +++ b/examples/bandwidths/unsafe_array_store_bw.rs @@ -14,7 +14,7 @@ fn main() { let my_pe = world.my_pe(); let num_pes = world.num_pes(); let array: UnsafeArray = 
UnsafeArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block).block(); - let data = world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); + let data = world.alloc_one_sided_mem_region::(ARRAY_LEN); unsafe { for i in data.as_mut_slice().unwrap() { *i = my_pe as u8; diff --git a/examples/kernels/am_gemm.rs b/examples/kernels/am_gemm.rs index 61396fcd..021fda64 100644 --- a/examples/kernels/am_gemm.rs +++ b/examples/kernels/am_gemm.rs @@ -97,8 +97,8 @@ struct NaiveMM { #[lamellar::am] impl LamellarAM for NaiveMM { async fn exec() { - let a = lamellar::world.alloc_one_sided_mem_region(self.a.block_size * self.a.block_size).expect("Enough memory should exist"); //the tile for the A matrix - let b = lamellar::world.alloc_one_sided_mem_region(self.b.block_size * self.b.block_size).expect("Enough memory should exist"); //the tile for the B matrix + let a = lamellar::world.alloc_one_sided_mem_region(self.a.block_size * self.a.block_size); //the tile for the A matrix + let b = lamellar::world.alloc_one_sided_mem_region(self.b.block_size * self.b.block_size); //the tile for the B matrix let b_fut = get_sub_mat(&self.b, &b); //b is remote so we will launch "gets" for this data first let a_fut = get_sub_mat(&self.a, &a); let a_b_fut = future::join(a_fut, b_fut); @@ -164,16 +164,13 @@ fn main() { let a = world .alloc_shared_mem_region::((m * n) / num_pes) - .block() - .unwrap(); + .block(); let b = world .alloc_shared_mem_region::((n * p) / num_pes) - .block() - .unwrap(); + .block(); let c = world .alloc_shared_mem_region::((m * p) / num_pes) - .block() - .unwrap(); + .block(); unsafe { let mut cnt = (((m * n) / num_pes) * my_pe) as f32; for elem in a.as_mut_slice().unwrap() { diff --git a/examples/kernels/cached_am_gemm.rs b/examples/kernels/cached_am_gemm.rs index 90473df9..f784e82c 100644 --- a/examples/kernels/cached_am_gemm.rs +++ b/examples/kernels/cached_am_gemm.rs @@ -105,7 +105,7 @@ impl LamellarAM for MatMulAM { async fn exec() { let b = lamellar::world .alloc_one_sided_mem_region::(self.b.block_size * self.b.block_size) - .expect("enough memory exists"); + ; get_sub_mat(&self.b, &b).await; // we dont actually want to alloc a shared memory region as there is an implicit barrier here // introduces sync point and potential for deadlock @@ -122,7 +122,7 @@ impl LamellarAM for MatMulAM { c.row_block = row; let sub_a = lamellar::world .alloc_one_sided_mem_region::(a.block_size * a.block_size) - .expect("enough memory exists"); + ; get_sub_mat(&a, &sub_a).await; //this should be local copy so returns immediately do_gemm(&sub_a, &b, c, self.block_size); } @@ -179,15 +179,15 @@ fn main() { let a = world .alloc_shared_mem_region::((m * n) / num_pes) .block() - .expect("enough memory exists"); + ; let b = world .alloc_shared_mem_region::((n * p) / num_pes) .block() - .expect("enough memory exists"); + ; let c = world .alloc_shared_mem_region::((m * p) / num_pes) .block() - .expect("enough memory exists"); + ; // let c2 = world.alloc_shared_mem_region::((m * p) / num_pes); unsafe { let mut cnt = my_pe as f32 * ((m * n) / num_pes) as f32; diff --git a/examples/kernels/dft_proxy.rs b/examples/kernels/dft_proxy.rs index a3b1fb25..2f7d201d 100644 --- a/examples/kernels/dft_proxy.rs +++ b/examples/kernels/dft_proxy.rs @@ -145,8 +145,7 @@ fn dft_lamellar( let spectrum_slice = unsafe { spectrum.as_slice().unwrap() }; let add_spec = world .alloc_shared_mem_region::(spectrum_slice.len()) - .block() - .unwrap(); + .block(); let timer = Instant::now(); for pe in 0..num_pes { @@ 
-640,18 +639,18 @@ fn main() { let partial_sum = world .alloc_shared_mem_region::(num_pes) .block() - .expect("Enough memory should exist"); + ; let partial_spectrum = world .alloc_shared_mem_region::(array_len) .block() - .expect("Enough memory should exist"); + ; let partial_signal = world .alloc_shared_mem_region::(array_len) .block() - .expect("Enough memory should exist"); - let full_signal = world.alloc_one_sided_mem_region::(global_len).expect("Enough memory should exist"); - let full_spectrum = world.alloc_one_sided_mem_region::(global_len).expect("Enough memory should exist"); - let magic = world.alloc_one_sided_mem_region::(num_pes).expect("Enough memory should exist"); + ; + let full_signal = world.alloc_one_sided_mem_region::(global_len); + let full_spectrum = world.alloc_one_sided_mem_region::(global_len); + let magic = world.alloc_one_sided_mem_region::(num_pes); let full_spectrum_array = UnsafeArray::::new(world.team(), global_len, Distribution::Block).block(); diff --git a/examples/misc/ping_pong.rs b/examples/misc/ping_pong.rs index 320e3b5c..346867bc 100644 --- a/examples/misc/ping_pong.rs +++ b/examples/misc/ping_pong.rs @@ -328,28 +328,23 @@ fn main() { let indices = world .alloc_shared_mem_region::(UPDATES_PER_CORE * world.num_threads_per_pe()) - .block() - .unwrap(); + .block(); let index_send_buffers = world .alloc_shared_mem_region::(buffer_size * num_pes) - .block() - .unwrap(); + .block(); world.barrier(); let index_recv_buffers = world .alloc_shared_mem_region::(buffer_size * num_pes) - .block() - .unwrap(); + .block(); world.barrier(); let result_send_buffers = world .alloc_shared_mem_region::(buffer_size * num_pes) - .block() - .unwrap(); + .block(); world.barrier(); let result_recv_buffers = world .alloc_shared_mem_region::(buffer_size * num_pes) - .block() - .unwrap(); + .block(); world.barrier(); let mut rng: StdRng = SeedableRng::seed_from_u64(my_pe as u64); let table_size_per_pe = 100000 * world.num_threads_per_pe(); diff --git a/examples/rdma_examples/rdma_am.rs b/examples/rdma_examples/rdma_am.rs index 7463b2da..e35159f8 100644 --- a/examples/rdma_examples/rdma_am.rs +++ b/examples/rdma_examples/rdma_am.rs @@ -34,7 +34,7 @@ impl LamellarAM for RdmaAM { //get the original nodes data let local = lamellar::world .alloc_one_sided_mem_region::(ARRAY_LEN) - .expect("Enough memory should exist"); + ; let local_slice = unsafe { local.as_mut_slice().unwrap() }; local_slice[ARRAY_LEN - 1] = lamellar::num_pes as u8; unsafe { @@ -68,7 +68,7 @@ impl LamellarAM for RdmaLocalMRAM { ); //get the original nodes data - let local = lamellar::world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); + let local = lamellar::world.alloc_one_sided_mem_region::(ARRAY_LEN); let local_slice = unsafe { local.as_mut_slice().unwrap() }; local_slice[ARRAY_LEN - 1] = lamellar::num_pes as u8; unsafe { @@ -110,8 +110,8 @@ fn main() { let array = world .alloc_shared_mem_region::(ARRAY_LEN) .block() - .expect("Enough memory should exist"); - let local_array = world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); + ; + let local_array = world.alloc_one_sided_mem_region::(ARRAY_LEN); unsafe { for i in array.as_mut_slice().unwrap() { *i = 255_u8; diff --git a/examples/rdma_examples/rdma_get.rs b/examples/rdma_examples/rdma_get.rs index 26a27cea..17ced9e1 100644 --- a/examples/rdma_examples/rdma_get.rs +++ b/examples/rdma_examples/rdma_get.rs @@ -22,12 +22,12 @@ fn main() { let array = world .alloc_shared_mem_region::(ARRAY_LEN) .block() 
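For reference alongside these hunks, a minimal sketch (not part of the patch) of the allocation pattern the `+` lines converge on once the `expect`/`unwrap` calls are removed: collective shared-region allocation is driven to completion with `.block()`, while one-sided allocation returns the region directly. The prelude import path, `ARRAY_LEN`, and the surrounding `main` are illustrative assumptions, not taken from the patch itself.

```rust
// Sketch of the allocation pattern shown in the surrounding `+` lines.
// The import path is assumed; method names mirror the diff above.
use lamellar::memregion::prelude::*;

const ARRAY_LEN: usize = 1024; // placeholder length

fn main() {
    let world = lamellar::LamellarWorldBuilder::new().build();
    let my_pe = world.my_pe();

    // Collective allocation: every PE participates, so the handle is driven with .block()
    let shared = world.alloc_shared_mem_region::<u8>(ARRAY_LEN).block();

    // One-sided allocation: local to the calling PE, returned directly
    let local = world.alloc_one_sided_mem_region::<u8>(ARRAY_LEN);

    // Filling a region goes through as_mut_slice(), which is unsafe because
    // other PEs may concurrently access the memory via RDMA
    unsafe {
        for i in local.as_mut_slice().unwrap() {
            *i = my_pe as u8;
        }
        for i in shared.as_mut_slice().unwrap() {
            *i = 255u8;
        }
    }
    world.barrier();
}
```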
- .expect("Enough memory should exist"); + ; let array_slice = unsafe { array.as_slice().unwrap() }; //we can unwrap because we know array is local // instatiates a local array whos memory is registered with // the underlying network device, so that it can be used // as the src buffer in a put or as the dst buffer in a get - let data = world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); + let data = world.alloc_one_sided_mem_region::(ARRAY_LEN); let data_slice = unsafe { data.as_mut_slice().unwrap() }; //we can unwrap because we know data is local for elem in data_slice.iter_mut() { *elem = my_pe as u8; diff --git a/examples/rdma_examples/rdma_put.rs b/examples/rdma_examples/rdma_put.rs index 87e94ba8..fba7f55d 100644 --- a/examples/rdma_examples/rdma_put.rs +++ b/examples/rdma_examples/rdma_put.rs @@ -19,13 +19,13 @@ fn main() { if num_pes > 1 { // instatiates a shared memory region on every PE in world // all other pes can put/get into this region - let array = world.alloc_shared_mem_region::(ARRAY_LEN).block().expect("Enough memory should exist"); + let array = world.alloc_shared_mem_region::(ARRAY_LEN).block(); let array_slice = unsafe { array.as_slice().unwrap() }; //we can unwrap because we know array is local // instatiates a local array whos memory is registered with // the underlying network device, so that it can be used // as the src buffer in a put or as the dst buffer in a get - let data = world.alloc_one_sided_mem_region::(ARRAY_LEN).expect("Enough memory should exist"); + let data = world.alloc_one_sided_mem_region::(ARRAY_LEN); let data_slice = unsafe { data.as_mut_slice().unwrap() }; //we can unwrap because we know data is local for elem in data_slice { *elem = my_pe as u8; diff --git a/src/array.rs b/src/array.rs index 08cfda22..745015ec 100644 --- a/src/array.rs +++ b/src/array.rs @@ -194,6 +194,9 @@ crate::inventory::collect!(ReduceKey); // lamellar_impl::generate_reductions_for_type_rt!(true, u8, usize); // lamellar_impl::generate_ops_for_type_rt!(true, true, true, u8, usize); +// lamellar_impl::generate_reductions_for_type_rt!(true, isize); +// lamellar_impl::generate_ops_for_type_rt!(true, true, true, isize); + // lamellar_impl::generate_reductions_for_type_rt!(false, f32); // lamellar_impl::generate_ops_for_type_rt!(false, false, false, f32); @@ -302,7 +305,7 @@ impl LamellarRead for &[T] {} impl TeamFrom<&T> for LamellarArrayRdmaInput { /// Constructs a single element [OneSidedMemoryRegion] and copies `val` into it fn team_from(val: &T, team: &Pin>) -> Self { - let buf: OneSidedMemoryRegion = team.alloc_one_sided_mem_region_or_panic(1); + let buf: OneSidedMemoryRegion = team.alloc_one_sided_mem_region(1); unsafe { buf.as_mut_slice().expect("Data should exist on PE")[0] = val.clone(); } @@ -313,7 +316,7 @@ impl TeamFrom<&T> for LamellarArrayRdmaInput { impl TeamFrom for LamellarArrayRdmaInput { /// Constructs a single element [OneSidedMemoryRegion] and copies `val` into it fn team_from(val: T, team: &Pin>) -> Self { - let buf: OneSidedMemoryRegion = team.alloc_one_sided_mem_region_or_panic(1); + let buf: OneSidedMemoryRegion = team.alloc_one_sided_mem_region(1); unsafe { buf.as_mut_slice().expect("Data should exist on PE")[0] = val; } @@ -324,7 +327,7 @@ impl TeamFrom for LamellarArrayRdmaInput { impl TeamFrom> for LamellarArrayRdmaInput { /// Constructs a [OneSidedMemoryRegion] equal in length to `vals` and copies `vals` into it fn team_from(vals: Vec, team: &Pin>) -> Self { - let buf: OneSidedMemoryRegion = 
team.alloc_one_sided_mem_region_or_panic(vals.len()); + let buf: OneSidedMemoryRegion = team.alloc_one_sided_mem_region(vals.len()); unsafe { std::ptr::copy_nonoverlapping( vals.as_ptr(), @@ -338,7 +341,7 @@ impl TeamFrom> for LamellarArrayRdmaInput { impl TeamFrom<&Vec> for LamellarArrayRdmaInput { /// Constructs a [OneSidedMemoryRegion] equal in length to `vals` and copies `vals` into it fn team_from(vals: &Vec, team: &Pin>) -> Self { - let buf: OneSidedMemoryRegion = team.alloc_one_sided_mem_region_or_panic(vals.len()); + let buf: OneSidedMemoryRegion = team.alloc_one_sided_mem_region(vals.len()); unsafe { std::ptr::copy_nonoverlapping( vals.as_ptr(), @@ -352,7 +355,7 @@ impl TeamFrom<&Vec> for LamellarArrayRdmaInput { impl TeamFrom<&[T]> for LamellarArrayRdmaInput { /// Constructs a [OneSidedMemoryRegion] equal in length to `vals` and copies `vals` into it fn team_from(vals: &[T], team: &Pin>) -> Self { - let buf: OneSidedMemoryRegion = team.alloc_one_sided_mem_region_or_panic(vals.len()); + let buf: OneSidedMemoryRegion = team.alloc_one_sided_mem_region(vals.len()); unsafe { std::ptr::copy_nonoverlapping( vals.as_ptr(), @@ -1051,38 +1054,6 @@ pub trait LamellarArray: private::LamellarArrayPrivate + ActiveMessa ///``` fn team_rt(&self) -> Pin>; //todo turn this into Arc - // #[doc(alias("One-sided", "onesided"))] - // /// Return the current PE of the calling thread - // /// - // /// # One-sided Operation - // /// the result is returned only on the calling PE - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic).block(); - // /// - // /// assert_eq!(world.my_pe(),array.my_pe()); - // ///``` - // fn my_pe(&self) -> usize; - - // #[doc(alias("One-sided", "onesided"))] - // /// Return the number of PEs containing data for this array - // /// - // /// # One-sided Operation - // /// the result is returned only on the calling PE - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic).block(); - // /// - // /// assert_eq!(world.num_pes(),array.num_pes()); - // ///``` - // fn num_pes(&self) -> usize; - #[doc(alias("One-sided", "onesided"))] /// Return the total number of elements in this array /// @@ -1110,84 +1081,12 @@ pub trait LamellarArray: private::LamellarArrayPrivate + ActiveMessa ///```no_run //assert is for 4 PEs /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic).block(); + /// let array = ReadOnlyArray::::new(&world,100,Distribution::Cyclic).block(); /// /// assert_eq!(25,array.num_elems_local()); ///``` fn num_elems_local(&self) -> usize; - /// Change the distribution this array handle uses to index into the data of the array. - /// - /// This is a one-sided call and does not redistribute the actual data, it simply changes how the array is indexed for this particular handle. - /// - /// # Examples - ///``` - /// use lamellar::array::prelude::*; - /// let world = LamellarWorldBuilder::new().build(); - /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic).block(); - /// // do something interesting... 
or not - /// let block_view = array.clone().use_distribution(Distribution::Block).block(); - ///``` - // fn use_distribution(self, distribution: Distribution) -> Self; - - // #[doc(alias = "Collective")] - // /// Global synchronization method which blocks calling thread until all PEs in the owning Array data have entered the barrier - // /// - // /// # Collective Operation - // /// Requires all PEs associated with the array to enter the barrier, otherwise deadlock will occur - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic).block(); - // /// - // /// array.barrier(); - // ///``` - // fn barrier(&self); - - // #[doc(alias("One-sided", "onesided"))] - // /// blocks calling thread until all remote tasks (e.g. element wise operations) - // /// initiated by the calling PE have completed. - // /// - // /// # One-sided Operation - // /// this is not a distributed synchronization primitive (i.e. it has no knowledge of a Remote PEs tasks), the calling thread will only wait for tasks - // /// to finish that were initiated by the calling PE itself - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).block(); - // /// - // /// for i in 0..100{ - // /// array.add(i,1); - // /// } - // /// array.wait_all(); //block until the previous add operations have finished - // ///``` - // fn wait_all(&self); - - // #[doc(alias("One-sided", "onesided"))] - // /// Run a future to completion on the current thread - // /// - // /// This function will block the caller until the given future has completed, the future is executed within the Lamellar threadpool - // /// - // /// Users can await any future, including those returned from lamellar remote operations - // /// - // /// # One-sided Operation - // /// this is not a distributed synchronization primitive and only blocks the calling thread until the given future has completed on the calling PE - // /// - // /// # Examples - // ///``` - // /// use lamellar::array::prelude::*; - // /// let world = LamellarWorldBuilder::new().build(); - // /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).block(); - // /// - // /// let request = array.fetch_add(10,1000); //fetch index 10 and add 1000 to it - // /// let result = array.block_on(request); //block until am has executed - // /// // we also could have used world.block_on() or team.block_on() - // ///``` - // fn block_on(&self, f: F) -> F::Output; #[doc(alias("One-sided", "onesided"))] /// Given a global index, calculate the PE and offset on that PE where the element actually resides. 
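As a worked illustration of the global-index-to-PE mapping described in the doc comment above: the arithmetic below is illustrative only (it is not the library's internal routine) and assumes the array length divides evenly across PEs, covering the two distributions used throughout these examples.

```rust
// Illustrative arithmetic only; the real routine may handle uneven
// distributions and subarrays differently.
fn pe_and_offset_cyclic(global_idx: usize, num_pes: usize) -> (usize, usize) {
    // Cyclic: elements are dealt round-robin, one at a time, across PEs.
    (global_idx % num_pes, global_idx / num_pes)
}

fn pe_and_offset_block(global_idx: usize, len: usize, num_pes: usize) -> (usize, usize) {
    // Block: each PE owns one contiguous chunk (assumes len % num_pes == 0).
    let per_pe = len / num_pes;
    (global_idx / per_pe, global_idx % per_pe)
}

fn main() {
    // 100 elements over 4 PEs, matching the surrounding doc examples.
    assert_eq!(pe_and_offset_cyclic(15, 4), (3, 3));
    assert_eq!(pe_and_offset_block(15, 100, 4), (0, 15));
}
```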
@@ -1307,30 +1206,6 @@ pub trait LamellarArray: private::LamellarArrayPrivate + ActiveMessa /// assert_eq!(index , 15); ///``` fn last_global_index_for_pe(&self, pe: usize) -> Option; - - // /// Returns a distributed iterator for the LamellarArray - // /// must be called accross all pes containing data in the array - // /// iteration on a pe only occurs on the data which is locally present - // /// with all pes iterating concurrently - // /// blocking: true - // pub fn dist_iter(&self) -> DistIter<'static, T>; - - // /// Returns a distributed iterator for the LamellarArray - // /// must be called accross all pes containing data in the array - // /// iteration on a pe only occurs on the data which is locally present - // /// with all pes iterating concurrently - // pub fn dist_iter_mut(&self) -> DistIterMut<'static, T>; - - // /// Returns an iterator for the LamellarArray, all iteration occurs on the PE - // /// where this was called, data that is not local to the PE is automatically - // /// copied and transferred - // pub fn onesided_iter(&self) -> OneSidedIter<'_, T> ; - - // /// Returns an iterator for the LamellarArray, all iteration occurs on the PE - // /// where this was called, data that is not local to the PE is automatically - // /// copied and transferred, array data is buffered to more efficiently make - // /// use of network buffers - // pub fn buffered_onesided_iter(&self, buf_size: usize) -> OneSidedIter<'_, T> ; } /// Sub arrays are contiguous subsets of the elements of an array. diff --git a/src/array/atomic.rs b/src/array/atomic.rs index 2e6ae755..02ddd895 100644 --- a/src/array/atomic.rs +++ b/src/array/atomic.rs @@ -904,9 +904,9 @@ impl AtomicArray { ///``` /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).block(); + /// let array = AtomicArray::::new(&world,100,Distribution::Cyclic).block(); /// // do something interesting... 
or not - /// let block_view = array.clone().use_distribution(Distribution::Block).block(); + /// let block_view = array.clone().use_distribution(Distribution::Block); ///``` pub fn use_distribution(self, distribution: Distribution) -> Self { match self { @@ -928,7 +928,7 @@ impl AtomicArray { /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); /// let my_pe = world.my_pe(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).block(); + /// let array = AtomicArray::::new(&world,100,Distribution::Cyclic).block(); /// /// let local_data = array.local_data(); /// println!("PE{my_pe} local_data[0]: {:?}",local_data.at(0).load()); diff --git a/src/array/atomic/handle.rs b/src/array/atomic/handle.rs index d6526569..3a2790e9 100644 --- a/src/array/atomic/handle.rs +++ b/src/array/atomic/handle.rs @@ -69,7 +69,7 @@ impl AtomicArrayHandle { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).block(); + /// let array = AtomicArray::::new(&world,100,Distribution::Cyclic).block(); pub fn block(mut self) -> AtomicArray { self.launched = true; self.inner.set_launched(true); @@ -90,7 +90,7 @@ impl AtomicArrayHandle { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array_task: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).spawn(); + /// let array_task = AtomicArray::::new(&world,100,Distribution::Cyclic).spawn(); /// // do some other work /// let array = array_task.block(); #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] diff --git a/src/array/generic_atomic/handle.rs b/src/array/generic_atomic/handle.rs index 8502bdcb..f451d0b6 100644 --- a/src/array/generic_atomic/handle.rs +++ b/src/array/generic_atomic/handle.rs @@ -13,21 +13,6 @@ use pin_project::{pin_project, pinned_drop}; #[must_use = " GenericAtomicArray 'new' handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] #[pin_project(PinnedDrop)] #[doc(alias = "Collective")] -/// This is a handle representing the operation of creating a new [GenericAtomicArray]. -/// This handled must either be awaited in an async context or blocked on in a non-async context for the operation to be performed. -/// Awaiting/blocking on the handle is a blocking collective call amongst all PEs in the GenericAtomicArray's team, only returning once every PE in the team has completed the call. -/// -/// # Collective Operation -/// Requires all PEs associated with the `GenericAtomicArray` to await/block the handle otherwise deadlock will occur (i.e. 
team barriers are being called internally) -/// -/// # Examples -/// ``` -/// use lamellar::array::prelude::*; -/// -/// let world = LamellarWorldBuilder::new().build(); -/// -/// let array: GenericAtomicArray = GenericAtomicArray::new(&world,100,Distribution::Cyclic).block(); -/// ``` pub(crate) struct GenericAtomicArrayHandle { pub(crate) team: Pin>, pub(crate) launched: bool, @@ -45,15 +30,7 @@ impl PinnedDrop for GenericAtomicArrayHandle { } impl GenericAtomicArrayHandle { - /// Used to drive creation of a new GenericAtomicArray - /// # Examples - /// - ///``` - /// use lamellar::array::prelude::*; - /// - /// let world = LamellarWorldBuilder::new().build(); - /// let array: GenericAtomicArray = GenericAtomicArray::new(&world,100,Distribution::Cyclic).block(); - pub fn block(mut self) -> GenericAtomicArray { + pub(crate) fn block(mut self) -> GenericAtomicArray { self.launched = true; RuntimeWarning::BlockingCall( "GenericAtomicArrayHandle::block", @@ -63,20 +40,8 @@ impl GenericAtomicArrayHandle { self.team.clone().block_on(self) } - /// This method will spawn the creation of the GenericAtomicArray on the work queue - /// - /// This function returns a handle that can be used to wait for the operation to complete - /// /// # Examples - /// - ///``` - /// use lamellar::array::prelude::*; - /// - /// let world = LamellarWorldBuilder::new().build(); - /// let array_task: GenericAtomicArray = GenericAtomicArray::new(&world,100,Distribution::Cyclic).spawn(); - /// // do some other work - /// let array = array_task.block(); #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] - pub fn spawn(mut self) -> LamellarTask> { + pub(crate) fn spawn(mut self) -> LamellarTask> { self.launched = true; self.team.clone().spawn(self) } diff --git a/src/array/generic_atomic/rdma.rs b/src/array/generic_atomic/rdma.rs index d4155b4e..1243e754 100644 --- a/src/array/generic_atomic/rdma.rs +++ b/src/array/generic_atomic/rdma.rs @@ -24,7 +24,7 @@ impl LamellarArrayInternalGet for GenericAtomicArray { } } unsafe fn internal_at(&self, index: usize) -> ArrayRdmaAtHandle { - let buf: OneSidedMemoryRegion = self.array.team_rt().alloc_one_sided_mem_region_or_panic(1); + let buf: OneSidedMemoryRegion = self.array.team_rt().alloc_one_sided_mem_region(1); let req = self.exec_am_local(InitGetAm { array: self.clone(), index: index, diff --git a/src/array/global_lock_atomic.rs b/src/array/global_lock_atomic.rs index 555a1dac..d03208de 100644 --- a/src/array/global_lock_atomic.rs +++ b/src/array/global_lock_atomic.rs @@ -358,9 +358,9 @@ impl GlobalLockArray { ///``` /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); - /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic).block(); + /// let array = GlobalLockArray::::new(&world,100,Distribution::Cyclic).block(); /// // do something interesting... 
or not - /// let block_view = array.clone().use_distribution(Distribution::Block).block(); + /// let block_view = array.clone().use_distribution(Distribution::Block); ///``` pub fn use_distribution(self, distribution: Distribution) -> Self { GlobalLockArray { diff --git a/src/array/global_lock_atomic/handle.rs b/src/array/global_lock_atomic/handle.rs index db471451..573e279e 100644 --- a/src/array/global_lock_atomic/handle.rs +++ b/src/array/global_lock_atomic/handle.rs @@ -80,7 +80,7 @@ impl GlobalLockArrayHandle { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array_task: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic).spawn(); + /// let array_task = GlobalLockArray::::new(&world,100,Distribution::Cyclic).spawn(); /// // do some other work /// let array = array_task.block(); #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] diff --git a/src/array/global_lock_atomic/rdma.rs b/src/array/global_lock_atomic/rdma.rs index 6e224233..dddbefcc 100644 --- a/src/array/global_lock_atomic/rdma.rs +++ b/src/array/global_lock_atomic/rdma.rs @@ -34,7 +34,7 @@ impl LamellarArrayInternalGet for GlobalLockArray { } } unsafe fn internal_at(&self, index: usize) -> ArrayRdmaAtHandle { - let buf: OneSidedMemoryRegion = self.array.team_rt().alloc_one_sided_mem_region_or_panic(1); + let buf: OneSidedMemoryRegion = self.array.team_rt().alloc_one_sided_mem_region(1); let req = self.exec_am_local_tg(InitGetAm { array: self.clone(), index: index, diff --git a/src/array/iterator/distributed_iterator.rs b/src/array/iterator/distributed_iterator.rs index 82abb3f7..b14526db 100644 --- a/src/array/iterator/distributed_iterator.rs +++ b/src/array/iterator/distributed_iterator.rs @@ -512,12 +512,12 @@ pub trait DistributedIterator: SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block).block(); + /// let array = ReadOnlyArray::::new(&world,100,Distribution::Block).block(); /// /// let req = array.dist_iter() /// .map(|elem| *elem) //because of constraints of collect we need to convert from &usize to usize /// .filter(|elem| *elem < 10) // (if we didnt do the previous map we would have needed to do **elem) - /// .collect::>(Distribution::Block).block(); + /// .collect::>(Distribution::Block); /// let new_array = array.block_on(req); //wait on the collect request to get the new array ///``` #[must_use = "this iteration adapter is lazy and does nothing unless awaited. 
Either await the returned future, or call 'spawn()' or 'block()' on it "] @@ -547,12 +547,12 @@ pub trait DistributedIterator: SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Block).block(); + /// let array = ReadOnlyArray::::new(&world,100,Distribution::Block).block(); /// /// let req = array.dist_iter() /// .map(|elem| *elem) //because of constraints of collect we need to convert from &usize to usize /// .filter(|elem| * elem < 10) // (if we didnt do the previous map we would have needed to do **elem) - /// .collect::>(Distribution::Block).block(); + /// .collect::>(Distribution::Block); /// let new_array = array.block_on(req); //wait on the collect request to get the new array ///``` #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] @@ -590,7 +590,7 @@ pub trait DistributedIterator: SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// // initialize a world and an atomic array /// let world = LamellarWorldBuilder::new().build(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block).block(); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// // clone the array; this doesn't duplicate the underlying /// // data but it does create a second pointer that we can @@ -603,7 +603,7 @@ pub trait DistributedIterator: SyncSend + InnerIter + 'static { /// move |elem| /// array_clone /// .fetch_add(elem.load(),1000)) - /// .collect_async::,_>(Distribution::Cyclic).block(); + /// .collect_async::,_>(Distribution::Cyclic); /// let _new_array = array.block_on(req); ///``` #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it "] @@ -638,7 +638,7 @@ pub trait DistributedIterator: SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// // initialize a world and an atomic array /// let world = LamellarWorldBuilder::new().build(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block).block(); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// // clone the array; this doesn't duplicate the underlying /// // data but it does create a second pointer that we can @@ -651,7 +651,7 @@ pub trait DistributedIterator: SyncSend + InnerIter + 'static { /// move |elem| /// array_clone /// .fetch_add(elem.load(),1000)) - /// .collect_async_with_schedule::,_>(Schedule::Dynamic, Distribution::Cyclic).block(); + /// .collect_async_with_schedule::,_>(Schedule::Dynamic, Distribution::Cyclic); /// let _new_array = array.block_on(req); ///``` #[must_use = "this iteration adapter is lazy and does nothing unless awaited. 
Either await the returned future, or call 'spawn()' or 'block()' on it "] diff --git a/src/array/iterator/local_iterator.rs b/src/array/iterator/local_iterator.rs index cfa84852..1cc6194b 100644 --- a/src/array/iterator/local_iterator.rs +++ b/src/array/iterator/local_iterator.rs @@ -493,10 +493,10 @@ pub trait LocalIterator: SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block).block(); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let array_clone = array.clone(); - /// let req = array.local_iter().map(|elem|elem.load()).filter(|elem| elem % 2 == 0).collect::>(Distribution::Cyclic).block(); + /// let req = array.local_iter().map(|elem|elem.load()).filter(|elem| elem % 2 == 0).collect::>(Distribution::Cyclic); /// let new_array = array.block_on(req); ///``` #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] @@ -519,10 +519,10 @@ pub trait LocalIterator: SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block).block(); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// let array_clone = array.clone(); - /// let req = array.local_iter().map(|elem|elem.load()).filter(|elem| elem % 2 == 0).collect_with_schedule::>(Schedule::WorkStealing,Distribution::Cyclic).block(); + /// let req = array.local_iter().map(|elem|elem.load()).filter(|elem| elem % 2 == 0).collect_with_schedule::>(Schedule::WorkStealing,Distribution::Cyclic); /// let new_array = array.block_on(req); ///``` #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] @@ -560,7 +560,7 @@ pub trait LocalIterator: SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// // initialize a world and an atomic array /// let world = LamellarWorldBuilder::new().build(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block).block(); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// // clone the array; this doesn't duplicate the underlying /// // data but it does create a second pointer that we can @@ -573,7 +573,7 @@ pub trait LocalIterator: SyncSend + InnerIter + 'static { /// move |elem| /// array_clone /// .fetch_add(elem.load(),1000)) - /// .collect_async::,_>(Distribution::Cyclic).block(); + /// .collect_async::,_>(Distribution::Cyclic); /// let _new_array = array.block_on(req); ///``` #[must_use = "this iteration adapter is lazy and does nothing unless awaited. 
Either await the returned future, or call 'spawn()' or 'block()' on it."] @@ -608,7 +608,7 @@ pub trait LocalIterator: SyncSend + InnerIter + 'static { /// use lamellar::array::prelude::*; /// // initialize a world and an atomic array /// let world = LamellarWorldBuilder::new().build(); - /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Block).block(); + /// let array = AtomicArray::::new(&world,100,Distribution::Block).block(); /// /// // clone the array; this doesn't duplicate the underlying /// // data but it does create a second pointer that we can @@ -621,7 +621,7 @@ pub trait LocalIterator: SyncSend + InnerIter + 'static { /// move |elem| /// array_clone /// .fetch_add(elem.load(),1000)) - /// .collect_async_with_schedule::,_>(Schedule::Dynamic, Distribution::Cyclic).block(); + /// .collect_async_with_schedule::,_>(Schedule::Dynamic, Distribution::Cyclic); /// let _new_array = array.block_on(req); ///``` #[must_use = "this iteration adapter is lazy and does nothing unless awaited. Either await the returned future, or call 'spawn()' or 'block()' on it."] diff --git a/src/array/iterator/one_sided_iterator.rs b/src/array/iterator/one_sided_iterator.rs index 14ada186..264416ac 100644 --- a/src/array/iterator/one_sided_iterator.rs +++ b/src/array/iterator/one_sided_iterator.rs @@ -430,7 +430,7 @@ impl<'a, T: Dist + 'static, A: LamellarArrayInternalGet> OneSidedIter<'a, T, team: Pin>, buf_size: usize, ) -> OneSidedIter<'a, T, A> { - let buf_0 = team.alloc_one_sided_mem_region_or_panic(buf_size); + let buf_0 = team.alloc_one_sided_mem_region(buf_size); // potentially unsafe depending on the array type (i.e. UnsafeArray - which requries unsafe to construct an iterator), // but safe with respect to the buf_0 as self is the only reference diff --git a/src/array/iterator/one_sided_iterator/chunks.rs b/src/array/iterator/one_sided_iterator/chunks.rs index 2beadc80..40f53660 100644 --- a/src/array/iterator/one_sided_iterator/chunks.rs +++ b/src/array/iterator/one_sided_iterator/chunks.rs @@ -50,7 +50,7 @@ where ) -> (OneSidedMemoryRegion, ArrayRdmaHandle) { // println!(" get chunk of len: {:?}", size); let mem_region: OneSidedMemoryRegion = - array.team_rt().alloc_one_sided_mem_region_or_panic(size); + array.team_rt().alloc_one_sided_mem_region(size); // potentially unsafe depending on the array type (i.e. UnsafeArray - which requries unsafe to construct an iterator), // but safe with respect to the mem_region as this is the only reference let mut req = unsafe { array.internal_get(index, &mem_region) }; diff --git a/src/array/local_lock_atomic.rs b/src/array/local_lock_atomic.rs index a2d684cc..7f418f52 100644 --- a/src/array/local_lock_atomic.rs +++ b/src/array/local_lock_atomic.rs @@ -358,9 +358,9 @@ impl LocalLockArray { ///``` /// use lamellar::array::prelude::*; /// let world = LamellarWorldBuilder::new().build(); - /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic).block(); + /// let array = LocalLockArray::::new(&world,100,Distribution::Cyclic).block(); /// // do something interesting... 
or not - /// let block_view = array.clone().use_distribution(Distribution::Block).block(); + /// let block_view = array.clone().use_distribution(Distribution::Block); ///``` pub fn use_distribution(self, distribution: Distribution) -> Self { LocalLockArray { diff --git a/src/array/local_lock_atomic/handle.rs b/src/array/local_lock_atomic/handle.rs index 3122cbbd..582b61d2 100644 --- a/src/array/local_lock_atomic/handle.rs +++ b/src/array/local_lock_atomic/handle.rs @@ -78,7 +78,7 @@ impl LocalLockArrayHandle { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array_task: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic).spawn(); + /// let array_task = LocalLockArray::::new(&world,100,Distribution::Cyclic).spawn(); /// // do some other work /// let array = array_task.block(); #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] diff --git a/src/array/local_lock_atomic/rdma.rs b/src/array/local_lock_atomic/rdma.rs index 319291dd..3b3cf9d1 100644 --- a/src/array/local_lock_atomic/rdma.rs +++ b/src/array/local_lock_atomic/rdma.rs @@ -27,7 +27,7 @@ impl LamellarArrayInternalGet for LocalLockArray { } } unsafe fn internal_at(&self, index: usize) -> ArrayRdmaAtHandle { - let buf: OneSidedMemoryRegion = self.array.team_rt().alloc_one_sided_mem_region_or_panic(1); + let buf: OneSidedMemoryRegion = self.array.team_rt().alloc_one_sided_mem_region(1); let req = self.exec_am_local(InitGetAm { array: self.clone(), index: index, diff --git a/src/array/native_atomic/handle.rs b/src/array/native_atomic/handle.rs index fb985636..b049348e 100644 --- a/src/array/native_atomic/handle.rs +++ b/src/array/native_atomic/handle.rs @@ -13,21 +13,6 @@ use pin_project::{pin_project, pinned_drop}; #[must_use = " NativeAtomicArray 'new' handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] #[pin_project(PinnedDrop)] #[doc(alias = "Collective")] -/// This is a handle representing the operation of creating a new [NativeAtomicArray]. -/// This handled must either be awaited in an async context or blocked on in a non-async context for the operation to be performed. -/// Awaiting/blocking on the handle is a blocking collective call amongst all PEs in the NativeAtomicArray's team, only returning once every PE in the team has completed the call. -/// -/// # Collective Operation -/// Requires all PEs associated with the `NativeAtomicArray` to await/block the handle otherwise deadlock will occur (i.e. 
team barriers are being called internally) -/// -/// # Examples -/// ``` -/// use lamellar::array::prelude::*; -/// -/// let world = LamellarWorldBuilder::new().build(); -/// -/// let array: NativeAtomicArray = NativeAtomicArray::new(&world,100,Distribution::Cyclic).block(); -/// ``` pub(crate) struct NativeAtomicArrayHandle { pub(crate) team: Pin>, pub(crate) launched: bool, @@ -45,15 +30,7 @@ impl PinnedDrop for NativeAtomicArrayHandle { } impl NativeAtomicArrayHandle { - /// Used to drive creation of a new NativeAtomicArray - /// # Examples - /// - ///``` - /// use lamellar::array::prelude::*; - /// - /// let world = LamellarWorldBuilder::new().build(); - /// let array: NativeAtomicArray = NativeAtomicArray::new(&world,100,Distribution::Cyclic).block(); - pub fn block(mut self) -> NativeAtomicArray { + pub(crate) fn block(mut self) -> NativeAtomicArray { self.launched = true; RuntimeWarning::BlockingCall( "NativeAtomicArrayHandle::block", @@ -63,20 +40,9 @@ impl NativeAtomicArrayHandle { self.team.clone().block_on(self) } - /// This method will spawn the creation of the NativeAtomicArray on the work queue - /// - /// This function returns a handle that can be used to wait for the operation to complete - /// /// # Examples - /// - ///``` - /// use lamellar::array::prelude::*; - /// - /// let world = LamellarWorldBuilder::new().build(); - /// let array_task: NativeAtomicArray = NativeAtomicArray::new(&world,100,Distribution::Cyclic).spawn(); - /// // do some other work - /// let array = array_task.block(); + #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] - pub fn spawn(mut self) -> LamellarTask> { + pub(crate) fn spawn(mut self) -> LamellarTask> { self.launched = true; self.team.clone().spawn(self) } diff --git a/src/array/native_atomic/rdma.rs b/src/array/native_atomic/rdma.rs index 6d1cdfde..84553794 100644 --- a/src/array/native_atomic/rdma.rs +++ b/src/array/native_atomic/rdma.rs @@ -24,7 +24,7 @@ impl LamellarArrayInternalGet for NativeAtomicArray { } } unsafe fn internal_at(&self, index: usize) -> ArrayRdmaAtHandle { - let buf: OneSidedMemoryRegion = self.array.team_rt().alloc_one_sided_mem_region_or_panic(1); + let buf: OneSidedMemoryRegion = self.array.team_rt().alloc_one_sided_mem_region(1); let req = self.exec_am_local(InitGetAm { array: self.clone(), index: index, diff --git a/src/array/read_only.rs b/src/array/read_only.rs index 9e359de1..25a90c94 100644 --- a/src/array/read_only.rs +++ b/src/array/read_only.rs @@ -103,7 +103,7 @@ impl ReadOnlyArray { /// let world = LamellarWorldBuilder::new().build(); /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic).block(); /// // do something interesting... 
or not - /// let block_view = array.clone().use_distribution(Distribution::Block).block(); + /// let block_view = array.clone().use_distribution(Distribution::Block); ///``` pub fn use_distribution(self, distribution: Distribution) -> Self { ReadOnlyArray { diff --git a/src/array/read_only/handle.rs b/src/array/read_only/handle.rs index 95691682..ccd17e36 100644 --- a/src/array/read_only/handle.rs +++ b/src/array/read_only/handle.rs @@ -72,7 +72,7 @@ impl ReadOnlyArrayHandle { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array_task: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic).spawn(); + /// let array_task = ReadOnlyArray::::new(&world,100,Distribution::Cyclic).spawn(); /// // do some other work /// let array = array_task.block(); #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] diff --git a/src/array/unsafe.rs b/src/array/unsafe.rs index 7761c9aa..f1c42eba 100644 --- a/src/array/unsafe.rs +++ b/src/array/unsafe.rs @@ -282,7 +282,7 @@ impl UnsafeArray { /// let world = LamellarWorldBuilder::new().build(); /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic).block(); /// // do something interesting... or not - /// let block_view = array.clone().use_distribution(Distribution::Block).block(); + /// let block_view = array.clone().use_distribution(Distribution::Block); ///``` pub fn use_distribution(mut self, distribution: Distribution) -> Self { self.inner.distribution = distribution; diff --git a/src/array/unsafe/handle.rs b/src/array/unsafe/handle.rs index 58773079..2022821b 100644 --- a/src/array/unsafe/handle.rs +++ b/src/array/unsafe/handle.rs @@ -72,7 +72,7 @@ impl UnsafeArrayHandle { /// use lamellar::array::prelude::*; /// /// let world = LamellarWorldBuilder::new().build(); - /// let array_task: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic).spawn(); + /// let array_task = UnsafeArray::::new(&world,100,Distribution::Cyclic).spawn(); /// // do some other work /// let array = array_task.block(); #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. 
Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] diff --git a/src/array/unsafe/rdma.rs b/src/array/unsafe/rdma.rs index 8d62931c..6254bf28 100644 --- a/src/array/unsafe/rdma.rs +++ b/src/array/unsafe/rdma.rs @@ -173,7 +173,7 @@ impl UnsafeArray { .inner .data .team - .alloc_one_sided_mem_region_or_panic::(num_elems_pe); + .alloc_one_sided_mem_region::(num_elems_pe); unsafe { for i in 0..std::cmp::min(buf.len(), num_pes) { let mut k = 0; @@ -200,7 +200,7 @@ impl UnsafeArray { .inner .data .team - .alloc_one_sided_mem_region_or_panic::(num_elems_pe); + .alloc_one_sided_mem_region::(num_elems_pe); let mut k = 0; let pe = (start_pe + i) % num_pes; // let offset = global_index / num_pes + overflow; @@ -248,7 +248,7 @@ impl UnsafeArray { .inner .data .team - .alloc_one_sided_mem_region_or_panic::(num_elems_pe); + .alloc_one_sided_mem_region::(num_elems_pe); let rem = buf.len() % num_pes; // let temp_buf: LamellarMemoryRegion = buf.team_into(&self.inner.data.team); for i in 0..std::cmp::min(buf.len(), num_pes) { @@ -290,7 +290,7 @@ impl UnsafeArray { .inner .data .team - .alloc_one_sided_mem_region_or_panic::(num_elems_pe); + .alloc_one_sided_mem_region::(num_elems_pe); let pe = (start_pe + i) % num_pes; let offset = global_index / num_pes + overflow; let num_elems = (num_elems_pe - 1) + if i < rem { 1 } else { 0 }; @@ -639,7 +639,7 @@ impl UnsafeArray { } pub(crate) unsafe fn internal_at(&self, index: usize) -> ArrayRdmaAtHandle { - let buf: OneSidedMemoryRegion = self.team_rt().alloc_one_sided_mem_region_or_panic(1); + let buf: OneSidedMemoryRegion = self.team_rt().alloc_one_sided_mem_region(1); self.blocking_get(index, &buf); ArrayRdmaAtHandle { array: self.as_lamellar_byte_array(), diff --git a/src/darc.rs b/src/darc.rs index 797291c3..6e781908 100644 --- a/src/darc.rs +++ b/src/darc.rs @@ -1483,7 +1483,7 @@ impl Darc { /// /// let five_handle = Darc::new(&world,5); /// let five_as_localdarc = world.block_on(async move { - /// let five = five_handle.await; + /// let five = five_handle.await.expect("PE in world team"); /// five.into_localrw().await /// }); /// ``` diff --git a/src/lamellar_team.rs b/src/lamellar_team.rs index e1446d75..db26b706 100644 --- a/src/lamellar_team.rs +++ b/src/lamellar_team.rs @@ -7,7 +7,7 @@ use crate::lamellar_arch::{GlobalArch, IdError, LamellarArch, LamellarArchEnum, use crate::lamellar_env::LamellarEnv; use crate::lamellar_request::*; use crate::lamellar_world::LamellarWorld; -use crate::memregion::handle::SharedMemoryRegionHandle; +use crate::memregion::handle::{FallibleSharedMemoryRegionHandle, SharedMemoryRegionHandle}; use crate::memregion::{ one_sided::OneSidedMemoryRegion, shared::SharedMemoryRegion, Dist, LamellarMemoryRegion, MemoryRegion, RemoteMemoryRegion, @@ -573,7 +573,8 @@ impl ActiveMessaging for Arc { impl RemoteMemoryRegion for Arc { //#[tracing::instrument(skip_all)] - fn alloc_shared_mem_region(&self, size: usize) -> SharedMemoryRegionHandle { + + fn try_alloc_shared_mem_region(&self, size: usize) -> FallibleSharedMemoryRegionHandle { assert!(self.panic.load(Ordering::SeqCst) == 0); // self.team.barrier.barrier(); @@ -589,29 +590,53 @@ impl RemoteMemoryRegion for Arc { // self.team.barrier.barrier(); mr } + fn alloc_shared_mem_region(&self, size: usize) -> SharedMemoryRegionHandle { + assert!(self.panic.load(Ordering::SeqCst) == 0); + + // self.team.barrier.barrier(); + let mr = if self.team.num_world_pes == self.team.num_pes { + SharedMemoryRegion::new(size, self.team.clone(), 
AllocationType::Global) + } else { + SharedMemoryRegion::new( + size, + self.team.clone(), + AllocationType::Sub(self.team.arch.team_iter().collect::>()), + ) + }; + // self.team.barrier.barrier(); + mr + } + + fn try_alloc_one_sided_mem_region( + &self, + size: usize, + ) -> Result, anyhow::Error> { + assert!(self.panic.load(Ordering::SeqCst) == 0); + + OneSidedMemoryRegion::try_new(size, &self.team, self.team.lamellae.clone()) + } //#[tracing::instrument(skip_all)] fn alloc_one_sided_mem_region( &self, size: usize, - ) -> Result, anyhow::Error> { + ) -> OneSidedMemoryRegion { assert!(self.panic.load(Ordering::SeqCst) == 0); - let lmr = OneSidedMemoryRegion::try_new(size, &self.team, self.team.lamellae.clone()); - // while let Err(_err) = lmr { - // std::thread::yield_now(); - // // println!( - // // "out of Lamellar mem trying to alloc new pool {:?} {:?}", - // // size, - // // std::mem::size_of::() - // // ); - // self.team - // .lamellae - // .alloc_pool(size * std::mem::size_of::()); - // lmr = OneSidedMemoryRegion::try_new(size, &self.team, self.team.lamellae.clone()); - // } - // lmr.expect("out of memory") - lmr + let mut lmr = OneSidedMemoryRegion::try_new(size, &self.team, self.team.lamellae.clone()); + while let Err(_err) = lmr { + std::thread::yield_now(); + // println!( + // "out of Lamellar mem trying to alloc new pool {:?} {:?}", + // size, + // std::mem::size_of::() + // ); + self.team + .lamellae + .alloc_pool(size * std::mem::size_of::()); + lmr = OneSidedMemoryRegion::try_new(size, &self.team, self.team.lamellae.clone()); + } + lmr.expect("out of memory") } } @@ -2189,26 +2214,11 @@ impl LamellarTeamRT { /// * `size` - number of elements of T to allocate a memory region for -- (not size in bytes) /// //#[tracing::instrument(skip_all)] - pub fn alloc_one_sided_mem_region( + pub fn try_alloc_one_sided_mem_region( self: &Pin>, size: usize, ) -> Result, anyhow::Error> { - // let lmr: OneSidedMemoryRegion = - // OneSidedMemoryRegion::new(size, self, self.lamellae.clone()).into(); - // lmr - let lmr = OneSidedMemoryRegion::try_new(size, self, self.lamellae.clone()); - // while let Err(_err) = lmr { - // std::thread::yield_now(); - // // println!( - // // "out of Lamellar mem trying to alloc new pool {:?} {:?}", - // // size, - // // std::mem::size_of::() - // // ); - // self.lamellae.alloc_pool(size * std::mem::size_of::()); - // lmr = OneSidedMemoryRegion::try_new(size, self, self.lamellae.clone()); - // } - // lmr.expect("out of memory") - lmr + OneSidedMemoryRegion::try_new(size, self, self.lamellae.clone()) } /// allocate a local memory region from the asymmetric heap @@ -2218,13 +2228,10 @@ impl LamellarTeamRT { /// * `size` - number of elements of T to allocate a memory region for -- (not size in bytes) /// //#[tracing::instrument(skip_all)] - pub(crate) fn alloc_one_sided_mem_region_or_panic( + pub(crate) fn alloc_one_sided_mem_region( self: &Pin>, size: usize, ) -> OneSidedMemoryRegion{ - // let lmr: OneSidedMemoryRegion = - // OneSidedMemoryRegion::new(size, self, self.lamellae.clone()).into(); - // lmr let mut lmr = OneSidedMemoryRegion::try_new(size, self, self.lamellae.clone()); while let Err(_err) = lmr { std::thread::yield_now(); diff --git a/src/lamellar_world.rs b/src/lamellar_world.rs index c9aea9ff..81c2866b 100644 --- a/src/lamellar_world.rs +++ b/src/lamellar_world.rs @@ -3,7 +3,7 @@ use crate::lamellae::{create_lamellae, Backend, Lamellae, LamellaeComm, Lamellae use crate::lamellar_arch::LamellarArch; use crate::lamellar_env::LamellarEnv; use 
crate::lamellar_team::{LamellarTeam, LamellarTeamRT}; -use crate::memregion::handle::SharedMemoryRegionHandle; +use crate::memregion::handle::{FallibleSharedMemoryRegionHandle,SharedMemoryRegionHandle}; use crate::memregion::{ one_sided::OneSidedMemoryRegion, Dist, RemoteMemoryRegion, }; @@ -133,17 +133,29 @@ impl ActiveMessaging for LamellarWorld { } impl RemoteMemoryRegion for LamellarWorld { + //#[tracing::instrument(skip_all)] + fn try_alloc_shared_mem_region(&self, size: usize) -> FallibleSharedMemoryRegionHandle { + self.team.try_alloc_shared_mem_region::(size) + } + //#[tracing::instrument(skip_all)] fn alloc_shared_mem_region(&self, size: usize) -> SharedMemoryRegionHandle { - self.barrier(); self.team.alloc_shared_mem_region::(size) } //#[tracing::instrument(skip_all)] - fn alloc_one_sided_mem_region( + fn try_alloc_one_sided_mem_region( &self, size: usize, ) -> Result, anyhow::Error> { + self.team.try_alloc_one_sided_mem_region::(size) + } + + //#[tracing::instrument(skip_all)] + fn alloc_one_sided_mem_region( + &self, + size: usize, + ) -> OneSidedMemoryRegion { self.team.alloc_one_sided_mem_region::(size) } } diff --git a/src/memregion.rs b/src/memregion.rs index 05d47700..5851ed23 100644 --- a/src/memregion.rs +++ b/src/memregion.rs @@ -28,7 +28,7 @@ pub(crate) mod one_sided; pub use one_sided::OneSidedMemoryRegion; pub(crate) mod handle; -use handle::SharedMemoryRegionHandle; +use handle::{FallibleSharedMemoryRegionHandle, SharedMemoryRegionHandle}; use enum_dispatch::enum_dispatch; @@ -287,7 +287,7 @@ pub trait RegisteredMemoryRegion { /// /// let world = LamellarWorldBuilder::new().build(); /// - /// let mem_region: SharedMemoryRegion = world.alloc_shared_mem_region(1000); + /// let mem_region: SharedMemoryRegion = world.alloc_shared_mem_region(1000).block(); /// assert_eq!(mem_region.len(),1000); ///``` fn len(&self) -> usize; @@ -313,7 +313,7 @@ pub trait RegisteredMemoryRegion { /// /// let world = LamellarWorldBuilder::new().build(); /// - /// let mem_region: SharedMemoryRegion = world.alloc_shared_mem_region(1000); + /// let mem_region: SharedMemoryRegion = world.alloc_shared_mem_region(1000).block(); /// let slice = unsafe{mem_region.as_slice().expect("PE is part of the world team")}; ///``` unsafe fn as_slice(&self) -> MemResult<&[T]>; @@ -335,7 +335,7 @@ pub trait RegisteredMemoryRegion { /// /// let world = LamellarWorldBuilder::new().build(); /// - /// let mem_region: SharedMemoryRegion = world.alloc_shared_mem_region(1000); + /// let mem_region: SharedMemoryRegion = world.alloc_shared_mem_region(1000).block(); /// let val = unsafe{mem_region.at(999).expect("PE is part of the world team")}; ///``` unsafe fn at(&self, index: usize) -> MemResult<&T>; @@ -357,7 +357,7 @@ pub trait RegisteredMemoryRegion { /// /// let world = LamellarWorldBuilder::new().build(); /// - /// let mem_region: SharedMemoryRegion = world.alloc_shared_mem_region(1000); + /// let mem_region: SharedMemoryRegion = world.alloc_shared_mem_region(1000).block(); /// let slice =unsafe { mem_region.as_mut_slice().expect("PE is part of the world team")}; ///``` unsafe fn as_mut_slice(&self) -> MemResult<&mut [T]>; @@ -379,7 +379,7 @@ pub trait RegisteredMemoryRegion { /// /// let world = LamellarWorldBuilder::new().build(); /// - /// let mem_region: SharedMemoryRegion = world.alloc_shared_mem_region(1000); + /// let mem_region: SharedMemoryRegion = world.alloc_shared_mem_region(1000).block(); /// let ptr = unsafe { mem_region.as_ptr().expect("PE is part of the world team")}; ///``` unsafe fn 
as_ptr(&self) -> MemResult<*const T>; @@ -401,7 +401,7 @@ pub trait RegisteredMemoryRegion { /// /// let world = LamellarWorldBuilder::new().build(); /// - /// let mem_region: SharedMemoryRegion = world.alloc_shared_mem_region(1000); + /// let mem_region: SharedMemoryRegion = world.alloc_shared_mem_region(1000).block(); /// let ptr = unsafe { mem_region.as_mut_ptr().expect("PE is part of the world team")}; ///``` unsafe fn as_mut_ptr(&self) -> MemResult<*mut T>; @@ -440,7 +440,7 @@ pub trait SubRegion { /// let my_pe = world.my_pe(); /// let num_pes = world.num_pes(); /// - /// let mem_region: SharedMemoryRegion = world.alloc_shared_mem_region(100); + /// let mem_region: SharedMemoryRegion = world.alloc_shared_mem_region(100).block(); /// /// let sub_region = mem_region.sub_region(30..70); ///``` @@ -476,8 +476,8 @@ pub trait MemoryRegionRDMA { /// let my_pe = world.my_pe(); /// let num_pes = world.num_pes(); /// - /// let dst_mem_region: SharedMemoryRegion = world.alloc_shared_mem_region(num_pes*10); - /// let src_mem_region: SharedMemoryRegion = world.alloc_shared_mem_region(10); + /// let dst_mem_region: SharedMemoryRegion = world.alloc_shared_mem_region(num_pes*10).block(); + /// let src_mem_region: SharedMemoryRegion = world.alloc_shared_mem_region(10).block(); /// unsafe{ for elem in dst_mem_region.as_mut_slice().expect("PE in world team") {*elem = num_pes;}} /// unsafe{ for elem in src_mem_region.as_mut_slice().expect("PE in world team") {*elem = my_pe;}} /// @@ -516,8 +516,8 @@ pub trait MemoryRegionRDMA { /// let my_pe = world.my_pe(); /// let num_pes = world.num_pes(); /// - /// let dst_mem_region: SharedMemoryRegion = world.alloc_shared_mem_region(num_pes*10); - /// let src_mem_region: SharedMemoryRegion = world.alloc_shared_mem_region(10); + /// let dst_mem_region: SharedMemoryRegion = world.alloc_shared_mem_region(num_pes*10).block(); + /// let src_mem_region: SharedMemoryRegion = world.alloc_shared_mem_region(10).block(); /// unsafe{ for elem in dst_mem_region.as_mut_slice().expect("PE in world team") {*elem = num_pes;}} /// unsafe{ for elem in src_mem_region.as_mut_slice().expect("PE in world team") {*elem = my_pe;}} /// @@ -559,8 +559,8 @@ pub trait MemoryRegionRDMA { /// let my_pe = world.my_pe(); /// let num_pes = world.num_pes(); /// - /// let dst_mem_region: SharedMemoryRegion = world.alloc_shared_mem_region(num_pes*10); - /// let src_mem_region: SharedMemoryRegion = world.alloc_shared_mem_region(10); + /// let dst_mem_region: SharedMemoryRegion = world.alloc_shared_mem_region(num_pes*10).block(); + /// let src_mem_region: SharedMemoryRegion = world.alloc_shared_mem_region(10).block(); /// unsafe{ for elem in dst_mem_region.as_mut_slice().expect("PE in world team") {*elem = num_pes;}} /// unsafe{ for elem in src_mem_region.as_mut_slice().expect("PE in world team") {*elem = my_pe;}} /// @@ -599,8 +599,8 @@ pub trait MemoryRegionRDMA { /// let my_pe = world.my_pe(); /// let num_pes = world.num_pes(); /// - /// let src_mem_region: SharedMemoryRegion = world.alloc_shared_mem_region(10); - /// let dst_mem_region: SharedMemoryRegion = world.alloc_shared_mem_region(num_pes*10); + /// let src_mem_region: SharedMemoryRegion = world.alloc_shared_mem_region(10).block(); + /// let dst_mem_region: SharedMemoryRegion = world.alloc_shared_mem_region(num_pes*10).block(); /// /// unsafe{ for elem in src_mem_region.as_mut_slice().expect("PE in world team") {*elem = my_pe;}} /// unsafe{ for elem in dst_mem_region.as_mut_slice().expect("PE in world team") {*elem = num_pes;}} @@ -647,8 
+647,8 @@ pub trait MemoryRegionRDMA { /// let my_pe = world.my_pe(); /// let num_pes = world.num_pes(); /// - /// let src_mem_region: SharedMemoryRegion = world.alloc_shared_mem_region(10); - /// let dst_mem_region: SharedMemoryRegion = world.alloc_shared_mem_region(num_pes*10); + /// let src_mem_region: SharedMemoryRegion = world.alloc_shared_mem_region(10).block(); + /// let dst_mem_region: SharedMemoryRegion = world.alloc_shared_mem_region(num_pes*10).block(); /// /// unsafe{ for elem in src_mem_region.as_mut_slice().expect("PE in world team") {*elem = my_pe;}} /// unsafe{ for elem in dst_mem_region.as_mut_slice().expect("PE in world team") {*elem = num_pes;}} @@ -1214,6 +1214,14 @@ pub trait RemoteMemoryRegion { #[doc(alias = "Collective")] /// Allocate a shared memory region from the asymmetric heap. /// There will be `size` number of `T` elements on each PE. + /// + /// Note: If there is not enough memory in the lamellar heap on the calling PE + /// this call will trigger a "heap grow" operation (initiated and handled by the runtime), + /// this behavior can be disabled by setting the env variable "LAMELLAR_HEAP_MODE=static", + /// in which case this call will cause a panic if there is not enough memory. + /// + /// Alternatively, you can use the `try_alloc_shared_mem_region` method which returns + /// a `Result` and allows you to handle the error case when there is not enough memory. /// /// # Collective Operation /// Requires all PEs associated with the `array` to enter the call otherwise deadlock will occur (i.e. team barriers are being called internally) @@ -1223,11 +1231,32 @@ pub trait RemoteMemoryRegion { size: usize, ) -> SharedMemoryRegionHandle; + #[doc(alias = "Collective")] + /// Allocate a shared memory region from the asymmetric heap. + /// There will be `size` number of `T` elements on each PE. + /// + /// # Collective Operation + /// Requires all PEs associated with the `array` to enter the call otherwise deadlock will occur (i.e. team barriers are being called internally) + /// + fn try_alloc_shared_mem_region( + &self, + size: usize, + ) -> FallibleSharedMemoryRegionHandle; + + #[doc(alias("One-sided", "onesided"))] /// Allocate a one-sided memory region from the internal lamellar heap. /// This region only exists on the calling PE, but the returned handle can be /// sent to other PEs allowing remote access to the region. /// There will be `size` number of `T` elements on the calling PE. + /// + /// Note: If there is not enough memory in the lamellar heap on the calling PE + /// this call will trigger a "heap grow" operation (initiated and handled by the runtime), + /// this behavior can be disabled by setting the env variable "LAMELLAR_HEAP_MODE=static", + /// in which case this call will cause a panic if there is not enough memory. + /// + /// Alternatively, you can use the `try_alloc_one_sided_mem_region` method which returns + /// a `Result` and allows you to handle the error case when there is not enough memory. /// /// # One-sided Operation /// the calling PE will allocate the memory region locally, without intervention from the other PEs. @@ -1235,6 +1264,20 @@ pub trait RemoteMemoryRegion { fn alloc_one_sided_mem_region( &self, size: usize, + ) -> OneSidedMemoryRegion; + + #[doc(alias("One-sided", "onesided"))] + /// Allocate a one-sided memory region from the internal lamellar heap. + /// This region only exists on the calling PE, but the returned handle can be + /// sent to other PEs allowing remote access to the region. 
+ /// There will be `size` number of `T` elements on the calling PE. + /// + /// # One-sided Operation + /// the calling PE will allocate the memory region locally, without intervention from the other PEs. + /// + fn try_alloc_one_sided_mem_region( + &self, + size: usize, ) -> Result, anyhow::Error>; } diff --git a/src/memregion/handle.rs b/src/memregion/handle.rs new file mode 100644 index 00000000..4e7f4626 --- /dev/null +++ b/src/memregion/handle.rs @@ -0,0 +1,257 @@ +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; + +use super::SharedMemoryRegion; +use crate::scheduler::LamellarTask; +use crate::warnings::RuntimeWarning; +use crate::{Dist, LamellarTeamRT}; + +use futures_util::{ Future}; +use pin_project::{pin_project, pinned_drop}; + +#[must_use = " SharedMemoryRegion 'new' handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] +#[pin_project(PinnedDrop)] +#[doc(alias = "Collective")] +/// This is a handle representing the operation of creating a new [SharedMemoryRegion]. +/// This handle must either be awaited in an async context or blocked on in a non-async context for the operation to be performed. +/// Awaiting/blocking on the handle is a blocking collective call amongst all PEs in the SharedMemoryRegion's team, only returning once every PE in the team has completed the call. +/// +/// # Collective Operation +/// Requires all PEs associated with the `SharedMemoryRegion` to await/block the handle otherwise deadlock will occur (i.e. team barriers are being called internally) +/// +/// # Examples +/// ``` +/// use lamellar::memregion::prelude::*; +/// +/// let world = LamellarWorldBuilder::new().build(); +/// +/// let memregion: SharedMemoryRegion = world.alloc_shared_mem_region(100).block(); +/// ``` +pub struct FallibleSharedMemoryRegionHandle { + pub(crate) team: Pin>, + pub(crate) launched: bool, + #[pin] + pub(crate) creation_future: + Pin, anyhow::Error>> + Send>>, +} + +#[pinned_drop] +impl PinnedDrop for FallibleSharedMemoryRegionHandle { + fn drop(self: Pin<&mut Self>) { + if !self.launched { + RuntimeWarning::DroppedHandle("a FallibleSharedMemoryRegionHandle").print(); + } + } +} + +impl FallibleSharedMemoryRegionHandle { + /// Used to drive creation of a new SharedMemoryRegion + /// # Examples + /// + ///``` + /// use lamellar::memregion::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let memregion: SharedMemoryRegion = world.alloc_shared_mem_region(100).block(); + pub fn block(mut self) -> Result, anyhow::Error> { + self.launched = true; + RuntimeWarning::BlockingCall( + "SharedMemoryRegionHandle::block", + ".spawn() or .await", + ) + .print(); + self.team.clone().block_on(self) + } + + /// This method will spawn the creation of the SharedMemoryRegion on the work queue + /// + /// This function returns a handle that can be used to wait for the operation to complete + /// # Examples + /// + ///``` + /// use lamellar::memregion::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let memregion_task = world.alloc_shared_mem_region::(100).spawn(); + /// // do some other work + /// let memregion = memregion_task.block(); + #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context.
Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] + pub fn spawn(mut self) -> LamellarTask, anyhow::Error>> { + self.launched = true; + self.team.clone().spawn(self) + } +} + +impl Future for FallibleSharedMemoryRegionHandle { + type Output = Result, anyhow::Error>; + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + self.launched = true; + let mut this = self.project(); + this.creation_future.as_mut().poll(cx) + } +} + +#[must_use = " SharedMemoryRegion 'new' handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] +#[pin_project(PinnedDrop)] +#[doc(alias = "Collective")] +/// This is a handle representing the operation of creating a new [SharedMemoryRegion]. +/// This handle must either be awaited in an async context or blocked on in a non-async context for the operation to be performed. +/// Awaiting/blocking on the handle is a blocking collective call amongst all PEs in the SharedMemoryRegion's team, only returning once every PE in the team has completed the call. +/// +/// # Collective Operation +/// Requires all PEs associated with the `SharedMemoryRegion` to await/block the handle otherwise deadlock will occur (i.e. team barriers are being called internally) +/// +/// # Examples +/// ``` +/// use lamellar::memregion::prelude::*; +/// +/// let world = LamellarWorldBuilder::new().build(); +/// +/// let memregion: SharedMemoryRegion = world.alloc_shared_mem_region(100).block(); +/// ``` +pub struct SharedMemoryRegionHandle { + pub(crate) team: Pin>, + pub(crate) launched: bool, + #[pin] + pub(crate) creation_future: + Pin> + Send>>, +} + +#[pinned_drop] +impl PinnedDrop for SharedMemoryRegionHandle { + fn drop(self: Pin<&mut Self>) { + if !self.launched { + RuntimeWarning::DroppedHandle("a SharedMemoryRegionHandle").print(); + } + } +} + +impl SharedMemoryRegionHandle { + /// Used to drive creation of a new SharedMemoryRegion + /// # Examples + /// + ///``` + /// use lamellar::memregion::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let memregion: SharedMemoryRegion = world.alloc_shared_mem_region(100).block(); + pub fn block(mut self) -> SharedMemoryRegion { + self.launched = true; + RuntimeWarning::BlockingCall( + "SharedMemoryRegionHandle::block", + ".spawn() or .await", + ) + .print(); + self.team.clone().block_on(self) + } + + /// This method will spawn the creation of the SharedMemoryRegion on the work queue + /// + /// This function returns a handle that can be used to wait for the operation to complete + /// # Examples + /// + ///``` + /// use lamellar::memregion::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let memregion_task = world.alloc_shared_mem_region::(100).spawn(); + /// // do some other work + /// let memregion = memregion_task.block(); + #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context.
Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] + pub fn spawn(mut self) -> LamellarTask> { + self.launched = true; + self.team.clone().spawn(self) + } +} + +impl Future for SharedMemoryRegionHandle { + type Output = SharedMemoryRegion; + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + self.launched = true; + let mut this = self.project(); + this.creation_future.as_mut().poll(cx) + } +} + +// #[must_use = " OneSidedMemoryRegion 'new' handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] +// #[pin_project(PinnedDrop)] +// #[doc(alias = "Collective")] +// /// This is a handle representing the operation of creating a new [OneSidedMemoryRegion]. +// /// This handled must either be awaited in an async context or blocked on in a non-async context for the operation to be performed. +// /// Awaiting/blocking on the handle is a blocking collective call amongst all PEs in the OneSidedMemoryRegion's team, only returning once every PE in the team has completed the call. +// /// +// /// # Collective Operation +// /// Requires all PEs associated with the `OneSidedMemoryRegion` to await/block the handle otherwise deadlock will occur (i.e. team barriers are being called internally) +// /// +// /// # Examples +// /// ``` +// /// use lamellar::array::prelude::*; +// /// +// /// let world = LamellarWorldBuilder::new().build(); +// /// +// /// let array: OneSidedMemoryRegion = OneSidedMemoryRegion::new(&world,100).block(); +// /// ``` +// pub(crate) struct OneSidedMemoryRegionHandle { +// pub(crate) team: Pin>, +// pub(crate) launched: bool, +// #[pin] +// pub(crate) creation_future: +// Pin, anyhow::Error>> + Send>>, +// } + +// #[pinned_drop] +// impl PinnedDrop for OneSidedMemoryRegionHandle { +// fn drop(self: Pin<&mut Self>) { +// if !self.launched { +// RuntimeWarning::DroppedHandle("a OneSidedMemoryRegionHandle").print(); +// } +// } +// } + +// impl OneSidedMemoryRegionHandle { +// /// Used to drive creation of a new OneSidedMemoryRegion +// /// # Examples +// /// +// ///``` +// /// use lamellar::array::prelude::*; +// /// +// /// let world = LamellarWorldBuilder::new().build(); +// /// let array: OneSidedMemoryRegion = OneSidedMemoryRegion::new(&world,100).block(); +// pub fn block(mut self) -> Result, anyhow::Error> { +// self.launched = true; +// RuntimeWarning::BlockingCall( +// "OneSidedMemoryRegionHandle::block", +// ".spawn() or.await", +// ) +// .print(); +// self.team.clone().block_on(self) +// } + +// /// This method will spawn the creation of the OneSidedMemoryRegion on the work queue +// /// +// /// This function returns a handle that can be used to wait for the operation to complete +// /// /// # Examples +// /// +// ///``` +// /// use lamellar::array::prelude::*; +// /// +// /// let world = LamellarWorldBuilder::new().build(); +// /// let array_task: OneSidedMemoryRegion = OneSidedMemoryRegion::new(&world,100).spawn(); +// /// // do some other work +// /// let array = array_task.block(); +// #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. 
Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] +// pub fn spawn(mut self) -> LamellarTask, anyhow::Error>> { +// self.launched = true; +// self.team.clone().spawn(self) +// } +// } + +// impl Future for OneSidedMemoryRegionHandle { +// type Output = Result, anyhow::Error>; +// fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { +// self.launched = true; +// let mut this = self.project(); +// this.creation_future.as_mut().poll(cx) +// } +// } diff --git a/src/memregion/shared.rs b/src/memregion/shared.rs index da2662ff..f43fd2d9 100644 --- a/src/memregion/shared.rs +++ b/src/memregion/shared.rs @@ -36,7 +36,7 @@ use std::ops::Bound; /// /// let world = LamellarWorldBuilder::new().build(); /// -/// let world_mem_region: SharedMemoryRegion = world.alloc_shared_mem_region(1000); +/// let world_mem_region: SharedMemoryRegion = world.alloc_shared_mem_region(1000).block(); /// ``` #[derive(serde::Serialize, serde::Deserialize, Clone)] pub struct SharedMemoryRegion { @@ -93,22 +93,52 @@ impl SharedMemoryRegion { // SharedMemoryRegion::try_new(size, team, alloc).expect("Out of memory") // } - pub(crate) fn try_new( + pub(crate) fn new( size: usize, team: Pin>, alloc: AllocationType, ) -> SharedMemoryRegionHandle { // println!("creating new shared mem region {:?} {:?}",size,alloc); - // Ok(SharedMemoryRegion { - // mr: Darc::try_new(team.clone(), mr, crate::darc::DarcMode::Darc) - // .expect("memregions can only be created on a member of the team"), - // sub_region_offset: 0, - // sub_region_size: size, - // phantom: PhantomData, - // }) - SharedMemoryRegionHandle { + team: team.clone(), + launched: false, + creation_future: Box::pin(async move { + team.async_barrier().await; + let mut mr_t = + MemoryRegion::::try_new(size, team.lamellae.clone(), alloc.clone()); + while let Err(e) = mr_t { + async_std::task::yield_now().await; + team.lamellae.alloc_pool(size * std::mem::size_of::()); + mr_t = MemoryRegion::try_new(size, team.lamellae.clone(), alloc.clone()); + } + + let mr = unsafe { mr_t.expect("enough memory should have been allocated").to_base::() }; + SharedMemoryRegion { + mr: Darc::async_try_new_with_drop( + team.clone(), + mr, + crate::darc::DarcMode::Darc, + None, + ) + .await + .expect("memregions can only be created on a member of the team"), + sub_region_offset: 0, + sub_region_size: size, + phantom: PhantomData, + } + }), + } + } + + pub(crate) fn try_new( + size: usize, + team: Pin>, + alloc: AllocationType, + ) -> FallibleSharedMemoryRegionHandle { + // println!("creating new shared mem region {:?} {:?}",size,alloc); + + FallibleSharedMemoryRegionHandle { team: team.clone(), launched: false, creation_future: Box::pin(async move { diff --git a/tests/array/arithmetic_ops/add_test.rs b/tests/array/arithmetic_ops/add_test.rs index 4312e379..250c9e45 100644 --- a/tests/array/arithmetic_ops/add_test.rs +++ b/tests/array/arithmetic_ops/add_test.rs @@ -426,7 +426,7 @@ macro_rules! input_test { // LMR------------------------------ unsafe { - let lmr = world.alloc_one_sided_mem_region(array.len()).unwrap(); + let lmr = world.alloc_one_sided_mem_region(array.len()); let slice = lmr.as_mut_slice().unwrap(); for i in 0..array.len() { slice[i] = i; @@ -438,7 +438,7 @@ macro_rules! 
input_test { // SMR------------------------------ unsafe { - let smr = world.alloc_shared_mem_region(array.len()).block().unwrap(); + let smr = world.alloc_shared_mem_region(array.len()).block(); let slice = smr.as_mut_slice().unwrap(); for i in 0..array.len() { diff --git a/tests/array/arithmetic_ops/fetch_add_test.rs b/tests/array/arithmetic_ops/fetch_add_test.rs index 8a5c80cd..fbded8e5 100644 --- a/tests/array/arithmetic_ops/fetch_add_test.rs +++ b/tests/array/arithmetic_ops/fetch_add_test.rs @@ -524,7 +524,7 @@ macro_rules! input_test{ // scoped &LMR------------------------------ let mut reqs = vec![]; unsafe { - let lmr=world.alloc_one_sided_mem_region(array.len()).unwrap(); + let lmr=world.alloc_one_sided_mem_region(array.len()); let slice = lmr.as_mut_slice().unwrap(); for i in 0..array.len(){ slice[i]=i; @@ -536,7 +536,7 @@ macro_rules! input_test{ // scoped SMR------------------------------ let mut reqs = vec![]; unsafe { - let smr=world.alloc_shared_mem_region(array.len()).block().unwrap(); + let smr=world.alloc_shared_mem_region(array.len()).block(); let slice = smr.as_mut_slice().unwrap(); for i in 0..array.len(){ slice[i]=i; diff --git a/tests/array/rdma/blocking_get_test.rs b/tests/array/rdma/blocking_get_test.rs index 9fc83a93..fe34f4fd 100644 --- a/tests/array/rdma/blocking_get_test.rs +++ b/tests/array/rdma/blocking_get_test.rs @@ -127,7 +127,7 @@ macro_rules! blocking_get_test{ #[allow(unused_mut)] let mut array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).block().into(); //convert into abstract LamellarArray, distributed len is total_len - let shared_mem_region: LamellarMemoryRegion<$t> = world.alloc_shared_mem_region(mem_seg_len).block().unwrap().into(); //Convert into abstract LamellarMemoryRegion, each local segment is total_len + let shared_mem_region: LamellarMemoryRegion<$t> = world.alloc_shared_mem_region(mem_seg_len).block().into(); //Convert into abstract LamellarMemoryRegion, each local segment is total_len //initialize array initialize_array!($array, array, $t); array.wait_all(); diff --git a/tests/array/rdma/get_test.rs b/tests/array/rdma/get_test.rs index d3608ec0..261d53f2 100644 --- a/tests/array/rdma/get_test.rs +++ b/tests/array/rdma/get_test.rs @@ -128,7 +128,7 @@ macro_rules! get_test{ let mut array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).block().into(); //convert into abstract LamellarArray, distributed len is total_len // println!("bout to initialize"); initialize_array!($array, array, $t); - let shared_mem_region: LamellarMemoryRegion<$t> = world.alloc_shared_mem_region(mem_seg_len).block().unwrap().into(); //Convert into abstract LamellarMemoryRegion, each local segment is total_len + let shared_mem_region: LamellarMemoryRegion<$t> = world.alloc_shared_mem_region(mem_seg_len).block().into(); //Convert into abstract LamellarMemoryRegion, each local segment is total_len //initialize array array.wait_all(); diff --git a/tests/array/rdma/put_test.rs b/tests/array/rdma/put_test.rs index 3aefb3bd..335280af 100644 --- a/tests/array/rdma/put_test.rs +++ b/tests/array/rdma/put_test.rs @@ -64,7 +64,7 @@ macro_rules! 
put_test{ let mut success = true; let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).block().into(); //convert into abstract LamellarArray, distributed len is total_len - let shared_mem_region: LamellarMemoryRegion<$t> = world.alloc_shared_mem_region(mem_seg_len).block().unwrap().into(); //Convert into abstract LamellarMemoryRegion, each local segment is total_len + let shared_mem_region: LamellarMemoryRegion<$t> = world.alloc_shared_mem_region(mem_seg_len).block().into(); //Convert into abstract LamellarMemoryRegion, each local segment is total_len //initialize array let init_val = my_pe as $t; initialize_array!($array, array, init_val); From 6bde847f393c0ce53775306b1f08cc617c4efb2b Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Fri, 8 Nov 2024 11:48:39 -0800 Subject: [PATCH 105/116] ensure Darc Drops are awaited properly --- src/darc.rs | 4 ++++ src/lamellar_team.rs | 9 ++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/darc.rs b/src/darc.rs index 6e781908..072592c3 100644 --- a/src/darc.rs +++ b/src/darc.rs @@ -1620,6 +1620,8 @@ macro_rules! launch_drop { ); } // team.print_cnt(); + team.team_counters.inc_outstanding(1); + team.world_counters.inc_outstanding(1); //ensure we don't trigger any warnings in wait all let mut am = team.exec_am_local(DroppedWaitAM { inner_addr: $inner_addr as *const u8 as usize, mode_addr: $inner.mode_addr, @@ -1629,6 +1631,8 @@ macro_rules! launch_drop { phantom: PhantomData::, }); am.launch(); + team.team_counters.dec_outstanding(1); + team.world_counters.dec_outstanding(1); }; } diff --git a/src/lamellar_team.rs b/src/lamellar_team.rs index db26b706..456a9d9d 100644 --- a/src/lamellar_team.rs +++ b/src/lamellar_team.rs @@ -1424,6 +1424,8 @@ impl LamellarTeamRT { let mut temp_now = Instant::now(); let mut orig_reqs = self.team_counters.send_req_cnt.load(Ordering::SeqCst); let mut orig_launched = self.team_counters.launched_req_cnt.load(Ordering::SeqCst); + let mut world_orig_reqs = self.world_counters.send_req_cnt.load(Ordering::SeqCst); + let mut world_orig_launched = self.world_counters.launched_req_cnt.load(Ordering::SeqCst); // println!( // "in team wait_all mype: {:?} cnt: {:?} {:?}", @@ -1436,10 +1438,15 @@ impl LamellarTeamRT { || orig_reqs != self.team_counters.send_req_cnt.load(Ordering::SeqCst) || orig_launched != self.team_counters.launched_req_cnt.load(Ordering::SeqCst)) || (self.parent.is_none() - && self.world_counters.outstanding_reqs.load(Ordering::SeqCst) > 0)) + && (self.world_counters.outstanding_reqs.load(Ordering::SeqCst) > 0 + || world_orig_reqs != self.world_counters.send_req_cnt.load(Ordering::SeqCst) + || world_orig_launched != self.world_counters.launched_req_cnt.load(Ordering::SeqCst)) + )) { orig_reqs = self.team_counters.send_req_cnt.load(Ordering::SeqCst); orig_launched = self.team_counters.launched_req_cnt.load(Ordering::SeqCst); + world_orig_reqs = self.world_counters.send_req_cnt.load(Ordering::SeqCst); + world_orig_launched = self.world_counters.launched_req_cnt.load(Ordering::SeqCst); // std::thread::yield_now(); // self.flush(); if std::thread::current().id() != *crate::MAIN_THREAD { From 90339a5878ca3d0d992356cc6a72f3e3d01f10e2 Mon Sep 17 00:00:00 2001 From: "ryan.friese@pnnl.gov" Date: Sun, 10 Nov 2024 16:12:42 -0800 Subject: [PATCH 106/116] update the array.into_* methods to use the handle based design pattern since these are remote operations --- Cargo.toml | 11 +- examples/array_examples/array_batch_add.rs | 5 +- 
examples/array_examples/dist_array_reduce.rs | 2 +- .../array_examples/distributed_iteration.rs | 8 +- examples/array_examples/histo.rs | 5 +- examples/array_examples/local_iteration.rs | 8 +- examples/array_examples/onesided_iteration.rs | 2 +- examples/bandwidths/atomic_array_get_bw.rs | 8 +- examples/bandwidths/atomic_array_put_bw.rs | 8 +- examples/bandwidths/readonly_array_get_bw.rs | 7 +- .../readonly_array_get_unchecked_bw.rs | 7 +- examples/kernels/dft_proxy.rs | 21 +- examples/kernels/parallel_array_gemm.rs | 4 +- .../kernels/parallel_blocked_array_gemm.rs | 11 +- .../safe_parallel_blocked_array_gemm.rs | 13 +- examples/kernels/serial_array_gemm.rs | 6 +- examples/misc/lamellar_env.rs | 6 +- src/array.rs | 36 +- src/array/atomic.rs | 118 +-- src/array/atomic/iteration.rs | 4 +- src/array/generic_atomic.rs | 29 +- src/array/generic_atomic/iteration.rs | 6 +- src/array/global_lock_atomic.rs | 113 ++- src/array/global_lock_atomic/iteration.rs | 12 +- src/array/handle.rs | 495 +++++++++++- .../distributed_iterator/consumer/collect.rs | 12 +- src/array/local_lock_atomic.rs | 119 ++- src/array/local_lock_atomic/iteration.rs | 6 +- src/array/native_atomic.rs | 56 +- src/array/native_atomic/iteration.rs | 6 +- src/array/read_only.rs | 130 ++-- src/array/read_only/iteration.rs | 6 +- src/array/unsafe.rs | 130 +++- src/array/unsafe/iteration.rs | 4 +- src/darc/handle.rs | 6 +- src/lamellar_team.rs | 25 +- src/utils.rs | 234 +++--- tests/array/arithmetic_ops/add_test.rs | 8 +- tests/array/arithmetic_ops/fetch_add_test.rs | 719 +++++++++--------- tests/array/rdma/blocking_get_test.rs | 8 +- tests/array/rdma/get_test.rs | 8 +- 41 files changed, 1498 insertions(+), 924 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 0a2740e7..49f151a8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,7 +15,7 @@ categories = ["asynchronous","concurrency", "network-programming","science"] lamellar-impl = { version = "0.7.0", path = "impl" } #rofisys = { version ="0.3", optional = true } rofisys = {git = "https://github.com/pnnl/rofi-sys.git", branch = "master", optional = true} -inventory = "0.3" +inventory = "0.3" serde = { version = "1.0.147", features = ["derive"] } serde_bytes = "0.11.7" serde_with = "3.0.0" @@ -84,7 +84,7 @@ default=[] opt-level=3 lto=false codegen-units=1 -debug = true +debug = true [lib] @@ -170,10 +170,6 @@ path = "tests/array/atomic_ops/swap_test.rs" name = "compare_exchange_test" path = "tests/array/atomic_ops/compare_exchange_test.rs" -[[example]] -name = "array_into_test" -path = "tests/array/array_into_test.rs" - ##------------ Bandwidth Examples -----------------## [[example]] @@ -480,6 +476,3 @@ path="examples/hello_world/hello_world_array.rs" [[example]] name="hello_world_array_iteration" path="examples/hello_world/hello_world_array_iteration.rs" - - - diff --git a/examples/array_examples/array_batch_add.rs b/examples/array_examples/array_batch_add.rs index f4891d3f..2a85b386 100644 --- a/examples/array_examples/array_batch_add.rs +++ b/examples/array_examples/array_batch_add.rs @@ -28,7 +28,8 @@ fn main() { let num_pes = world.num_pes(); let my_pe = world.my_pe(); let array_size = 1000000; - let array = AtomicArray::::new(world.clone(), array_size, Distribution::Block).block(); //non intrinsic atomic, non bitwise + let array = + AtomicArray::::new(world.clone(), array_size, Distribution::Block).block(); //non intrinsic atomic, non bitwise //create vec of random indices between 0 & 1000000 let mut rng = rand::thread_rng(); let indices = (0..10_000_000) @@ -110,7 +111,7 @@ fn 
main() { println!("{:?}", timer.elapsed()); } println!("{:?}", world.block_on(array.sum())); - let array = array.into_unsafe(); + let array = array.into_unsafe().block(); world.barrier(); // let iter = vals.into_iter(); diff --git a/examples/array_examples/dist_array_reduce.rs b/examples/array_examples/dist_array_reduce.rs index d028ba85..62ffc18c 100644 --- a/examples/array_examples/dist_array_reduce.rs +++ b/examples/array_examples/dist_array_reduce.rs @@ -151,7 +151,7 @@ fn main() { .for_each(|x| println!("x: {:?}", x)) .block() }; - let block_array = block_array.into_read_only(); + let block_array = block_array.into_read_only().block(); let _ = block_array.sum().block(); let one_elem_array = UnsafeArray::::new(world.team(), 1, Distribution::Block).block(); diff --git a/examples/array_examples/distributed_iteration.rs b/examples/array_examples/distributed_iteration.rs index 4c021976..aef6ed5a 100644 --- a/examples/array_examples/distributed_iteration.rs +++ b/examples/array_examples/distributed_iteration.rs @@ -5,8 +5,10 @@ fn main() { let world = lamellar::LamellarWorldBuilder::new().build(); let my_pe = world.my_pe(); let _num_pes = world.num_pes(); - let block_array = LocalLockArray::::new(world.team(), ARRAY_LEN, Distribution::Block).block(); - let cyclic_array = AtomicArray::::new(world.team(), ARRAY_LEN, Distribution::Cyclic).block(); + let block_array = + LocalLockArray::::new(world.team(), ARRAY_LEN, Distribution::Block).block(); + let cyclic_array = + AtomicArray::::new(world.team(), ARRAY_LEN, Distribution::Cyclic).block(); // We expose multiple ways to iterate over a lamellar array // the first approach introduces what we call a distributed iterator (inspired by Rayon's parallel iterators). @@ -33,7 +35,7 @@ fn main() { .for_each(move |elem| elem.store(my_pe)) .block(); - // let block_array = block_array.into_read_only(); + // let block_array = block_array.into_read_only().block(); block_array.print(); cyclic_array.print(); diff --git a/examples/array_examples/histo.rs b/examples/array_examples/histo.rs index 0eb5da8e..e1794eb0 100644 --- a/examples/array_examples/histo.rs +++ b/examples/array_examples/histo.rs @@ -9,7 +9,9 @@ const NUM_UPDATES_PER_PE: usize = 100000; fn main() { let world = lamellar::LamellarWorldBuilder::new().build(); - let array = AtomicArray::::new(&world, ARRAY_SIZE, lamellar::Distribution::Block).block(); + let array = + AtomicArray::::new(&world, ARRAY_SIZE, lamellar::Distribution::Block).block(); + let mut rng: StdRng = SeedableRng::seed_from_u64(world.my_pe() as u64); let range = rand::distributions::Uniform::new(0, ARRAY_SIZE); @@ -19,6 +21,7 @@ fn main() { as &mut dyn Iterator, 1, ); + world.block_on(histo); world.barrier(); println!( diff --git a/examples/array_examples/local_iteration.rs b/examples/array_examples/local_iteration.rs index 13870dc2..8bc26322 100644 --- a/examples/array_examples/local_iteration.rs +++ b/examples/array_examples/local_iteration.rs @@ -5,8 +5,10 @@ fn main() { let world = lamellar::LamellarWorldBuilder::new().build(); let my_pe = world.my_pe(); let _num_pes = world.num_pes(); - let block_array = AtomicArray::::new(world.team(), ARRAY_LEN, Distribution::Block).block(); - let cyclic_array = AtomicArray::::new(world.team(), ARRAY_LEN, Distribution::Cyclic).block(); + let block_array = + AtomicArray::::new(world.team(), ARRAY_LEN, Distribution::Block).block(); + let cyclic_array = + AtomicArray::::new(world.team(), ARRAY_LEN, Distribution::Cyclic).block(); // We expose multiple ways to iterate over a lamellar array // the 
first approach introduces what we call a distributed iterator (inspired by Rayon's parallel iterators). @@ -29,7 +31,7 @@ fn main() { .for_each(move |elem| elem.store(my_pe)) .block(); - // let block_array = block_array.into_read_only(); + // let block_array = block_array.into_read_only().block(); block_array.print(); cyclic_array.print(); diff --git a/examples/array_examples/onesided_iteration.rs b/examples/array_examples/onesided_iteration.rs index 36e75687..95d90b28 100644 --- a/examples/array_examples/onesided_iteration.rs +++ b/examples/array_examples/onesided_iteration.rs @@ -127,7 +127,7 @@ fn main() { // let array = LocalLockArray::::new(&world, 8, Distribution::Block).block(); // let my_pe = world.my_pe(); // let num_pes = world.num_pes(); - let block_array = block_array.into_local_lock(); + let block_array = block_array.into_local_lock().block(); block_array .dist_iter_mut() .for_each(move |e| *e = my_pe) diff --git a/examples/bandwidths/atomic_array_get_bw.rs b/examples/bandwidths/atomic_array_get_bw.rs index 7120c97d..edaed240 100644 --- a/examples/bandwidths/atomic_array_get_bw.rs +++ b/examples/bandwidths/atomic_array_get_bw.rs @@ -24,9 +24,9 @@ fn main() { .local_iter_mut() .for_each(move |elem| *elem = num_pes as u8) .block(); //this is pretty slow for atomic arrays as we perform an atomic store for 2^30 elements, so use locallock for initializiation - let array = array.into_atomic(); //this enforces a wait_all and barrier - // array.wait_all(); - // array.barrier(); + let array = array.into_atomic().block(); //this enforces a wait_all and barrier + // array.wait_all(); + // array.barrier(); world.barrier(); let s = Instant::now(); @@ -101,7 +101,7 @@ fn main() { (sum as f64 / 1048576.0) / cur_t, // throughput of user payload ((sum*(num_pes-1) as u64) as f64 / 1048576.0) / cur_t, cur - old, //total bytes sent including overhead - (cur - old) as f64 / cur_t, //throughput including overhead + (cur - old) as f64 / cur_t, //throughput including overhead (mbs_c -mbs_o )/ cur_t, (cur_t/cnt as f64) * 1_000_000 as f64 , ); diff --git a/examples/bandwidths/atomic_array_put_bw.rs b/examples/bandwidths/atomic_array_put_bw.rs index 393dbee4..86e67682 100644 --- a/examples/bandwidths/atomic_array_put_bw.rs +++ b/examples/bandwidths/atomic_array_put_bw.rs @@ -24,7 +24,7 @@ fn main() { .dist_iter_mut() .for_each(move |elem| *elem = 255 as u8) .block(); //this is can be pretty slow for atomic arrays as we perform an atomic store for 2^30 elements, local lock tends to perform better - let mut array = array.into_atomic(); //so we simply convert the LocalLockArray array to atomic after initalization + let mut array = array.into_atomic().block(); //so we simply convert the LocalLockArray array to atomic after initalization world.barrier(); let s = Instant::now(); @@ -91,7 +91,7 @@ fn main() { (sum as f64 / 1048576.0) / cur_t, // throughput of user payload ((sum*(num_pes-1) as u64) as f64 / 1048576.0) / cur_t, cur - old, //total bytes sent including overhead - (cur - old) as f64 / cur_t, //throughput including overhead + (cur - old) as f64 / cur_t, //throughput including overhead (mbs_c -mbs_o )/ cur_t, (cur_t/cnt as f64) * 1_000_000 as f64 , ); @@ -103,11 +103,11 @@ fn main() { // i.store(255 as u8); // } // }; - let temp = array.into_local_lock(); + let temp = array.into_local_lock().block(); temp.dist_iter_mut() .for_each(move |elem| *elem = 255 as u8) .block(); //this is pretty slow for atomic arrays as we perform an atomic store for 2^30 elements - array = temp.into_atomic(); + array = 
temp.into_atomic().block(); world.barrier(); } if my_pe == 0 { diff --git a/examples/bandwidths/readonly_array_get_bw.rs b/examples/bandwidths/readonly_array_get_bw.rs index 5055b9ed..4ee9a860 100644 --- a/examples/bandwidths/readonly_array_get_bw.rs +++ b/examples/bandwidths/readonly_array_get_bw.rs @@ -13,7 +13,8 @@ fn main() { let world = lamellar::LamellarWorldBuilder::new().build(); let my_pe = world.my_pe(); let num_pes = world.num_pes(); - let array: UnsafeArray = UnsafeArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block).block(); + let array: UnsafeArray = + UnsafeArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block).block(); let data = world.alloc_one_sided_mem_region::(ARRAY_LEN); unsafe { for i in data.as_mut_slice().unwrap() { @@ -26,7 +27,7 @@ fn main() { } array.barrier(); - let array = array.into_read_only(); + let array = array.into_read_only().block(); world.barrier(); let s = Instant::now(); @@ -99,7 +100,7 @@ fn main() { (sum as f64 / 1048576.0) / cur_t, // throughput of user payload ((sum*(num_pes-1) as u64) as f64 / 1048576.0) / cur_t, cur - old, //total bytes sent including overhead - (cur - old) as f64 / cur_t, //throughput including overhead + (cur - old) as f64 / cur_t, //throughput including overhead (mbs_c -mbs_o )/ cur_t, (cur_t/cnt as f64) * 1_000_000 as f64 , ); diff --git a/examples/bandwidths/readonly_array_get_unchecked_bw.rs b/examples/bandwidths/readonly_array_get_unchecked_bw.rs index 76c27105..e4124292 100644 --- a/examples/bandwidths/readonly_array_get_unchecked_bw.rs +++ b/examples/bandwidths/readonly_array_get_unchecked_bw.rs @@ -12,7 +12,8 @@ fn main() { let world = lamellar::LamellarWorldBuilder::new().build(); let my_pe = world.my_pe(); let num_pes = world.num_pes(); - let array: UnsafeArray = UnsafeArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block).block(); + let array: UnsafeArray = + UnsafeArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block).block(); let data = world.alloc_one_sided_mem_region::(ARRAY_LEN); unsafe { for i in data.as_mut_slice().unwrap() { @@ -29,7 +30,7 @@ fn main() { .block(); } array.barrier(); - let array = array.into_read_only(); + let array = array.into_read_only().block(); world.barrier(); let s = Instant::now(); @@ -96,7 +97,7 @@ fn main() { (sum as f64 / 1048576.0) / cur_t, // throughput of user payload ((sum*(num_pes-1) as u64) as f64 / 1048576.0) / cur_t, cur - old, //total bytes sent including overhead - (cur - old) as f64 / cur_t, //throughput including overhead + (cur - old) as f64 / cur_t, //throughput including overhead (mbs_c -mbs_o )/ cur_t, (cur_t/cnt as f64) * 1_000_000 as f64 , ); diff --git a/examples/kernels/dft_proxy.rs b/examples/kernels/dft_proxy.rs index 2f7d201d..7c3f8a90 100644 --- a/examples/kernels/dft_proxy.rs +++ b/examples/kernels/dft_proxy.rs @@ -636,18 +636,9 @@ fn main() { let global_len = num_pes * array_len; println!("my_pe {:?} num_pes {:?}", my_pe, num_pes); - let partial_sum = world - .alloc_shared_mem_region::(num_pes) - .block() - ; - let partial_spectrum = world - .alloc_shared_mem_region::(array_len) - .block() - ; - let partial_signal = world - .alloc_shared_mem_region::(array_len) - .block() - ; + let partial_sum = world.alloc_shared_mem_region::(num_pes).block(); + let partial_spectrum = world.alloc_shared_mem_region::(array_len).block(); + let partial_signal = world.alloc_shared_mem_region::(array_len).block(); let full_signal = world.alloc_one_sided_mem_region::(global_len); let full_spectrum = world.alloc_one_sided_mem_region::(global_len); let 
magic = world.alloc_one_sided_mem_region::(num_pes); @@ -886,8 +877,8 @@ fn main() { // .for_each(|elem| *elem = 0.0); // full_spectrum_array.wait_all(); // full_spectrum_array.barrier(); - let full_signal_array = full_signal_array.into_read_only(); - let full_spectrum_array = full_spectrum_array.into_atomic(); + let full_signal_array = full_signal_array.into_read_only().block(); + let full_spectrum_array = full_spectrum_array.into_atomic().block(); for _i in 0..num_trials { // let timer = Instant::now(); @@ -915,7 +906,7 @@ fn main() { } ti += 1; - let full_spectrum_array = full_spectrum_array.into_local_lock(); + let full_spectrum_array = full_spectrum_array.into_local_lock().block(); for _i in 0..num_trials { // let timer = Instant::now(); times[ti].push(dft_lamellar_array_opt_3( diff --git a/examples/kernels/parallel_array_gemm.rs b/examples/kernels/parallel_array_gemm.rs index 871f6f7d..282bc258 100644 --- a/examples/kernels/parallel_array_gemm.rs +++ b/examples/kernels/parallel_array_gemm.rs @@ -55,8 +55,8 @@ fn main() { world.wait_all(); world.barrier(); - let a = a.into_read_only(); - let b = b.into_read_only(); + let a = a.into_read_only().block(); + let b = b.into_read_only().block(); let num_gops = ((2 * dim * dim * dim) - dim * dim) as f64 / 1_000_000_000.0; // accurate for square matrices diff --git a/examples/kernels/parallel_blocked_array_gemm.rs b/examples/kernels/parallel_blocked_array_gemm.rs index 382eee93..50b26b91 100644 --- a/examples/kernels/parallel_blocked_array_gemm.rs +++ b/examples/kernels/parallel_blocked_array_gemm.rs @@ -39,7 +39,7 @@ fn main() { let a = LocalLockArray::::new(&world, m * n, Distribution::Block).block(); //row major let b = LocalLockArray::::new(&world, n * p, Distribution::Block).block(); //col major let c = AtomicArray::::new(&world, m * p, Distribution::Block).block(); //row major - //initialize + //initialize a.dist_iter_mut() .enumerate() .for_each(|(i, x)| *x = i as f32) @@ -59,8 +59,8 @@ fn main() { .block(); c.dist_iter_mut().for_each(|x| x.store(0.0)).block(); world.barrier(); - let a = a.into_read_only(); - let b = b.into_read_only(); + let a = a.into_read_only().block(); + let b = b.into_read_only().block(); let num_gops = ((2 * dim * dim * dim) - dim * dim) as f64 / 1_000_000_000.0; // accurate for square matrices let blocksize = dim / num_pes; @@ -75,7 +75,8 @@ fn main() { // we construct a global array where each pe will contain the sequence (0..n_blks) // we can then call dist_iter() on this array to iterate over the range in parallel on each PE let nblks_array = - LocalLockArray::::new(&world, (n_blks * n_blks) * num_pes, Distribution::Block).block(); + LocalLockArray::::new(&world, (n_blks * n_blks) * num_pes, Distribution::Block) + .block(); nblks_array .dist_iter_mut() @@ -86,7 +87,7 @@ fn main() { x.k = i % n_blks }) .block(); - let nblks_array = nblks_array.into_read_only(); + let nblks_array = nblks_array.into_read_only().block(); let start = std::time::Instant::now(); let a_clone = a.clone(); diff --git a/examples/kernels/safe_parallel_blocked_array_gemm.rs b/examples/kernels/safe_parallel_blocked_array_gemm.rs index 98fe0d0f..070276e3 100644 --- a/examples/kernels/safe_parallel_blocked_array_gemm.rs +++ b/examples/kernels/safe_parallel_blocked_array_gemm.rs @@ -32,7 +32,7 @@ fn main() { let a = LocalLockArray::::new(&world, m * n, Distribution::Block).block(); //row major -- we will change this into a readonly array after initialization let b = LocalLockArray::::new(&world, n * p, Distribution::Block).block(); //col 
major -- we will change this into a readonly array after initialization let c = LocalLockArray::::new(&world, m * p, Distribution::Block).block(); //row major - //initialize + //initialize let a_init = a .dist_iter_mut() .enumerate() @@ -49,8 +49,8 @@ fn main() { }); let c_init = c.dist_iter_mut().for_each(move |x| *x = 0.0); world.block_on_all([a_init, b_init, c_init]); - let a = a.into_read_only(); - let b = b.into_read_only(); + let a = a.into_read_only().block(); + let b = b.into_read_only().block(); world.barrier(); @@ -76,7 +76,8 @@ fn main() { .for_each(move |(i, x)| *x = i % n_blks) .block(); - let m_blks_pe_array = LocalLockArray::new(&world, m_blks_pe * num_pes, Distribution::Block).block(); + let m_blks_pe_array = + LocalLockArray::new(&world, m_blks_pe * num_pes, Distribution::Block).block(); m_blks_pe_array .dist_iter_mut() @@ -84,8 +85,8 @@ fn main() { .for_each(move |(i, x)| *x = i % m_blks_pe) .block(); world.barrier(); - let nblks_array = nblks_array.into_read_only(); - let m_blks_pe_array = m_blks_pe_array.into_read_only(); + let nblks_array = nblks_array.into_read_only().block(); + let m_blks_pe_array = m_blks_pe_array.into_read_only().block(); println!("{blocksize} {m_blks} {m_blks_pe} {n_blks} {p_blks}"); let start = std::time::Instant::now(); diff --git a/examples/kernels/serial_array_gemm.rs b/examples/kernels/serial_array_gemm.rs index 5016596b..029ba264 100644 --- a/examples/kernels/serial_array_gemm.rs +++ b/examples/kernels/serial_array_gemm.rs @@ -26,7 +26,7 @@ fn main() { let a = LocalLockArray::::new(&world, m * n, Distribution::Block).block(); //row major let b = LocalLockArray::::new(&world, n * p, Distribution::Block).block(); //col major let c = AtomicArray::::new(&world, m * p, Distribution::Block).block(); //row major - //initialize matrices + //initialize matrices a.dist_iter_mut() .enumerate() @@ -49,8 +49,8 @@ fn main() { world.barrier(); - let a = a.into_read_only(); - let b = b.into_read_only(); + let a = a.into_read_only().block(); + let b = b.into_read_only().block(); let num_gops = ((2 * dim * dim * dim) - dim * dim) as f64 / 1_000_000_000.0; // accurate for square matrices diff --git a/examples/misc/lamellar_env.rs b/examples/misc/lamellar_env.rs index 7136b171..ef8086f9 100644 --- a/examples/misc/lamellar_env.rs +++ b/examples/misc/lamellar_env.rs @@ -31,13 +31,13 @@ fn main() { print_env(&grw_darc); println!("environment from UnsafeArray"); print_env(&array); - let array = array.into_atomic(); + let array = array.into_atomic().block(); println!("environment from AtomicArray"); print_env(&array); - let array = array.into_local_lock(); + let array = array.into_local_lock().block(); println!("environment from LocalOnlyArray"); print_env(&array); - let array = array.into_global_lock(); + let array = array.into_global_lock().block(); println!("environment from GlobalLockArray"); print_env(&array); if world.my_pe() % 2 == 0 { diff --git a/src/array.rs b/src/array.rs index 745015ec..d5f30a93 100644 --- a/src/array.rs +++ b/src/array.rs @@ -25,7 +25,7 @@ //! # Safety //! Array Data Lifetimes: LamellarArrays are built upon [Darcs][crate::darc::Darc] (Distributed Atomic Reference Counting Pointers) and as such have distributed lifetime management. //! This means that as long as a single reference to an array exists anywhere in the distributed system, the data for the entire array will remain valid on every PE (even though a given PE may have dropped all its local references). -//! 
While the compiler handles lifetimes within the context of a single PE, our distributed lifetime management relies on "garbage collecting active messages" to ensure all remote references have been accounted for. +//! While the compiler handles lifetimes within the context of a single PE, our distributed lifetime management relies on "garbage collecting active messages" to ensure all remote references have been accounted for. //! //! # Multiple array types //! We provide several array types, each with their own saftey gaurantees with respect to how data is accessed (further details can be found in the documentation for each type) @@ -51,11 +51,11 @@ //! let world = LamellarWorldBuilder::new().build(); //! let array = UnsafeArray::::new(&world, 10,Distribution::Block).block(); //! -//! // convert between array types -//! let array = array.into_local_lock(); // LocalLockArray -//! let array = array.into_global_lock(); // GlobalLockArray -//! let array = array.into_atomic(); // AtomicArray -//! let array = array.into_read_only(); // ReadOnlyArray +//! // convert between array types +//! let array = array.into_local_lock().block(); // LocalLockArray +//! let array = array.into_global_lock().block(); // GlobalLockArray +//! let array = array.into_atomic().block(); // AtomicArray +//! let array = array.into_read_only().block(); // ReadOnlyArray //! //! // get a reference to the underlying slice: &[usize] //! let local_data = array.local_data(); @@ -533,7 +533,8 @@ pub trait TeamFrom { // #[async_trait] /// Provides the same abstraction as the `From` trait in the standard language, but with a `team` parameter so that lamellar memory regions can be allocated /// and to be used within an async context -pub trait AsyncTeamFrom: TeamFrom + Sized { +// pub trait AsyncTeamFrom: TeamFrom + Sized { +pub trait AsyncTeamFrom: Sized { /// Converts to this type from the input type fn team_from(val: T, team: &Pin>) -> impl Future + Send; } @@ -1087,7 +1088,6 @@ pub trait LamellarArray: private::LamellarArrayPrivate + ActiveMessa ///``` fn num_elems_local(&self) -> usize; - #[doc(alias("One-sided", "onesided"))] /// Given a global index, calculate the PE and offset on that PE where the element actually resides. /// Returns None if the index is Out of bounds @@ -1441,7 +1441,7 @@ pub trait LamellarArrayPut: LamellarArrayInternalPut { /// let len = buf.len(); /// let _ = array.dist_iter_mut().for_each(move |elem| *elem = len).spawn(); //we will used this val as completion detection /// - /// //Safe as we are this is the only reference to buf + /// //Safe as we are this is the only reference to buf /// unsafe { /// for (i,elem) in buf.as_mut_slice() /// .expect("we just created it so we know its local") @@ -1458,9 +1458,9 @@ pub trait LamellarArrayPut: LamellarArrayInternalPut { /// println!(); /// } /// array.barrier(); //block other PEs until PE0 has finised "putting" the data - /// + /// /// println!("PE{my_pe} array data: {:?}",array.read_local_data().block()); - /// + /// /// ///``` /// Possible output on A 4 PE system (ordering with respect to PEs may change) @@ -1621,7 +1621,7 @@ pub trait ArrayPrint: LamellarArray { /// let index = rand::thread_rng().gen_range(0..array_clone.len()); /// let _ = array_clone.add(index,1).spawn(); //randomly at one to an element in the array. 
/// }).block(); -/// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE +/// let array = array.into_read_only().block(); //only returns once there is a single reference remaining on each PE /// let sum = array.block_on(array.sum()).expect("array len > 0"); // No updates occuring anywhere anymore so we have a deterministic result /// assert_eq!(array.len()*num_pes,sum); ///``` @@ -1670,7 +1670,7 @@ where /// /// # One-sided Operation /// The calling PE is responsible for launching `Reduce` active messages on the other PEs associated with the array. - /// the returned reduction result is only available on the calling PE + /// the returned reduction result is only available on the calling PE /// /// # Examples /// ``` @@ -1685,7 +1685,7 @@ where /// let index = rand::thread_rng().gen_range(0..array_clone.len()); /// let _ = array_clone.add(index,1).spawn(); //randomly at one to an element in the array. /// }).block(); - /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE + /// let array = array.into_read_only().block(); //only returns once there is a single reference remaining on each PE /// let sum = array.block_on(array.reduce("sum")).expect("array len > 0"); // equivalent to calling array.sum() /// assert_eq!(array.len()*num_pes,sum); ///``` @@ -1718,7 +1718,7 @@ where // /// let index = rand::thread_rng().gen_range(0..array_clone.len()); // /// array_clone.add(index,1); //randomly at one to an element in the array. // /// }); -// /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE +// /// let array = array.into_read_only().block(); //only returns once there is a single reference remaining on each PE // /// let sum = array.block_on(array.sum()); // /// assert_eq!(array.len()*num_pes,sum); // ///``` @@ -1771,7 +1771,7 @@ where // /// let num_pes = world.num_pes(); // /// let array = AtomicArray::::new(&world,10,Distribution::Block).block(); // /// let req = array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2)); -// /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE +// /// let array = array.into_read_only().block(); //only returns once there is a single reference remaining on each PE // /// let max = array.block_on(array.max()); // /// assert_eq!((array.len()-1)*2,max); // ///``` @@ -1793,7 +1793,7 @@ where // /// let num_pes = world.num_pes(); // /// let array = AtomicArray::::new(&world,10,Distribution::Block).block(); // /// let req = array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2)); -// /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE +// /// let array = array.into_read_only().block(); //only returns once there is a single reference remaining on each PE // /// let min = array.block_on(array.min()); // /// assert_eq!(0,min); // ///``` @@ -1836,7 +1836,7 @@ where /// let index = rand::thread_rng().gen_range(0..array_clone.len()); /// let _ = array_clone.add(index,1).spawn(); //randomly at one to an element in the array. 
/// }).block(); -/// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE +/// let array = array.into_read_only().block(); //only returns once there is a single reference remaining on each PE /// let sum = array.block_on(array.sum()); /// let my_sum = array.block_on(array.reduce("my_sum")); //pass a &str containing the reduction to use /// assert_eq!(sum,my_sum); diff --git a/src/array/atomic.rs b/src/array/atomic.rs index 02ddd895..9d8c5504 100644 --- a/src/array/atomic.rs +++ b/src/array/atomic.rs @@ -916,7 +916,7 @@ impl AtomicArray { } #[doc(alias("One-sided", "onesided"))] - /// Return the calling PE's local data as an [AtomicLocalData], which allows safe access to local elements. + /// Return the calling PE's local data as an [AtomicLocalData], which allows safe access to local elements. /// /// Because each element is Atomic, this handle to the local data can be used to both read and write individual elements safely. /// @@ -940,7 +940,7 @@ impl AtomicArray { } #[doc(alias("One-sided", "onesided"))] - /// Return the calling PE's local data as an [AtomicLocalData], which allows safe mutable access to local elements. + /// Return the calling PE's local data as an [AtomicLocalData], which allows safe mutable access to local elements. /// /// Because each element is Atomic, this handle to the local data can be used to both read and write individual elements safely. /// @@ -999,7 +999,7 @@ impl AtomicArray { /// let my_pe = world.my_pe(); /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).block(); /// - /// let unsafe_array = array.into_unsafe(); + /// let unsafe_array = array.into_unsafe().block(); ///``` /// /// # Warning @@ -1017,15 +1017,15 @@ impl AtomicArray { /// // but array1 will not be dropped until after 'slice' is dropped. /// // Given the ordering of these calls we will get stuck in "into_unsafe" as it /// // waits for the reference count to go down to "1" (but we will never be able to drop slice/array1). - /// let unsafe_array = array.into_unsafe(); + /// let unsafe_array = array.into_unsafe().block(); /// unsafe_array.print(); /// println!("{:?}",slice.at(0).load()); ///``` - pub fn into_unsafe(self) -> UnsafeArray { + pub fn into_unsafe(self) -> IntoUnsafeArrayHandle { // println!("atomic into_unsafe"); match self { - AtomicArray::NativeAtomicArray(array) => array.into(), - AtomicArray::GenericAtomicArray(array) => array.into(), + AtomicArray::NativeAtomicArray(array) => array.into_unsafe(), + AtomicArray::GenericAtomicArray(array) => array.into_unsafe(), } } // pub fn into_local_only(self) -> LocalOnlyArray { @@ -1054,7 +1054,7 @@ impl AtomicArray { /// let my_pe = world.my_pe(); /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).block(); /// - /// let read_only_array = array.into_read_only(); + /// let read_only_array = array.into_read_only().block(); ///``` /// # Warning /// Because this call blocks there is the possibility for deadlock to occur, as highlighted below: @@ -1071,15 +1071,15 @@ impl AtomicArray { /// // but array1 will not be dropped until after mut_slice is dropped. /// // Given the ordering of these calls we will get stuck in "into_read_only" as it /// // waits for the reference count to go down to "1" (but we will never be able to drop slice/array1). 
- /// let read_only_array = array.into_read_only(); + /// let read_only_array = array.into_read_only().block(); /// read_only_array.print(); /// println!("{:?}",slice.at(0).load()); ///``` - pub fn into_read_only(self) -> ReadOnlyArray { + pub fn into_read_only(self) -> IntoReadOnlyArrayHandle { // println!("atomic into_read_only"); match self { - AtomicArray::NativeAtomicArray(array) => array.array.into(), - AtomicArray::GenericAtomicArray(array) => array.array.into(), + AtomicArray::NativeAtomicArray(array) => array.array.into_read_only(), + AtomicArray::GenericAtomicArray(array) => array.array.into_read_only(), } } @@ -1101,7 +1101,7 @@ impl AtomicArray { /// let my_pe = world.my_pe(); /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).block(); /// - /// let local_lock_array = array.into_local_lock(); + /// let local_lock_array = array.into_local_lock().block(); ///``` /// # Warning /// Because this call blocks there is the possibility for deadlock to occur, as highlighted below: @@ -1118,15 +1118,15 @@ impl AtomicArray { /// // but array1 will not be dropped until after mut_slice is dropped. /// // Given the ordering of these calls we will get stuck in "into_local_lock" as it /// // waits for the reference count to go down to "1" (but we will never be able to drop slice/array1). - /// let local_lock_array = array.into_local_lock(); + /// let local_lock_array = array.into_local_lock().block(); /// local_lock_array.print(); /// println!("{:?}",slice.at(0).load()); ///``` - pub fn into_local_lock(self) -> LocalLockArray { + pub fn into_local_lock(self) -> IntoLocalLockArrayHandle { // println!("atomic into_local_lock"); match self { - AtomicArray::NativeAtomicArray(array) => array.array.into(), - AtomicArray::GenericAtomicArray(array) => array.array.into(), + AtomicArray::NativeAtomicArray(array) => array.array.into_local_lock(), + AtomicArray::GenericAtomicArray(array) => array.array.into_local_lock(), } } @@ -1148,7 +1148,7 @@ impl AtomicArray { /// let my_pe = world.my_pe(); /// let array: AtomicArray = AtomicArray::new(&world,100,Distribution::Cyclic).block(); /// - /// let global_lock_array = array.into_global_lock(); + /// let global_lock_array = array.into_global_lock().block(); ///``` /// # Warning /// Because this call blocks there is the possibility for deadlock to occur, as highlighted below: @@ -1165,27 +1165,27 @@ impl AtomicArray { /// // but array1 will not be dropped until after mut_slice is dropped. /// // Given the ordering of these calls we will get stuck in "into_global_lock" as it /// // waits for the reference count to go down to "1" (but we will never be able to drop slice/array1). 
- /// let global_lock_array = array.into_global_lock(); + /// let global_lock_array = array.into_global_lock().block(); /// global_lock_array.print(); /// println!("{:?}",slice.at(0).load()); ///``` - pub fn into_global_lock(self) -> GlobalLockArray { + pub fn into_global_lock(self) -> IntoGlobalLockArrayHandle { // println!("atomic into_global_lock"); match self { - AtomicArray::NativeAtomicArray(array) => array.array.into(), - AtomicArray::GenericAtomicArray(array) => array.array.into(), + AtomicArray::NativeAtomicArray(array) => array.array.into_global_lock(), + AtomicArray::GenericAtomicArray(array) => array.array.into_global_lock(), } } } -impl TeamFrom<(Vec, Distribution)> for AtomicArray { - fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { - let (vals, distribution) = input; - let input = (&vals, distribution); - let array: UnsafeArray = TeamInto::team_into(input, team); - array.into() - } -} +// impl TeamFrom<(Vec, Distribution)> for AtomicArray { +// fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { +// let (vals, distribution) = input; +// let input = (&vals, distribution); +// let array: UnsafeArray = TeamInto::team_into(input, team); +// array.into() +// } +// } // #[async_trait] impl AsyncTeamFrom<(Vec, Distribution)> for AtomicArray { @@ -1195,16 +1195,16 @@ impl AsyncTeamFrom<(Vec, Distribution)> for AtomicArray From> for AtomicArray { - fn from(array: UnsafeArray) -> Self { - // println!("Converting from UnsafeArray to AtomicArray"); - if NATIVE_ATOMICS.contains(&TypeId::of::()) { - NativeAtomicArray::from(array).into() - } else { - GenericAtomicArray::from(array).into() - } - } -} +// impl From> for AtomicArray { +// fn from(array: UnsafeArray) -> Self { +// // println!("Converting from UnsafeArray to AtomicArray"); +// if NATIVE_ATOMICS.contains(&TypeId::of::()) { +// NativeAtomicArray::from(array).into() +// } else { +// GenericAtomicArray::from(array).into() +// } +// } +// } #[async_trait] impl AsyncFrom> for AtomicArray { @@ -1225,25 +1225,25 @@ impl AsyncFrom> for AtomicArray { // } // } -impl From> for AtomicArray { - fn from(array: ReadOnlyArray) -> Self { - // println!("Converting from ReadOnlyArray to AtomicArray"); - unsafe { array.into_inner().into() } - } -} -impl From> for AtomicArray { - fn from(array: LocalLockArray) -> Self { - // println!("Converting from LocalLockArray to AtomicArray"); - unsafe { array.into_inner().into() } - } -} +// impl From> for AtomicArray { +// fn from(array: ReadOnlyArray) -> Self { +// // println!("Converting from ReadOnlyArray to AtomicArray"); +// unsafe { array.into_inner().into() } +// } +// } +// impl From> for AtomicArray { +// fn from(array: LocalLockArray) -> Self { +// // println!("Converting from LocalLockArray to AtomicArray"); +// unsafe { array.into_inner().into() } +// } +// } -impl From> for AtomicArray { - fn from(array: GlobalLockArray) -> Self { - // println!("Converting from GlobalLockArray to AtomicArray"); - unsafe { array.into_inner().into() } - } -} +// impl From> for AtomicArray { +// fn from(array: GlobalLockArray) -> Self { +// // println!("Converting from GlobalLockArray to AtomicArray"); +// unsafe { array.into_inner().into() } +// } +// } impl From> for AtomicByteArray { fn from(array: AtomicArray) -> Self { @@ -1291,7 +1291,7 @@ impl AtomicArray { /// /// # One-sided Operation /// The calling PE is responsible for launching `Reduce` active messages on the other PEs associated with the array. 
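For reference, a minimal sketch of driving the handle-based conversions introduced above; the element type `usize` and the particular chain of conversions are illustrative assumptions, but each `into_*` call and the `.block()` / `.await` forms are exactly those shown in this patch:

use lamellar::array::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();
    let array = AtomicArray::<usize>::new(&world, 100, Distribution::Block).block();

    // drive the collective conversion by blocking in a non-async context
    let local_lock = array.into_local_lock().block();

    // or await a conversion handle inside an async context
    let _global_lock = world.block_on(async move { local_lock.into_global_lock().await });
}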
- /// the returned reduction result is only available on the calling PE + /// the returned reduction result is only available on the calling PE /// /// # Safety /// One thing to consider is that due to being a one sided reduction, safety is only gauranteed with respect to Atomicity of individual elements, diff --git a/src/array/atomic/iteration.rs b/src/array/atomic/iteration.rs index 73c2a231..e32f1052 100644 --- a/src/array/atomic/iteration.rs +++ b/src/array/atomic/iteration.rs @@ -208,12 +208,12 @@ impl LamellarArrayIterators for AtomicArray { } fn onesided_iter(&self) -> Self::OnesidedIter { - OneSidedIter::new(self.clone().into(), LamellarArray::team_rt(self).clone(), 1) + OneSidedIter::new(self.clone(), LamellarArray::team_rt(self).clone(), 1) } fn buffered_onesided_iter(&self, buf_size: usize) -> Self::OnesidedIter { OneSidedIter::new( - self.clone().into(), + self.clone(), LamellarArray::team_rt(self).clone(), std::cmp::min(buf_size, self.len()), ) diff --git a/src/array/generic_atomic.rs b/src/array/generic_atomic.rs index 247b951d..96adb94f 100644 --- a/src/array/generic_atomic.rs +++ b/src/array/generic_atomic.rs @@ -540,27 +540,32 @@ impl GenericAtomicArray { // } //#[doc(hidden)] - pub fn into_unsafe(self) -> UnsafeArray { + pub fn into_unsafe(self) -> IntoUnsafeArrayHandle { // println!("generic into_unsafe"); - self.array.into() + // self.array.into() + IntoUnsafeArrayHandle { + team: self.array.inner.data.team.clone(), + launched: false, + outstanding_future: Box::pin(self.async_into()), + } } //#[doc(hidden)] - pub fn into_read_only(self) -> ReadOnlyArray { + pub fn into_read_only(self) -> IntoReadOnlyArrayHandle { // println!("generic into_read_only"); - self.array.into() + self.array.into_read_only() } //#[doc(hidden)] - pub fn into_local_lock(self) -> LocalLockArray { + pub fn into_local_lock(self) -> IntoLocalLockArrayHandle { // println!("generic into_local_lock"); - self.array.into() + self.array.into_local_lock() } //#[doc(hidden)] - pub fn into_global_lock(self) -> GlobalLockArray { + pub fn into_global_lock(self) -> IntoGlobalLockArrayHandle { // println!("generic into_local_lock"); - self.array.into() + self.array.into_global_lock() } //#[doc(hidden)] @@ -588,9 +593,9 @@ impl GenericAtomicArray { impl GenericAtomicArray { #[doc(hidden)] - pub fn into_atomic(self) -> GenericAtomicArray { + pub fn into_atomic(self) -> IntoAtomicArrayHandle { // println!("generic into_atomic"); - self.array.into() + self.array.into_atomic() } } @@ -704,7 +709,7 @@ impl From for AtomicArray { impl private::ArrayExecAm for GenericAtomicArray { fn team(&self) -> Pin> { - self.array.team_rt().clone() + self.array.team_rt() } fn team_counters(&self) -> Arc { self.array.team_counters() @@ -791,7 +796,7 @@ impl ActiveMessaging for GenericAtomicArray { impl LamellarArray for GenericAtomicArray { fn team_rt(&self) -> Pin> { - self.array.team_rt().clone() + self.array.team_rt() } // fn my_pe(&self) -> usize { // LamellarArray::my_pe(&self.array) diff --git a/src/array/generic_atomic/iteration.rs b/src/array/generic_atomic/iteration.rs index 4b777940..a59f86d0 100644 --- a/src/array/generic_atomic/iteration.rs +++ b/src/array/generic_atomic/iteration.rs @@ -191,13 +191,13 @@ impl LamellarArrayIterators for GenericAtomicArray { } fn onesided_iter(&self) -> Self::OnesidedIter { - OneSidedIter::new(self.clone().into(), self.array.team_rt().clone(), 1) + OneSidedIter::new(self.clone(), self.array.team_rt(), 1) } fn buffered_onesided_iter(&self, buf_size: usize) -> Self::OnesidedIter { 
OneSidedIter::new( - self.clone().into(), - self.array.team_rt().clone(), + self.clone(), + self.array.team_rt(), std::cmp::min(buf_size, self.len()), ) } diff --git a/src/array/global_lock_atomic.rs b/src/array/global_lock_atomic.rs index d03208de..dbd0058e 100644 --- a/src/array/global_lock_atomic.rs +++ b/src/array/global_lock_atomic.rs @@ -424,7 +424,7 @@ impl GlobalLockArray { } #[doc(alias("One-sided", "onesided"))] - /// Return a handle for accessing the calling PE's local data as a [GlobalLockLocalData], which allows safe immutable access to local elements. + /// Return a handle for accessing the calling PE's local data as a [GlobalLockLocalData], which allows safe immutable access to local elements. /// /// The returned handle must be await'd `.read_local_data().await` within an async context or /// it must be blocked on `.read_local_data().block()` in a non async context to actually acquire the lock @@ -458,7 +458,7 @@ impl GlobalLockArray { } #[doc(alias("One-sided", "onesided"))] - /// Return a handle for accessing the calling PE's local data as a [GlobalLockMutLocalData], which allows safe mutable access to local elements. + /// Return a handle for accessing the calling PE's local data as a [GlobalLockMutLocalData], which allows safe mutable access to local elements. /// /// The returned handle must be await'd `.write_local_data().await` within an async context or /// it must be blocked on `.write_local_data().block()` in a non async context to actually acquire the lock @@ -491,7 +491,7 @@ impl GlobalLockArray { } #[doc(alias("Collective"))] - /// Return a handle for accessing the calling PE's local data as a [GlobalLockMutLocalData], which allows safe mutable access to local elements. + /// Return a handle for accessing the calling PE's local data as a [GlobalLockMutLocalData], which allows safe mutable access to local elements. /// All PEs associated with the array must call this function in order to access their own local data simultaneously /// /// The returned handle must be await'd `.collective_write_local_data().await` within an async context or @@ -553,7 +553,7 @@ impl GlobalLockArray { /// let my_pe = world.my_pe(); /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic).block(); /// - /// let unsafe_array = array.into_unsafe(); + /// let unsafe_array = array.into_unsafe().block(); ///``` /// /// # Warning @@ -571,12 +571,16 @@ impl GlobalLockArray { /// // but array1 will not be dropped until after 'slice' is dropped. /// // Given the ordering of these calls we will get stuck in "into_unsafe" as it /// // waits for the reference count to go down to "1" (but we will never be able to drop slice/array1). 
- /// let unsafe_array = array.into_unsafe(); + /// let unsafe_array = array.into_unsafe().block(); /// unsafe_array.print(); /// println!("{slice:?}"); - pub fn into_unsafe(self) -> UnsafeArray { + pub fn into_unsafe(self) -> IntoUnsafeArrayHandle { // println!("GlobalLock into_unsafe"); - self.array.into() + IntoUnsafeArrayHandle { + team: self.array.inner.data.team.clone(), + launched: false, + outstanding_future: Box::pin(self.async_into()), + } } // pub fn into_local_only(self) -> LocalOnlyArray { @@ -602,7 +606,7 @@ impl GlobalLockArray { /// let my_pe = world.my_pe(); /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic).block(); /// - /// let read_only_array = array.into_read_only(); + /// let read_only_array = array.into_read_only().block(); ///``` /// # Warning /// Because this call blocks there is the possibility for deadlock to occur, as highlighted below: @@ -619,13 +623,13 @@ impl GlobalLockArray { /// // but array1 will not be dropped until after mut_slice is dropped. /// // Given the ordering of these calls we will get stuck in "into_read_only" as it /// // waits for the reference count to go down to "1" (but we will never be able to drop slice/array1). - /// let read_only_array = array.into_read_only(); + /// let read_only_array = array.into_read_only().block(); /// read_only_array.print(); /// println!("{slice:?}"); ///``` - pub fn into_read_only(self) -> ReadOnlyArray { + pub fn into_read_only(self) -> IntoReadOnlyArrayHandle { // println!("GlobalLock into_read_only"); - self.array.into() + self.array.into_read_only() } #[doc(alias = "Collective")] @@ -646,7 +650,7 @@ impl GlobalLockArray { /// let my_pe = world.my_pe(); /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic).block(); /// - /// let read_only_array = array.into_read_only(); + /// let read_only_array = array.into_read_only().block(); ///``` /// # Warning /// Because this call blocks there is the possibility for deadlock to occur, as highlighted below: @@ -663,13 +667,13 @@ impl GlobalLockArray { /// // but array1 will not be dropped until after mut_slice is dropped. /// // Given the ordering of these calls we will get stuck in "into_read_only" as it /// // waits for the reference count to go down to "1" (but we will never be able to drop slice/array1). - /// let read_only_array = array.into_read_only(); + /// let read_only_array = array.into_read_only().block(); /// read_only_array.print(); /// println!("{slice:?}"); ///``` - pub fn into_local_lock(self) -> GlobalLockArray { + pub fn into_local_lock(self) -> IntoLocalLockArrayHandle { // println!("GlobalLock into_read_only"); - self.array.into() + self.array.into_local_lock() } } @@ -692,7 +696,7 @@ impl GlobalLockArray { /// let my_pe = world.my_pe(); /// let array: GlobalLockArray = GlobalLockArray::new(&world,100,Distribution::Cyclic).block(); /// - /// let atomic_array = array.into_atomic(); + /// let atomic_array = array.into_atomic().block(); ///``` /// # Warning /// Because this call blocks there is the possibility for deadlock to occur, as highlighted below: @@ -709,24 +713,24 @@ impl GlobalLockArray { /// // but array1 will not be dropped until after mut_slice is dropped. /// // Given the ordering of these calls we will get stuck in "into_atomic" as it /// // waits for the reference count to go down to "1" (but we will never be able to drop slice/array1). 
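A short sketch of the blocking and async forms of the local-data handles described above; it assumes, as in the crate's own examples, that the returned guards deref to slices of the local data:

use lamellar::array::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();
    let my_pe = world.my_pe();
    let array: GlobalLockArray<usize> =
        GlobalLockArray::new(&world, 100, Distribution::Block).block();

    // acquire the global write lock for this PE's local data in a non-async context
    let mut local_data = array.write_local_data().block();
    for elem in local_data.iter_mut() {
        *elem = my_pe;
    }
    drop(local_data); // release the lock so other PEs can proceed
    world.barrier();

    // the same handles can be awaited from an async context
    world.block_on(async move {
        let local_data = array.read_local_data().await;
        println!("PE{my_pe} first local element: {}", local_data[0]);
    });
}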
- /// let atomic_array = array.into_atomic(); + /// let atomic_array = array.into_atomic().block(); /// atomic_array.print(); /// println!("{slice:?}"); ///``` - pub fn into_atomic(self) -> AtomicArray { + pub fn into_atomic(self) -> IntoAtomicArrayHandle { // println!("GlobalLock into_atomic"); - self.array.into() + self.array.into_atomic() } } -impl TeamFrom<(Vec, Distribution)> for GlobalLockArray { - fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { - let (vals, distribution) = input; - let input = (&vals, distribution); - let array: UnsafeArray = TeamInto::team_into(input, team); - array.into() - } -} +// impl TeamFrom<(Vec, Distribution)> for GlobalLockArray { +// fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { +// let (vals, distribution) = input; +// let input = (&vals, distribution); +// let array: UnsafeArray = TeamInto::team_into(input, team); +// array.into() +// } +// } // #[async_trait] impl AsyncTeamFrom<(Vec, Distribution)> for GlobalLockArray { @@ -736,19 +740,6 @@ impl AsyncTeamFrom<(Vec, Distribution)> for GlobalLockArr } } -impl From> for GlobalLockArray { - fn from(array: UnsafeArray) -> Self { - // println!("GlobalLock from unsafe"); - array.block_on_outstanding(DarcMode::GlobalLockArray); - let lock = GlobalRwDarc::new(array.team_rt(), ()).block().unwrap(); - - GlobalLockArray { - lock: lock, - array: array, - } - } -} - #[async_trait] impl AsyncFrom> for GlobalLockArray { async fn async_from(array: UnsafeArray) -> Self { @@ -770,26 +761,26 @@ impl AsyncFrom> for GlobalLockArray { // } // } -impl From> for GlobalLockArray { - fn from(array: AtomicArray) -> Self { - // println!("GlobalLock from atomic"); - unsafe { array.into_inner().into() } - } -} +// impl From> for GlobalLockArray { +// fn from(array: AtomicArray) -> Self { +// // println!("GlobalLock from atomic"); +// unsafe { array.into_inner().into() } +// } +// } -impl From> for GlobalLockArray { - fn from(array: ReadOnlyArray) -> Self { - // println!("GlobalLock from readonly"); - unsafe { array.into_inner().into() } - } -} +// impl From> for GlobalLockArray { +// fn from(array: ReadOnlyArray) -> Self { +// // println!("GlobalLock from readonly"); +// unsafe { array.into_inner().into() } +// } +// } -impl From> for GlobalLockArray { - fn from(array: LocalLockArray) -> Self { - // println!("GlobalLock from LocalLockArray"); - unsafe { array.into_inner().into() } - } -} +// impl From> for GlobalLockArray { +// fn from(array: LocalLockArray) -> Self { +// // println!("GlobalLock from LocalLockArray"); +// unsafe { array.into_inner().into() } +// } +// } impl From> for GlobalLockByteArray { fn from(array: GlobalLockArray) -> Self { @@ -829,7 +820,7 @@ impl From for GlobalLockArray { impl private::ArrayExecAm for GlobalLockArray { fn team(&self) -> Pin> { - self.array.team_rt().clone() + self.array.team_rt() } fn team_counters(&self) -> Arc { self.array.team_counters() @@ -916,7 +907,7 @@ impl ActiveMessaging for GlobalLockArray { impl LamellarArray for GlobalLockArray { fn team_rt(&self) -> Pin> { - self.array.team_rt().clone() + self.array.team_rt() } // fn my_pe(&self) -> usize { // LamellarArray::my_pe(&self.array) @@ -1087,7 +1078,7 @@ impl GlobalLockReadGuard { /// /// # One-sided Operation /// The calling PE is responsible for launching `Reduce` active messages on the other PEs associated with the array. 
- /// the returned reduction result is only available on the calling PE + /// the returned reduction result is only available on the calling PE /// /// # Safety /// the global read lock ensures atomicity of the entire array, i.e. individual elements can not being modified before the call completes diff --git a/src/array/global_lock_atomic/iteration.rs b/src/array/global_lock_atomic/iteration.rs index b91669b9..1f458691 100644 --- a/src/array/global_lock_atomic/iteration.rs +++ b/src/array/global_lock_atomic/iteration.rs @@ -424,13 +424,13 @@ impl LamellarArrayIterators for GlobalLockReadGuard { } fn onesided_iter(&self) -> Self::OnesidedIter { - OneSidedIter::new(self.array.clone().into(), self.array.team_rt().clone(), 1) + OneSidedIter::new(self.array.clone(), self.array.team_rt(), 1) } fn buffered_onesided_iter(&self, buf_size: usize) -> Self::OnesidedIter { OneSidedIter::new( - self.array.clone().into(), - self.array.team_rt().clone(), + self.array.clone(), + self.array.team_rt(), std::cmp::min(buf_size, self.array.len()), ) } @@ -462,13 +462,13 @@ impl LamellarArrayIterators for GlobalLockArray { } fn onesided_iter(&self) -> Self::OnesidedIter { - OneSidedIter::new(self.array.clone().into(), self.array.team_rt().clone(), 1) + OneSidedIter::new(self.clone(), self.array.team_rt(), 1) } fn buffered_onesided_iter(&self, buf_size: usize) -> Self::OnesidedIter { OneSidedIter::new( - self.array.clone().into(), - self.array.team_rt().clone(), + self.clone(), + self.array.team_rt(), std::cmp::min(buf_size, self.array.len()), ) } diff --git a/src/array/handle.rs b/src/array/handle.rs index cdfc9e74..0793ee17 100644 --- a/src/array/handle.rs +++ b/src/array/handle.rs @@ -2,6 +2,7 @@ use std::{ collections::VecDeque, future::Future, pin::Pin, + sync::Arc, task::{Context, Poll, Waker}, }; @@ -13,9 +14,11 @@ use crate::{ lamellar_request::LamellarRequest, scheduler::LamellarTask, warnings::RuntimeWarning, - Dist, OneSidedMemoryRegion, RegisteredMemoryRegion, + Dist, LamellarTeamRT, OneSidedMemoryRegion, RegisteredMemoryRegion, }; +use super::{AtomicArray, GlobalLockArray, LocalLockArray, ReadOnlyArray, UnsafeArray}; + /// a task handle for an array rdma (put/get) operation pub struct ArrayRdmaHandle { pub(crate) array: LamellarByteArray, //prevents prematurely performing a local drop @@ -200,3 +203,493 @@ impl Future for ArrayRdmaAtHandle { Poll::Ready(unsafe { this.buf.as_slice().expect("Data should exist on PE")[0] }) } } + +#[must_use = " Array 'into' handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] +#[pin_project(PinnedDrop)] +#[doc(alias = "Collective")] +/// This is a handle representing the operation of changing from a some other LamellarArray type into an [UnsafeArray]. +/// This handled must either be awaited in an async context or blocked on in a non-async context for the operation to be performed. +/// Awaiting/blocking on the handle is a blocking collective call amongst all PEs in the Arrays's team, only returning once every PE in the team has completed the call. +/// +/// Furthermore, the handle will not return while any additional references outside of the one making this call exist on each PE. It is not possible for the +/// pointed to object to respresented as more than one array type simultaneously (on any PE). +/// +/// # Collective Operation +/// Requires all PEs associated with the `darc` to await/block the handle otherwise deadlock will occur (i.e. 
team barriers are being called internally)
+///
+/// # Examples
+/// ```
+/// use lamellar::array::prelude::*;
+///
+/// let world = LamellarWorldBuilder::new().build();
+///
+/// let array = AtomicArray::::new(&world,100, Distribution::Block).block();
+/// let unsafe_array = array.into_unsafe().block();
+/// /* alternatively something like the following is valid as well
+/// let unsafe_array = world.block_on(async move{
+///     array.into_unsafe().await
+/// })
+/// */
+/// ```
+pub struct IntoUnsafeArrayHandle {
+    pub(crate) team: Pin>,
+    pub(crate) launched: bool,
+    #[pin]
+    pub(crate) outstanding_future: Pin> + Send>>,
+}
+
+#[pinned_drop]
+impl PinnedDrop for IntoUnsafeArrayHandle {
+    fn drop(self: Pin<&mut Self>) {
+        if !self.launched {
+            RuntimeWarning::DroppedHandle("a IntoUnsafeArrayHandle").print();
+        }
+    }
+}
+
+impl IntoUnsafeArrayHandle {
+    /// Used to drive the conversion of another LamellarArray into an [UnsafeArray]
+    ///
+    /// # Examples
+    /// ```
+    /// use lamellar::array::prelude::*;
+    ///
+    /// let world = LamellarWorldBuilder::new().build();
+    ///
+    /// let array = AtomicArray::::new(&world,100, Distribution::Block).block();
+    /// let unsafe_array = array.into_unsafe().block();
+    /// ```
+    pub fn block(mut self) -> UnsafeArray {
+        self.launched = true;
+        RuntimeWarning::BlockingCall(
+            "IntoUnsafeArrayHandle::block",
+            ".spawn() or .await",
+        )
+        .print();
+        self.team.clone().block_on(self)
+    }
+
+    /// This method will spawn the conversion into the UnsafeArray on the work queue.
+    ///
+    /// This function returns a handle that can be used to wait for the operation to complete.
+    ///
+    /// # Examples
+    /// ```
+    /// use lamellar::array::prelude::*;
+    ///
+    /// let world = LamellarWorldBuilder::new().build();
+    /// let array = AtomicArray::::new(&world,100,Distribution::Block).block();
+    /// let unsafe_array_task = array.into_unsafe().spawn();
+    /// let unsafe_array = unsafe_array_task.block();
+    /// ```
+    #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"]
+    pub fn spawn(mut self) -> LamellarTask> {
+        self.launched = true;
+        self.team.clone().spawn(self)
+    }
+}
+
+impl Future for IntoUnsafeArrayHandle {
+    type Output = UnsafeArray;
+    fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll {
+        self.launched = true;
+        let mut this = self.project();
+        match this.outstanding_future.as_mut().poll(cx) {
+            Poll::Pending => {
+                cx.waker().wake_by_ref();
+                Poll::Pending
+            }
+            Poll::Ready(array) => Poll::Ready(array),
+        }
+    }
+}
+
+#[must_use = " Array 'into' handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"]
+#[pin_project(PinnedDrop)]
+#[doc(alias = "Collective")]
+/// This is a handle representing the operation of changing from some other LamellarArray type into an [AtomicArray].
+/// This handle must either be awaited in an async context or blocked on in a non-async context for the operation to be performed.
+/// Awaiting/blocking on the handle is a blocking collective call amongst all PEs in the Array's team, only returning once every PE in the team has completed the call.
+///
+/// Furthermore, the handle will not return while any additional references outside of the one making this call exist on each PE. It is not possible for the
+/// pointed to object to be represented as more than one array type simultaneously (on any PE).
+/// +/// # Collective Operation +/// Requires all PEs associated with the `darc` to await/block the handle otherwise deadlock will occur (i.e. team barriers are being called internally) +/// +/// # Examples +/// ``` +/// use lamellar::array::prelude::*; +/// +/// let world = LamellarWorldBuilder::new().build(); +/// +/// let array = GlobalLockArray::::new(&world,100, Distribution::Block).block(); +/// let atomic_array = array.into_atomic().block(); +/// /* alternatively something like the following is valid as well +/// let atomic_array = world.block_on(async move{ +/// array.into_unsafe().await +/// }) +/// */ +/// ``` +pub struct IntoAtomicArrayHandle { + pub(crate) team: Pin>, + pub(crate) launched: bool, + #[pin] + pub(crate) outstanding_future: Pin> + Send>>, +} + +#[pinned_drop] +impl PinnedDrop for IntoAtomicArrayHandle { + fn drop(self: Pin<&mut Self>) { + if !self.launched { + RuntimeWarning::DroppedHandle("a IntoAtomicArrayHandle").print(); + } + } +} + +impl IntoAtomicArrayHandle { + /// Used to drive the cconversion of another LamellarArray into an [AtomicArray] + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// + /// let array = GlobalLockArray::::new(&world,100, Distribution::Block).block(); + /// let atomic_array = array.into_atomic().block(); + pub fn block(mut self) -> AtomicArray { + self.launched = true; + RuntimeWarning::BlockingCall( + "IntoAtomicArrayHandle::block", + ".spawn() or .await", + ) + .print(); + self.team.clone().block_on(self) + } + + /// This method will spawn the conversion into the AtomicArray on the work queue. + /// + /// This function returns a handle that can be used to wait for the operation to complete + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = GlobalLockArray::::new(&world,100, Distribution::Block).block(); + /// let atomic_array_task = array.into_atomic().spawn(); + /// let atomic_array = atomic_array_task.block(); + /// ``` + #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] + pub fn spawn(mut self) -> LamellarTask> { + self.launched = true; + self.team.clone().spawn(self) + } +} + +impl Future for IntoAtomicArrayHandle { + type Output = AtomicArray; + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + self.launched = true; + let mut this = self.project(); + match this.outstanding_future.as_mut().poll(cx) { + Poll::Pending => { + cx.waker().wake_by_ref(); + Poll::Pending + } + Poll::Ready(array) => Poll::Ready(array), + } + } +} + +#[must_use = " Array 'into' handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] +#[pin_project(PinnedDrop)] +#[doc(alias = "Collective")] +/// This is a handle representing the operation of changing from a some other LamellarArray type into an [LocalLockArray]. +/// This handled must either be awaited in an async context or blocked on in a non-async context for the operation to be performed. +/// Awaiting/blocking on the handle is a blocking collective call amongst all PEs in the Arrays's team, only returning once every PE in the team has completed the call. 
+/// +/// Furthermore, the handle will not return while any additional references outside of the one making this call exist on each PE. It is not possible for the +/// pointed to object to respresented as more than one array type simultaneously (on any PE). +/// +/// # Collective Operation +/// Requires all PEs associated with the `darc` to await/block the handle otherwise deadlock will occur (i.e. team barriers are being called internally) +/// +/// # Examples +/// ``` +/// use lamellar::array::prelude::*; +/// +/// let world = LamellarWorldBuilder::new().build(); +/// +/// let array = AtomicArray::::new(&world,100, Distribution::Block).block(); +/// let local_lock_array = array.into_local_lock().block(); +/// /* alternatively something like the following is valid as well +/// let local_lock_array = world.block_on(async move{ +/// array.into_unsafe().await +/// }) +/// */ +/// ``` +pub struct IntoLocalLockArrayHandle { + pub(crate) team: Pin>, + pub(crate) launched: bool, + #[pin] + pub(crate) outstanding_future: Pin> + Send>>, +} + +#[pinned_drop] +impl PinnedDrop for IntoLocalLockArrayHandle { + fn drop(self: Pin<&mut Self>) { + if !self.launched { + RuntimeWarning::DroppedHandle("a IntoLocalLockArrayHandle").print(); + } + } +} + +impl IntoLocalLockArrayHandle { + /// Used to drive the cconversion of another LamellarArray into an [LocalLockArray] + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// + /// let array = AtomicArray::::new(&world,100, Distribution::Block).block(); + /// let local_lock_array = array.into_local_lock().block(); + pub fn block(mut self) -> LocalLockArray { + self.launched = true; + RuntimeWarning::BlockingCall( + "IntoLocalLockArrayHandle::block", + ".spawn() or .await", + ) + .print(); + self.team.clone().block_on(self) + } + + /// This method will spawn the conversion into the LocalLockArray on the work queue. + /// + /// This function returns a handle that can be used to wait for the operation to complete + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = AtomicArray::::new(&world,100, Distribution::Block).block(); + /// let local_lock_array_task = array.into_local_lock().spawn(); + /// let local_lock_array = local_lock_array_task.block(); + /// ``` + #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] + pub fn spawn(mut self) -> LamellarTask> { + self.launched = true; + self.team.clone().spawn(self) + } +} + +impl Future for IntoLocalLockArrayHandle { + type Output = LocalLockArray; + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + self.launched = true; + let mut this = self.project(); + match this.outstanding_future.as_mut().poll(cx) { + Poll::Pending => { + cx.waker().wake_by_ref(); + Poll::Pending + } + Poll::Ready(array) => Poll::Ready(array), + } + } +} + +#[must_use = " Array 'into' handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] +#[pin_project(PinnedDrop)] +#[doc(alias = "Collective")] +/// This is a handle representing the operation of changing from a some other LamellarArray type into an [GlobalLockArray]. 
+/// This handled must either be awaited in an async context or blocked on in a non-async context for the operation to be performed. +/// Awaiting/blocking on the handle is a blocking collective call amongst all PEs in the Arrays's team, only returning once every PE in the team has completed the call. +/// +/// Furthermore, the handle will not return while any additional references outside of the one making this call exist on each PE. It is not possible for the +/// pointed to object to respresented as more than one array type simultaneously (on any PE). +/// +/// # Collective Operation +/// Requires all PEs associated with the `darc` to await/block the handle otherwise deadlock will occur (i.e. team barriers are being called internally) +/// +/// # Examples +/// ``` +/// use lamellar::array::prelude::*; +/// +/// let world = LamellarWorldBuilder::new().build(); +/// +/// let array = AtomicArray::::new(&world,100, Distribution::Block).block(); +/// let global_lock_array = array.into_global_lock().block(); +/// /* alternatively something like the following is valid as well +/// let global_lock_array = world.block_on(async move{ +/// array.into_unsafe().await +/// }) +/// */ +/// ``` +pub struct IntoGlobalLockArrayHandle { + pub(crate) team: Pin>, + pub(crate) launched: bool, + #[pin] + pub(crate) outstanding_future: Pin> + Send>>, +} + +#[pinned_drop] +impl PinnedDrop for IntoGlobalLockArrayHandle { + fn drop(self: Pin<&mut Self>) { + if !self.launched { + RuntimeWarning::DroppedHandle("a IntoGlobalLockArrayHandle").print(); + } + } +} + +impl IntoGlobalLockArrayHandle { + /// Used to drive the cconversion of another LamellarArray into an [GlobalLockArray] + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// + /// let array = AtomicArray::::new(&world,100, Distribution::Block).block(); + /// let global_lock_array = array.into_global_lock().block(); + pub fn block(mut self) -> GlobalLockArray { + self.launched = true; + RuntimeWarning::BlockingCall( + "IntoGlobalLockArrayHandle::block", + ".spawn() or .await", + ) + .print(); + self.team.clone().block_on(self) + } + + /// This method will spawn the conversion into the GlobalLockArray on the work queue. + /// + /// This function returns a handle that can be used to wait for the operation to complete + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = AtomicArray::::new(&world,100, Distribution::Block).block(); + /// let global_lock_array_task = array.into_global_lock().spawn(); + /// let global_lock_array = global_lock_array_task.block(); + /// ``` + #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. 
Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] + pub fn spawn(mut self) -> LamellarTask> { + self.launched = true; + self.team.clone().spawn(self) + } +} + +impl Future for IntoGlobalLockArrayHandle { + type Output = GlobalLockArray; + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + self.launched = true; + let mut this = self.project(); + match this.outstanding_future.as_mut().poll(cx) { + Poll::Pending => { + cx.waker().wake_by_ref(); + Poll::Pending + } + Poll::Ready(array) => Poll::Ready(array), + } + } +} + +#[must_use = " Array 'into' handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] +#[pin_project(PinnedDrop)] +#[doc(alias = "Collective")] +/// This is a handle representing the operation of changing from a some other LamellarArray type into an [ReadOnlyArray]. +/// This handled must either be awaited in an async context or blocked on in a non-async context for the operation to be performed. +/// Awaiting/blocking on the handle is a blocking collective call amongst all PEs in the Arrays's team, only returning once every PE in the team has completed the call. +/// +/// Furthermore, the handle will not return while any additional references outside of the one making this call exist on each PE. It is not possible for the +/// pointed to object to respresented as more than one array type simultaneously (on any PE). +/// +/// # Collective Operation +/// Requires all PEs associated with the `darc` to await/block the handle otherwise deadlock will occur (i.e. team barriers are being called internally) +/// +/// # Examples +/// ``` +/// use lamellar::array::prelude::*; +/// +/// let world = LamellarWorldBuilder::new().build(); +/// +/// let array = AtomicArray::::new(&world,100, Distribution::Block).block(); +/// let read_only_array = array.into_read_only().block(); +/// /* alternatively something like the following is valid as well +/// let read_only_array = world.block_on(async move{ +/// array.into_unsafe().await +/// }) +/// */ +/// ``` +pub struct IntoReadOnlyArrayHandle { + pub(crate) team: Pin>, + pub(crate) launched: bool, + #[pin] + pub(crate) outstanding_future: Pin> + Send>>, +} + +#[pinned_drop] +impl PinnedDrop for IntoReadOnlyArrayHandle { + fn drop(self: Pin<&mut Self>) { + if !self.launched { + RuntimeWarning::DroppedHandle("a IntoReadOnlyArrayHandle").print(); + } + } +} + +impl IntoReadOnlyArrayHandle { + /// Used to drive the cconversion of another LamellarArray into an [ReadOnlyArray] + /// + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// + /// let array = AtomicArray::::new(&world,100, Distribution::Block).block(); + /// let read_only_array = array.into_read_only().block(); + pub fn block(mut self) -> ReadOnlyArray { + self.launched = true; + RuntimeWarning::BlockingCall( + "IntoReadOnlyArrayHandle::block", + ".spawn() or .await", + ) + .print(); + self.team.clone().block_on(self) + } + + /// This method will spawn the conversion into the ReadOnlyArray on the work queue. 
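As a small illustration of that spawning pattern (the interleaved work here is just a placeholder, and the element type `usize` is assumed), the returned task can be blocked on later while the conversion progresses on the work queue:

use lamellar::array::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();
    let array = AtomicArray::<usize>::new(&world, 100, Distribution::Block).block();

    // spawn the collective conversion onto the work queue
    let read_only_task = array.into_read_only().spawn();

    // ...purely local work can be done here before blocking for the result...
    let local_sum: usize = (0..100).sum();

    // block on the returned task when the ReadOnlyArray is actually needed
    let read_only_array = read_only_task.block();
    println!("local sum {local_sum}, array len {}", read_only_array.len());
}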
+ /// + /// This function returns a handle that can be used to wait for the operation to complete + /// # Examples + /// ``` + /// use lamellar::array::prelude::*; + /// + /// let world = LamellarWorldBuilder::new().build(); + /// let array = AtomicArray::::new(&world,100, Distribution::Block).block(); + /// let read_only_array_task = array.into_read_only().spawn(); + /// let read_only_array = read_only_array_task.block(); + /// ``` + #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] + pub fn spawn(mut self) -> LamellarTask> { + self.launched = true; + self.team.clone().spawn(self) + } +} + +impl Future for IntoReadOnlyArrayHandle { + type Output = ReadOnlyArray; + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + self.launched = true; + let mut this = self.project(); + match this.outstanding_future.as_mut().poll(cx) { + Poll::Pending => { + cx.waker().wake_by_ref(); + Poll::Pending + } + Poll::Ready(array) => Poll::Ready(array), + } + } +} diff --git a/src/array/iterator/distributed_iterator/consumer/collect.rs b/src/array/iterator/distributed_iterator/consumer/collect.rs index a1766780..12afce3f 100644 --- a/src/array/iterator/distributed_iterator/consumer/collect.rs +++ b/src/array/iterator/distributed_iterator/consumer/collect.rs @@ -4,7 +4,7 @@ use crate::array::iterator::private::*; use crate::array::iterator::{consumer::*, IterLockFuture}; use crate::array::operations::ArrayOps; use crate::array::r#unsafe::private::UnsafeArrayInner; -use crate::array::{AsyncTeamFrom, AsyncTeamInto, Distribution, TeamInto}; +use crate::array::{AsyncTeamFrom, AsyncTeamInto, Distribution}; use crate::barrier::BarrierHandle; use crate::lamellar_request::LamellarRequest; use crate::lamellar_task_group::TaskGroupLocalAmHandle; @@ -193,11 +193,11 @@ impl, Distribution)> + SyncSend + ' array } - fn create_array(&self, local_vals: Vec) -> A { - let input = (local_vals, self.distribution); - let array: A = TeamInto::team_into(input, &self.team); - array - } + // fn create_array(&self, local_vals: Vec) -> A { + // let input = (local_vals, self.distribution); + // let array: A = TeamInto::team_into(input, &self.team); + // array + // } } impl, Distribution)> + SyncSend + 'static> Future diff --git a/src/array/local_lock_atomic.rs b/src/array/local_lock_atomic.rs index 7f418f52..cd236a16 100644 --- a/src/array/local_lock_atomic.rs +++ b/src/array/local_lock_atomic.rs @@ -12,6 +12,7 @@ mod rdma; use crate::array::private::ArrayExecAm; use crate::array::private::LamellarArrayPrivate; use crate::array::r#unsafe::{UnsafeByteArray, UnsafeByteArrayWeak}; +use crate::array::AsyncFrom; use crate::array::*; use crate::barrier::BarrierHandle; use crate::darc::local_rw_darc::LocalRwDarcWriteGuard; @@ -422,7 +423,7 @@ impl LocalLockArray { } #[doc(alias("One-sided", "onesided"))] - /// Return a handle for accessing the calling PE's local data as a [LocalLockLocalData], which allows safe immutable access to local elements. + /// Return a handle for accessing the calling PE's local data as a [LocalLockLocalData], which allows safe immutable access to local elements. 
/// /// The returned handle must be await'd `.read_local_data().await` within an async context or /// it must be blocked on `.read_local_data().block()` in a non async context to actually acquire the lock @@ -455,7 +456,7 @@ impl LocalLockArray { } #[doc(alias("One-sided", "onesided"))] - /// Return a handle for accessing the calling PE's local data as a [LocalLockMutLocalData], which allows safe mutable access to local elements. + /// Return a handle for accessing the calling PE's local data as a [LocalLockMutLocalData], which allows safe mutable access to local elements. /// /// The returned handle must be await'd `.write_local_data().await` within an async context or /// it must be blocked on `.write_local_data().block()` in a non async context to actually acquire the lock @@ -513,7 +514,7 @@ impl LocalLockArray { /// let my_pe = world.my_pe(); /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic).block(); /// - /// let unsafe_array = array.into_unsafe(); + /// let unsafe_array = array.into_unsafe().block(); ///``` /// /// # Warning @@ -531,12 +532,17 @@ impl LocalLockArray { /// // but array1 will not be dropped until after 'slice' is dropped. /// // Given the ordering of these calls we will get stuck in "into_unsafe" as it /// // waits for the reference count to go down to "1" (but we will never be able to drop slice/array1). - /// let unsafe_array = array.into_unsafe(); + /// let unsafe_array = array.into_unsafe().block(); /// unsafe_array.print(); /// println!("{slice:?}"); - pub fn into_unsafe(self) -> UnsafeArray { + pub fn into_unsafe(self) -> IntoUnsafeArrayHandle { // println!("locallock into_unsafe"); - self.array.into() + // self.array.into() + IntoUnsafeArrayHandle { + team: self.array.inner.data.team.clone(), + launched: false, + outstanding_future: Box::pin(self.async_into()), + } } // pub fn into_local_only(self) -> LocalOnlyArray { @@ -562,7 +568,7 @@ impl LocalLockArray { /// let my_pe = world.my_pe(); /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic).block(); /// - /// let read_only_array = array.into_read_only(); + /// let read_only_array = array.into_read_only().block(); ///``` /// # Warning /// Because this call blocks there is the possibility for deadlock to occur, as highlighted below: @@ -579,13 +585,13 @@ impl LocalLockArray { /// // but array1 will not be dropped until after mut_slice is dropped. /// // Given the ordering of these calls we will get stuck in "into_read_only" as it /// // waits for the reference count to go down to "1" (but we will never be able to drop slice/array1). - /// let read_only_array = array.into_read_only(); + /// let read_only_array = array.into_read_only().block(); /// read_only_array.print(); /// println!("{slice:?}"); ///``` - pub fn into_read_only(self) -> ReadOnlyArray { + pub fn into_read_only(self) -> IntoReadOnlyArrayHandle { // println!("locallock into_read_only"); - self.array.into() + self.array.into_read_only() } #[doc(alias = "Collective")] @@ -606,7 +612,7 @@ impl LocalLockArray { /// let my_pe = world.my_pe(); /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic).block(); /// - /// let global_lock_array = array.into_global_lock(); + /// let global_lock_array = array.into_global_lock().block(); ///``` /// # Warning /// Because this call blocks there is the possibility for deadlock to occur, as highlighted below: @@ -623,13 +629,13 @@ impl LocalLockArray { /// // but array1 will not be dropped until after mut_slice is dropped. 
/// // Given the ordering of these calls we will get stuck in "into_global_lock" as it /// // waits for the reference count to go down to "1" (but we will never be able to drop slice/array1). - /// let global_lock_array = array.into_global_lock(); + /// let global_lock_array = array.into_global_lock().block(); /// global_lock_array.print(); /// println!("{slice:?}"); ///``` - pub fn into_global_lock(self) -> GlobalLockArray { + pub fn into_global_lock(self) -> IntoGlobalLockArrayHandle { // println!("readonly into_global_lock"); - self.array.into() + self.array.into_global_lock() } } @@ -652,7 +658,7 @@ impl LocalLockArray { /// let my_pe = world.my_pe(); /// let array: LocalLockArray = LocalLockArray::new(&world,100,Distribution::Cyclic).block(); /// - /// let atomic_array = array.into_atomic(); + /// let atomic_array = array.into_atomic().block(); ///``` /// # Warning /// Because this call blocks there is the possibility for deadlock to occur, as highlighted below: @@ -669,24 +675,30 @@ impl LocalLockArray { /// // but array1 will not be dropped until after mut_slice is dropped. /// // Given the ordering of these calls we will get stuck in "into_atomic" as it /// // waits for the reference count to go down to "1" (but we will never be able to drop slice/array1). - /// let atomic_array = array.into_atomic(); + /// let atomic_array = array.into_atomic().block(); /// atomic_array.print(); /// println!("{slice:?}"); ///``` - pub fn into_atomic(self) -> AtomicArray { + pub fn into_atomic(self) -> IntoAtomicArrayHandle { // println!("locallock into_atomic"); - self.array.into() + self.array.into_atomic() + // IntoAtomicArrayHandle { + // array: self.array.clone(), + // team: self.array.team_rt(), + // launched: false, + // outstanding_future: Box::pin(self.array.async_into()), + // } } } -impl TeamFrom<(Vec, Distribution)> for LocalLockArray { - fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { - let (vals, distribution) = input; - let input = (&vals, distribution); - let array: UnsafeArray = TeamInto::team_into(input, team); - array.into() - } -} +// impl TeamFrom<(Vec, Distribution)> for LocalLockArray { +// fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { +// let (vals, distribution) = input; +// let input = (&vals, distribution); +// let array: UnsafeArray = TeamInto::team_into(input, team); +// array.into() +// } +// } // #[async_trait] impl AsyncTeamFrom<(Vec, Distribution)> for LocalLockArray { @@ -696,19 +708,6 @@ impl AsyncTeamFrom<(Vec, Distribution)> for LocalLockArra } } -impl From> for LocalLockArray { - fn from(array: UnsafeArray) -> Self { - // println!("locallock from unsafe"); - array.block_on_outstanding(DarcMode::LocalLockArray); - let lock = LocalRwDarc::new(array.team_rt(), ()).block().unwrap(); - - LocalLockArray { - lock: lock, - array: array, - } - } -} - #[async_trait] impl AsyncFrom> for LocalLockArray { async fn async_from(array: UnsafeArray) -> Self { @@ -730,26 +729,26 @@ impl AsyncFrom> for LocalLockArray { // } // } -impl From> for LocalLockArray { - fn from(array: AtomicArray) -> Self { - // println!("locallock from atomic"); - unsafe { array.into_inner().into() } - } -} +// impl From> for LocalLockArray { +// fn from(array: AtomicArray) -> Self { +// // println!("locallock from atomic"); +// unsafe { array.into_inner().into() } +// } +// } -impl From> for LocalLockArray { - fn from(array: ReadOnlyArray) -> Self { - // println!("locallock from readonly"); - unsafe { array.into_inner().into() } - } -} +// impl From> for LocalLockArray 
{ +// fn from(array: ReadOnlyArray) -> Self { +// // println!("locallock from readonly"); +// unsafe { array.into_inner().into() } +// } +// } -impl From> for LocalLockArray { - fn from(array: GlobalLockArray) -> Self { - // println!("LocalLockArray from GlobalLockArray"); - unsafe { array.into_inner().into() } - } -} +// impl From> for LocalLockArray { +// fn from(array: GlobalLockArray) -> Self { +// // println!("LocalLockArray from GlobalLockArray"); +// unsafe { array.into_inner().into() } +// } +// } impl From> for LocalLockByteArray { fn from(array: LocalLockArray) -> Self { @@ -789,7 +788,7 @@ impl From for LocalLockArray { impl private::ArrayExecAm for LocalLockArray { fn team(&self) -> Pin> { - self.array.team_rt().clone() + self.array.team_rt() } fn team_counters(&self) -> Arc { self.array.team_counters() @@ -876,7 +875,7 @@ impl ActiveMessaging for LocalLockArray { impl LamellarArray for LocalLockArray { fn team_rt(&self) -> Pin> { - self.array.team_rt().clone() + self.array.team_rt() } // fn my_pe(&self) -> usize { // LamellarArray::my_pe(&self.array) @@ -1042,7 +1041,7 @@ impl LocalLockReadGuard { /// /// # One-sided Operation /// The calling PE is responsible for launching `Reduce` active messages on the other PEs associated with the array. - /// the returned reduction result is only available on the calling PE + /// the returned reduction result is only available on the calling PE /// /// # Safety /// the local read lock ensures atomicity of only the local portion of the array, I.e. elements on a PE wont change while the operation is being executed on that PE diff --git a/src/array/local_lock_atomic/iteration.rs b/src/array/local_lock_atomic/iteration.rs index d99401e6..13f55374 100644 --- a/src/array/local_lock_atomic/iteration.rs +++ b/src/array/local_lock_atomic/iteration.rs @@ -458,13 +458,13 @@ impl LamellarArrayIterators for LocalLockArray { } fn onesided_iter(&self) -> Self::OnesidedIter { - OneSidedIter::new(self.clone().into(), self.array.team_rt().clone(), 1) + OneSidedIter::new(self.clone(), self.array.team_rt(), 1) } fn buffered_onesided_iter(&self, buf_size: usize) -> Self::OnesidedIter { OneSidedIter::new( - self.clone().into(), - self.array.team_rt().clone(), + self.clone(), + self.array.team_rt(), std::cmp::min(buf_size, self.len()), ) } diff --git a/src/array/native_atomic.rs b/src/array/native_atomic.rs index c2ee7c3b..7e035adf 100644 --- a/src/array/native_atomic.rs +++ b/src/array/native_atomic.rs @@ -1001,25 +1001,31 @@ impl NativeAtomicArray { self.array.local_as_mut_slice() } - pub fn into_unsafe(self) -> UnsafeArray { + pub fn into_unsafe(self) -> IntoUnsafeArrayHandle { // println!("native into_unsafe"); - self.array.into() + // self.array.into() + + IntoUnsafeArrayHandle { + team: self.array.inner.data.team.clone(), + launched: false, + outstanding_future: Box::pin(self.async_into()), + } } - pub fn into_read_only(self) -> ReadOnlyArray { + pub fn into_read_only(self) -> IntoReadOnlyArrayHandle { // println!("native into_read_only"); - self.array.into() + self.array.into_read_only() } } -impl TeamFrom<(Vec, Distribution)> for NativeAtomicArray { - fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { - let (vals, distribution) = input; - let input = (&vals, distribution); - let array: UnsafeArray = TeamInto::team_into(input, team); - array.into() - } -} +// impl TeamFrom<(Vec, Distribution)> for NativeAtomicArray { +// fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { +// let (vals, distribution) = input; +// let input = 
(&vals, distribution); +// let array: UnsafeArray = TeamInto::team_into(input, team); +// array.into() +// } +// } // #[async_trait] impl AsyncTeamFrom<(Vec, Distribution)> for NativeAtomicArray { @@ -1030,17 +1036,17 @@ impl AsyncTeamFrom<(Vec, Distribution)> for NativeAtomicA } //#[doc(hidden)] -impl From> for NativeAtomicArray { - fn from(array: UnsafeArray) -> Self { - // println!("native from unsafe"); - array.block_on_outstanding(DarcMode::NativeAtomicArray); - - NativeAtomicArray { - array: array, - orig_t: NativeAtomicType::of::(), - } - } -} +// impl From> for NativeAtomicArray { +// fn from(array: UnsafeArray) -> Self { +// // println!("native from unsafe"); +// array.block_on_outstanding(DarcMode::NativeAtomicArray); + +// NativeAtomicArray { +// array: array, +// orig_t: NativeAtomicType::of::(), +// } +// } +// } //#[doc(hidden)] #[async_trait] @@ -1123,7 +1129,7 @@ impl From for AtomicArray { // //#[doc(hidden)] impl private::ArrayExecAm for NativeAtomicArray { fn team(&self) -> Pin> { - self.array.team_rt().clone() + self.array.team_rt() } fn team_counters(&self) -> Arc { self.array.team_counters() @@ -1212,7 +1218,7 @@ impl ActiveMessaging for NativeAtomicArray { //#[doc(hidden)] impl LamellarArray for NativeAtomicArray { fn team_rt(&self) -> Pin> { - self.array.team_rt().clone() + self.array.team_rt() } // fn my_pe(&self) -> usize { // LamellarArray::my_pe(&self.array) diff --git a/src/array/native_atomic/iteration.rs b/src/array/native_atomic/iteration.rs index 85228558..5a52f1c0 100644 --- a/src/array/native_atomic/iteration.rs +++ b/src/array/native_atomic/iteration.rs @@ -194,13 +194,13 @@ impl LamellarArrayIterators for NativeAtomicArray { } fn onesided_iter(&self) -> Self::OnesidedIter { - OneSidedIter::new(self.clone().into(), self.array.team_rt().clone(), 1) + OneSidedIter::new(self.clone(), self.array.team_rt(), 1) } fn buffered_onesided_iter(&self, buf_size: usize) -> Self::OnesidedIter { OneSidedIter::new( - self.clone().into(), - self.array.team_rt().clone(), + self.clone(), + self.array.team_rt(), std::cmp::min(buf_size, self.len()), ) } diff --git a/src/array/read_only.rs b/src/array/read_only.rs index 25a90c94..b0969e2f 100644 --- a/src/array/read_only.rs +++ b/src/array/read_only.rs @@ -176,7 +176,7 @@ impl ReadOnlyArray { /// let my_pe = world.my_pe(); /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic).block(); /// - /// let unsafe_array = array.into_unsafe(); + /// let unsafe_array = array.into_unsafe().block(); ///``` /// /// # Warning @@ -194,13 +194,17 @@ impl ReadOnlyArray { /// // but array1 will not be dropped until after 'slice' is dropped. /// // Given the ordering of these calls we will get stuck in "into_unsafe" as it /// // waits for the reference count to go down to "1" (but we will never be able to drop slice/array1). 
- /// let unsafe_array = array.into_unsafe(); + /// let unsafe_array = array.into_unsafe().block(); /// unsafe_array.print(); /// println!("{slice:?}"); ///``` - pub fn into_unsafe(self) -> UnsafeArray { + pub fn into_unsafe(self) -> IntoUnsafeArrayHandle { // println!("readonly into_unsafe"); - self.array.into() + IntoUnsafeArrayHandle { + team: self.array.inner.data.team.clone(), + launched: false, + outstanding_future: Box::pin(self.async_into()), + } } // pub fn into_local_only(self) -> LocalOnlyArray { @@ -226,7 +230,7 @@ impl ReadOnlyArray { /// let my_pe = world.my_pe(); /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic).block(); /// - /// let local_lock_array = array.into_local_lock(); + /// let local_lock_array = array.into_local_lock().block(); ///``` /// # Warning /// Because this call blocks there is the possibility for deadlock to occur, as highlighted below: @@ -243,13 +247,13 @@ impl ReadOnlyArray { /// // but array1 will not be dropped until after mut_slice is dropped. /// // Given the ordering of these calls we will get stuck in "into_local_lock" as it /// // waits for the reference count to go down to "1" (but we will never be able to drop slice/array1). - /// let local_lock_array = array.into_local_lock(); + /// let local_lock_array = array.into_local_lock().block(); /// local_lock_array.print(); /// println!("{slice:?}"); ///``` - pub fn into_local_lock(self) -> LocalLockArray { + pub fn into_local_lock(self) -> IntoLocalLockArrayHandle { // println!("readonly into_local_lock"); - self.array.into() + self.array.into_local_lock() } #[doc(alias = "Collective")] @@ -270,7 +274,7 @@ impl ReadOnlyArray { /// let my_pe = world.my_pe(); /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic).block(); /// - /// let global_lock_array = array.into_global_lock(); + /// let global_lock_array = array.into_global_lock().block(); ///``` /// # Warning /// Because this call blocks there is the possibility for deadlock to occur, as highlighted below: @@ -287,13 +291,13 @@ impl ReadOnlyArray { /// // but array1 will not be dropped until after mut_slice is dropped. /// // Given the ordering of these calls we will get stuck in "into_global_lock" as it /// // waits for the reference count to go down to "1" (but we will never be able to drop slice/array1). - /// let global_lock_array = array.into_global_lock(); + /// let global_lock_array = array.into_global_lock().block(); /// global_lock_array.print(); /// println!("{slice:?}"); ///``` - pub fn into_global_lock(self) -> GlobalLockArray { + pub fn into_global_lock(self) -> IntoGlobalLockArrayHandle { // println!("readonly into_global_lock"); - self.array.into() + self.array.into_global_lock() } } @@ -316,7 +320,7 @@ impl ReadOnlyArray { /// let my_pe = world.my_pe(); /// let array: ReadOnlyArray = ReadOnlyArray::new(&world,100,Distribution::Cyclic).block(); /// - /// let atomic_array = array.into_local_lock(); + /// let atomic_array = array.into_local_lock().block(); ///``` /// # Warning /// Because this call blocks there is the possibility for deadlock to occur, as highlighted below: @@ -333,23 +337,23 @@ impl ReadOnlyArray { /// // but array1 will not be dropped until after slice is dropped. /// // Given the ordering of these calls we will get stuck in "into_atomic" as it /// // waits for the reference count to go down to "1" (but we will never be able to drop slice/array1). 
- /// let atomic_array = array.into_local_lock(); + /// let atomic_array = array.into_local_lock().block(); /// atomic_array.print(); /// println!("{slice:?}"); ///``` - pub fn into_atomic(self) -> AtomicArray { - self.array.into() + pub fn into_atomic(self) -> IntoAtomicArrayHandle { + self.array.into_atomic() } } -impl TeamFrom<(Vec, Distribution)> for ReadOnlyArray { - fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { - let (vals, distribution) = input; - let input = (&vals, distribution); - let array: UnsafeArray = TeamInto::team_into(input, team); - array.into() - } -} +// impl TeamFrom<(Vec, Distribution)> for ReadOnlyArray { +// fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { +// let (vals, distribution) = input; +// let input = (&vals, distribution); +// let array: UnsafeArray = TeamInto::team_into(input, team); +// array.into() +// } +// } // #[async_trait] impl AsyncTeamFrom<(Vec, Distribution)> for ReadOnlyArray { @@ -359,21 +363,21 @@ impl AsyncTeamFrom<(Vec, Distribution)> for ReadOnlyArray } } -impl TeamFrom<(&Vec, Distribution)> for ReadOnlyArray { - fn team_from(input: (&Vec, Distribution), team: &Pin>) -> Self { - let array: UnsafeArray = TeamInto::team_into(input, team); - array.into() - } -} +// impl TeamFrom<(&Vec, Distribution)> for ReadOnlyArray { +// fn team_from(input: (&Vec, Distribution), team: &Pin>) -> Self { +// let array: UnsafeArray = TeamInto::team_into(input, team); +// array.into() +// } +// } -impl From> for ReadOnlyArray { - fn from(array: UnsafeArray) -> Self { - // println!("readonly from UnsafeArray"); - array.block_on_outstanding(DarcMode::ReadOnlyArray); +// impl From> for ReadOnlyArray { +// fn from(array: UnsafeArray) -> Self { +// // println!("readonly from UnsafeArray"); +// array.block_on_outstanding(DarcMode::ReadOnlyArray); - ReadOnlyArray { array: array } - } -} +// ReadOnlyArray { array: array } +// } +// } #[async_trait] impl AsyncFrom> for ReadOnlyArray { @@ -392,26 +396,26 @@ impl AsyncFrom> for ReadOnlyArray { // } // } -impl From> for ReadOnlyArray { - fn from(array: AtomicArray) -> Self { - // println!("readonly from AtomicArray"); - unsafe { array.into_inner().into() } - } -} +// impl From> for ReadOnlyArray { +// fn from(array: AtomicArray) -> Self { +// // println!("readonly from AtomicArray"); +// unsafe { array.into_inner().into() } +// } +// } -impl From> for ReadOnlyArray { - fn from(array: LocalLockArray) -> Self { - // println!("readonly from LocalLockArray"); - unsafe { array.into_inner().into() } - } -} +// impl From> for ReadOnlyArray { +// fn from(array: LocalLockArray) -> Self { +// // println!("readonly from LocalLockArray"); +// unsafe { array.into_inner().into() } +// } +// } -impl From> for ReadOnlyArray { - fn from(array: GlobalLockArray) -> Self { - // println!("readonly from GlobalLockArray"); - unsafe { array.into_inner().into() } - } -} +// impl From> for ReadOnlyArray { +// fn from(array: GlobalLockArray) -> Self { +// // println!("readonly from GlobalLockArray"); +// unsafe { array.into_inner().into() } +// } +// } impl From> for ReadOnlyByteArray { fn from(array: ReadOnlyArray) -> Self { @@ -454,7 +458,7 @@ impl ReadOnlyArray { /// /// # One-sided Operation /// The calling PE is responsible for launching `Reduce` active messages on the other PEs associated with the array. 
- /// the returned reduction result is only available on the calling PE + /// the returned reduction result is only available on the calling PE /// # Note /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][AmHandle::spawn] or [blocked on][AmHandle::block] /// # Examples @@ -471,7 +475,7 @@ impl ReadOnlyArray { /// let _ = array_clone.add(index,1).spawn(); //randomly at one to an element in the array. /// }).block(); /// array.wait_all(); - /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE + /// let array = array.into_read_only().block(); //only returns once there is a single reference remaining on each PE /// let sum = array.block_on(array.reduce("sum")).expect("array len > 0"); // equivalent to calling array.sum() /// assert_eq!(array.len()*num_pes,sum); ///``` @@ -504,7 +508,7 @@ impl ReadOnlyArray { /// let _ = array_clone.add(index,1).spawn(); //randomly at one to an element in the array. /// }).block(); /// array.wait_all(); - /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE + /// let array = array.into_read_only().block(); //only returns once there is a single reference remaining on each PE /// let sum = array.block_on(array.sum()).expect("array len > 0"); /// assert_eq!(array.len()*num_pes,sum); /// ``` @@ -533,7 +537,7 @@ impl ReadOnlyArray { /// elem.store(i+1); /// }).block(); /// array.wait_all(); - /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE + /// let array = array.into_read_only().block(); //only returns once there is a single reference remaining on each PE /// let prod = array.block_on(array.prod()).expect("array len > 0"); /// assert_eq!((1..=array.len()).product::(),prod); ///``` @@ -561,7 +565,7 @@ impl ReadOnlyArray { /// let array = AtomicArray::::new(&world,10,Distribution::Block).block(); /// let _ = array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2)).block(); /// array.wait_all(); - /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE + /// let array = array.into_read_only().block(); //only returns once there is a single reference remaining on each PE /// let max = array.block_on(array.max()).expect("array len > 0"); /// assert_eq!((array.len()-1)*2,max); ///``` @@ -588,7 +592,7 @@ impl ReadOnlyArray { /// let array = AtomicArray::::new(&world,10,Distribution::Block).block(); /// let _ = array.dist_iter().enumerate().for_each(move |(i,elem)| elem.store(i*2)).block(); /// array.wait_all(); - /// let array = array.into_read_only(); //only returns once there is a single reference remaining on each PE + /// let array = array.into_read_only().block(); //only returns once there is a single reference remaining on each PE /// let min = array.block_on(array.min()).expect("array len > 0"); /// assert_eq!(0,min); ///``` @@ -600,7 +604,7 @@ impl ReadOnlyArray { impl private::ArrayExecAm for ReadOnlyArray { fn team(&self) -> Pin> { - self.array.team_rt().clone() + self.array.team_rt() } fn team_counters(&self) -> Arc { self.array.team_counters() @@ -687,7 +691,7 @@ impl ActiveMessaging for ReadOnlyArray { impl LamellarArray for ReadOnlyArray { fn team_rt(&self) -> Pin> { - self.array.team_rt().clone() + self.array.team_rt() } // fn my_pe(&self) -> usize { // LamellarArray::my_pe(&self.array) diff --git a/src/array/read_only/iteration.rs b/src/array/read_only/iteration.rs index 
5c83f673..9eee8def 100644 --- a/src/array/read_only/iteration.rs +++ b/src/array/read_only/iteration.rs @@ -27,13 +27,13 @@ impl LamellarArrayIterators for ReadOnlyArray { } fn onesided_iter(&self) -> Self::OnesidedIter { - OneSidedIter::new(self.clone().into(), self.array.team_rt().clone(), 1) + OneSidedIter::new(self.clone(), self.array.team_rt(), 1) } fn buffered_onesided_iter(&self, buf_size: usize) -> Self::OnesidedIter { OneSidedIter::new( - self.clone().into(), - self.array.team_rt().clone(), + self.clone(), + self.array.team_rt(), std::cmp::min(buf_size, self.len()), ) } diff --git a/src/array/unsafe.rs b/src/array/unsafe.rs index f1c42eba..cd0cb4b3 100644 --- a/src/array/unsafe.rs +++ b/src/array/unsafe.rs @@ -526,7 +526,7 @@ impl UnsafeArray { /// let my_pe = world.my_pe(); /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic).block(); /// - /// let read_only_array = array.into_read_only(); + /// let read_only_array = array.into_read_only().block(); ///``` /// /// # Warning @@ -544,13 +544,17 @@ impl UnsafeArray { /// // but array1 will not be dropped until after mut_slice is dropped. /// // Given the ordering of these calls we will get stuck in "into_read_only" as it /// // waits for the reference count to go down to "1" (but we will never be able to drop mut_slice/array1). - /// let ro_array = array.into_read_only(); + /// let ro_array = array.into_read_only().block(); /// ro_array.print(); /// println!("{mut_slice:?}"); ///``` - pub fn into_read_only(self) -> ReadOnlyArray { + pub fn into_read_only(self) -> IntoReadOnlyArrayHandle { // println!("unsafe into read only"); - self.into() + IntoReadOnlyArrayHandle { + team: self.team_rt(), + launched: false, + outstanding_future: Box::pin(self.async_into()), + } } // pub fn into_local_only(self) -> LocalOnlyArray { @@ -576,7 +580,7 @@ impl UnsafeArray { /// let my_pe = world.my_pe(); /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic).block(); /// - /// let local_lock_array = array.into_local_lock(); + /// let local_lock_array = array.into_local_lock().block(); ///``` /// # Warning /// Because this call blocks there is the possibility for deadlock to occur, as highlighted below: @@ -593,13 +597,19 @@ impl UnsafeArray { /// // but array1 will not be dropped until after mut_slice is dropped. /// // Given the ordering of these calls we will get stuck in "iinto_local_lock" as it /// // waits for the reference count to go down to "1" (but we will never be able to drop mut_slice/array1). 
- /// let local_lock_array = array.into_local_lock(); + /// let local_lock_array = array.into_local_lock().block(); /// local_lock_array.print(); /// println!("{mut_slice:?}"); ///``` - pub fn into_local_lock(self) -> LocalLockArray { + pub fn into_local_lock(self) -> IntoLocalLockArrayHandle { // println!("unsafe into local lock atomic"); - self.into() + // self.into() + + IntoLocalLockArrayHandle { + team: self.team_rt(), + launched: false, + outstanding_future: Box::pin(self.async_into()), + } } #[doc(alias = "Collective")] @@ -620,7 +630,7 @@ impl UnsafeArray { /// let my_pe = world.my_pe(); /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic).block(); /// - /// let global_lock_array = array.into_global_lock(); + /// let global_lock_array = array.into_global_lock().block(); ///``` /// # Warning /// Because this call blocks there is the possibility for deadlock to occur, as highlighted below: @@ -637,13 +647,17 @@ impl UnsafeArray { /// // but array1 will not be dropped until after mut_slice is dropped. /// // Given the ordering of these calls we will get stuck in "into_global_lock" as it /// // waits for the reference count to go down to "1" (but we will never be able to drop slice/array1). - /// let global_lock_array = array.into_global_lock(); + /// let global_lock_array = array.into_global_lock().block(); /// global_lock_array.print(); /// println!("{slice:?}"); ///``` - pub fn into_global_lock(self) -> GlobalLockArray { + pub fn into_global_lock(self) -> IntoGlobalLockArrayHandle { // println!("readonly into_global_lock"); - self.into() + IntoGlobalLockArrayHandle { + team: self.team_rt(), + launched: false, + outstanding_future: Box::pin(self.async_into()), + } } pub(crate) fn tasking_barrier(&self) { @@ -674,7 +688,7 @@ impl UnsafeArray { /// let my_pe = world.my_pe(); /// let array: UnsafeArray = UnsafeArray::new(&world,100,Distribution::Cyclic).block(); /// - /// let atomic_array = array.into_local_lock(); + /// let atomic_array = array.into_local_lock().block(); ///``` /// # Warning /// Because this call blocks there is the possibility for deadlock to occur, as highlighted below: @@ -691,13 +705,17 @@ impl UnsafeArray { /// // but array1 will not be dropped until after mut_slice is dropped. /// // Given the ordering of these calls we will get stuck in "into_atomic" as it /// // waits for the reference count to go down to "1" (but we will never be able to drop mut_slice/array1). 
- /// let atomic_array = array.into_local_lock(); + /// let atomic_array = array.into_local_lock().block(); /// atomic_array.print(); /// println!("{mut_slice:?}"); ///``` - pub fn into_atomic(self) -> AtomicArray { + pub fn into_atomic(self) -> IntoAtomicArrayHandle { // println!("unsafe into atomic"); - self.into() + IntoAtomicArrayHandle { + team: self.team_rt(), + launched: false, + outstanding_future: Box::pin(self.async_into()), + } } } @@ -804,46 +822,76 @@ impl TeamFrom<(&Vec, Distribution)> for UnsafeArray { } } -impl From> for UnsafeArray { - fn from(array: AtomicArray) -> Self { +// impl From> for UnsafeArray { +// fn from(array: AtomicArray) -> Self { +// match array { +// AtomicArray::NativeAtomicArray(array) => UnsafeArray::::from(array), +// AtomicArray::GenericAtomicArray(array) => UnsafeArray::::from(array), +// } +// } +// } + +#[async_trait] +impl AsyncFrom> for UnsafeArray { + async fn async_from(array: AtomicArray) -> Self { match array { - AtomicArray::NativeAtomicArray(array) => UnsafeArray::::from(array), - AtomicArray::GenericAtomicArray(array) => UnsafeArray::::from(array), + AtomicArray::NativeAtomicArray(array) => UnsafeArray::::async_from(array).await, + AtomicArray::GenericAtomicArray(array) => UnsafeArray::::async_from(array).await, } } } -impl From> for UnsafeArray { - fn from(array: NativeAtomicArray) -> Self { - array.array.block_on_outstanding(DarcMode::UnsafeArray); +#[async_trait] +impl AsyncFrom> for UnsafeArray { + async fn async_from(array: NativeAtomicArray) -> Self { + array + .array + .await_on_outstanding(DarcMode::UnsafeArray) + .await; array.array } } -impl From> for UnsafeArray { - fn from(array: GenericAtomicArray) -> Self { - array.array.block_on_outstanding(DarcMode::UnsafeArray); +#[async_trait] +impl AsyncFrom> for UnsafeArray { + async fn async_from(array: GenericAtomicArray) -> Self { + array + .array + .await_on_outstanding(DarcMode::UnsafeArray) + .await; array.array } } -impl From> for UnsafeArray { - fn from(array: LocalLockArray) -> Self { - array.array.block_on_outstanding(DarcMode::UnsafeArray); +#[async_trait] +impl AsyncFrom> for UnsafeArray { + async fn async_from(array: LocalLockArray) -> Self { + array + .array + .await_on_outstanding(DarcMode::UnsafeArray) + .await; array.array } } -impl From> for UnsafeArray { - fn from(array: GlobalLockArray) -> Self { - array.array.block_on_outstanding(DarcMode::UnsafeArray); +#[async_trait] +impl AsyncFrom> for UnsafeArray { + async fn async_from(array: GlobalLockArray) -> Self { + array + .array + .await_on_outstanding(DarcMode::UnsafeArray) + .await; array.array } } -impl From> for UnsafeArray { - fn from(array: ReadOnlyArray) -> Self { - array.array.block_on_outstanding(DarcMode::UnsafeArray); +#[async_trait] +impl AsyncFrom> for UnsafeArray { + async fn async_from(array: ReadOnlyArray) -> Self { + array + .array + .await_on_outstanding(DarcMode::UnsafeArray) + .await; array.array } } @@ -898,7 +946,7 @@ impl From for UnsafeArray { impl ArrayExecAm for UnsafeArray { fn team(&self) -> Pin> { - self.team_rt().clone() + self.team_rt() } fn team_counters(&self) -> Arc { self.inner.data.array_counters.clone() @@ -1346,7 +1394,7 @@ impl UnsafeArray { /// /// # One-sided Operation /// The calling PE is responsible for launching `Reduce` active messages on the other PEs associated with the array. 
- /// the returned reduction result is only available on the calling PE + /// the returned reduction result is only available on the calling PE /// # Note /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][AmHandle::spawn] or [blocked on][AmHandle::block] /// # Examples @@ -1384,7 +1432,7 @@ impl UnsafeArray { /// /// # One-sided Operation /// The calling PE is responsible for launching `Sum` active messages on the other PEs associated with the array. - /// the returned sum reduction result is only available on the calling PE + /// the returned sum reduction result is only available on the calling PE /// # Note /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][AmHandle::spawn] or [blocked on][AmHandle::block] /// # Examples @@ -1422,7 +1470,7 @@ impl UnsafeArray { /// /// # One-sided Operation /// The calling PE is responsible for launching `Prod` active messages on the other PEs associated with the array. - /// the returned prod reduction result is only available on the calling PE + /// the returned prod reduction result is only available on the calling PE /// # Note /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][AmHandle::spawn] or [blocked on][AmHandle::block] /// # Examples @@ -1459,7 +1507,7 @@ impl UnsafeArray { /// /// # One-sided Operation /// The calling PE is responsible for launching `Max` active messages on the other PEs associated with the array. - /// the returned max reduction result is only available on the calling PE + /// the returned max reduction result is only available on the calling PE /// # Note /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][AmHandle::spawn] or [blocked on][AmHandle::block] /// # Examples @@ -1492,7 +1540,7 @@ impl UnsafeArray { /// /// # One-sided Operation /// The calling PE is responsible for launching `Min` active messages on the other PEs associated with the array. 
- /// the returned min reduction result is only available on the calling PE + /// the returned min reduction result is only available on the calling PE /// # Note /// The future retuned by this function is lazy and does nothing unless awaited, [spawned][AmHandle::spawn] or [blocked on][AmHandle::block] /// # Examples diff --git a/src/array/unsafe/iteration.rs b/src/array/unsafe/iteration.rs index d3f9e4ad..8e091c00 100644 --- a/src/array/unsafe/iteration.rs +++ b/src/array/unsafe/iteration.rs @@ -147,7 +147,7 @@ impl UnsafeArray { /// } ///``` pub unsafe fn onesided_iter(&self) -> OneSidedIter<'_, T, UnsafeArray> { - OneSidedIter::new(self.clone().into(), self.inner.data.team.clone(), 1) + OneSidedIter::new(self.clone(), self.inner.data.team.clone(), 1) } #[doc(alias("One-sided", "onesided"))] @@ -186,7 +186,7 @@ impl UnsafeArray { buf_size: usize, ) -> OneSidedIter<'_, T, UnsafeArray> { OneSidedIter::new( - self.clone().into(), + self.clone(), self.inner.data.team.clone(), std::cmp::min(buf_size, self.len()), ) diff --git a/src/darc/handle.rs b/src/darc/handle.rs index 822b7415..626c8c1a 100644 --- a/src/darc/handle.rs +++ b/src/darc/handle.rs @@ -783,7 +783,7 @@ impl PinnedDrop for IntoDarcHandle { } impl IntoDarcHandle { - /// Used to drive to conversion of a [LocalRwDarc] or [GlobalRwDarc] into a [Darc] + /// Used to drive the conversion of a [LocalRwDarc] or [GlobalRwDarc] into a [Darc] /// # Examples /// ///``` @@ -884,7 +884,7 @@ impl PinnedDrop for IntoLocalRwDarcHandle { } impl IntoLocalRwDarcHandle { - /// Used to drive to conversion of a [Darc] or [GlobalRwDarc] into a [LocalRwDarc] + /// Used to drive the conversion of a [Darc] or [GlobalRwDarc] into a [LocalRwDarc] /// # Examples /// ///``` @@ -987,7 +987,7 @@ impl PinnedDrop for IntoGlobalRwDarcHandle { } impl IntoGlobalRwDarcHandle { - /// Used to drive to conversion of a [Darc] or [LocalRwDarc] into a [GlobalRwDarc] + /// Used to drive the conversion of a [Darc] or [LocalRwDarc] into a [GlobalRwDarc] /// # Examples /// ///``` diff --git a/src/lamellar_team.rs b/src/lamellar_team.rs index 456a9d9d..93c4c7f0 100644 --- a/src/lamellar_team.rs +++ b/src/lamellar_team.rs @@ -15,8 +15,8 @@ use crate::memregion::{ use crate::scheduler::{LamellarTask, ReqId, Scheduler}; use crate::warnings::RuntimeWarning; -#[cfg(feature = "nightly")] -use crate::utils::ser_closure; +// #[cfg(feature = "nightly")] +// use crate::utils::ser_closure; // use log::trace; use std::collections::hash_map::DefaultHasher; @@ -574,7 +574,10 @@ impl ActiveMessaging for Arc { impl RemoteMemoryRegion for Arc { //#[tracing::instrument(skip_all)] - fn try_alloc_shared_mem_region(&self, size: usize) -> FallibleSharedMemoryRegionHandle { + fn try_alloc_shared_mem_region( + &self, + size: usize, + ) -> FallibleSharedMemoryRegionHandle { assert!(self.panic.load(Ordering::SeqCst) == 0); // self.team.barrier.barrier(); @@ -617,10 +620,7 @@ impl RemoteMemoryRegion for Arc { } //#[tracing::instrument(skip_all)] - fn alloc_one_sided_mem_region( - &self, - size: usize, - ) -> OneSidedMemoryRegion { + fn alloc_one_sided_mem_region(&self, size: usize) -> OneSidedMemoryRegion { assert!(self.panic.load(Ordering::SeqCst) == 0); let mut lmr = OneSidedMemoryRegion::try_new(size, &self.team, self.team.lamellae.clone()); @@ -1439,9 +1439,10 @@ impl LamellarTeamRT { || orig_launched != self.team_counters.launched_req_cnt.load(Ordering::SeqCst)) || (self.parent.is_none() && (self.world_counters.outstanding_reqs.load(Ordering::SeqCst) > 0 - || world_orig_reqs !=
self.world_counters.send_req_cnt.load(Ordering::SeqCst) - || world_orig_launched != self.world_counters.launched_req_cnt.load(Ordering::SeqCst)) - )) + || world_orig_reqs + != self.world_counters.send_req_cnt.load(Ordering::SeqCst) + || world_orig_launched + != self.world_counters.launched_req_cnt.load(Ordering::SeqCst)))) { orig_reqs = self.team_counters.send_req_cnt.load(Ordering::SeqCst); orig_launched = self.team_counters.launched_req_cnt.load(Ordering::SeqCst); @@ -2228,7 +2229,7 @@ impl LamellarTeamRT { OneSidedMemoryRegion::try_new(size, self, self.lamellae.clone()) } - /// allocate a local memory region from the asymmetric heap + /// allocate a local memory region from the asymmetric heap /// /// # Arguments /// @@ -2238,7 +2239,7 @@ impl LamellarTeamRT { pub(crate) fn alloc_one_sided_mem_region( self: &Pin>, size: usize, - ) -> OneSidedMemoryRegion{ + ) -> OneSidedMemoryRegion { let mut lmr = OneSidedMemoryRegion::try_new(size, self, self.lamellae.clone()); while let Err(_err) = lmr { std::thread::yield_now(); diff --git a/src/utils.rs b/src/utils.rs index f56cd763..35410fb8 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -1,9 +1,9 @@ use std::any; -#[cfg(feature = "SocketsBackend")] -use crate::runtime::Arch; -#[cfg(feature = "SocketsBackend")] -use std::env; +// #[cfg(feature = "SocketsBackend")] +// use crate::runtime::Arch; +// #[cfg(feature = "SocketsBackend")] +// use std::env; #[doc(hidden)] #[allow(dead_code)] @@ -13,126 +13,126 @@ pub fn print_type_of(_: &T) { // serialize the trait object F // #[flame] -#[cfg(feature = "nightly")] -pub(crate) fn ser_closure< - F: FnOnce() -> T + serde::ser::Serialize + serde::de::DeserializeOwned + 'static, - T: any::Any + serde::ser::Serialize + serde::de::DeserializeOwned, ->( - start: F, -) -> Vec { - let arg: Vec = crate::serialize(&start).unwrap(); - let start: serde_closure::FnOnce<(Vec,), fn((Vec,), ()) -> _> = FnOnce!([arg]move||{ - let arg: Vec = arg; - let closure: F = crate::deserialize(&arg).unwrap(); - closure() - }); - crate::serialize(&start).unwrap() -} +// #[cfg(feature = "nightly")] +// pub(crate) fn ser_closure< +// F: FnOnce() -> T + serde::ser::Serialize + serde::de::DeserializeOwned + 'static, +// T: any::Any + serde::ser::Serialize + serde::de::DeserializeOwned, +// >( +// start: F, +// ) -> Vec { +// let arg: Vec = crate::serialize(&start).unwrap(); +// let start: serde_closure::FnOnce<(Vec,), fn((Vec,), ()) -> _> = FnOnce!([arg]move||{ +// let arg: Vec = arg; +// let closure: F = crate::deserialize(&arg).unwrap(); +// closure() +// }); +// crate::serialize(&start).unwrap() +// } -#[cfg(feature = "SocketsBackend")] -pub(crate) fn parse_slurm() -> Arch { - //--------------parse slurm environment variables----------------- - let num_locales = match env::var("SLURM_NNODES") { - Ok(val) => val.parse::().unwrap(), - Err(_e) => { - println!("[LAMELLAR] WARNING: currently only supports slurm envrionments, falling back to single node"); - 1 - } - }; +// #[cfg(feature = "SocketsBackend")] +// pub(crate) fn parse_slurm() -> Arch { +// //--------------parse slurm environment variables----------------- +// let num_locales = match env::var("SLURM_NNODES") { +// Ok(val) => val.parse::().unwrap(), +// Err(_e) => { +// println!("[LAMELLAR] WARNING: currently only supports slurm envrionments, falling back to single node"); +// 1 +// } +// }; - let num_pes = match env::var("PMI_SIZE") { - Ok(val) => val.parse::().unwrap(), - Err(_e) => { - println!("[LAMELLAR] WARNING: currently only supports slurm envrionments, falling back to 
single node"); - 1 - } - }; +// let num_pes = match env::var("PMI_SIZE") { +// Ok(val) => val.parse::().unwrap(), +// Err(_e) => { +// println!("[LAMELLAR] WARNING: currently only supports slurm envrionments, falling back to single node"); +// 1 +// } +// }; - let my_pe = match env::var("PMI_RANK") { - Ok(val) => val.parse::().unwrap(), - Err(_e) => { - println!("[LAMELLAR] WARNING: currently only supports slurm envrionments, falling back to single node"); - 0 - } - }; +// let my_pe = match env::var("PMI_RANK") { +// Ok(val) => val.parse::().unwrap(), +// Err(_e) => { +// println!("[LAMELLAR] WARNING: currently only supports slurm envrionments, falling back to single node"); +// 0 +// } +// }; - let job_id = match env::var("SLURM_JOBID") { - Ok(val) => val.parse::().unwrap(), - Err(_e) => 1, - }; +// let job_id = match env::var("SLURM_JOBID") { +// Ok(val) => val.parse::().unwrap(), +// Err(_e) => 1, +// }; - let my_name = match env::var("SLURMD_NODENAME") { - Ok(val) => val, - Err(_e) => { - println!("[LAMELLAR] WARNING: currently only supports slurm envrionments, falling back to single node"); - String::from("localhost") - } - }; +// let my_name = match env::var("SLURMD_NODENAME") { +// Ok(val) => val, +// Err(_e) => { +// println!("[LAMELLAR] WARNING: currently only supports slurm envrionments, falling back to single node"); +// String::from("localhost") +// } +// }; - let nodes = match env::var("SLURM_NODELIST") { - Ok(val) => val, - Err(_e) => { - println!("[LAMELLAR] WARNING: currently only supports slurm envrionments, falling back to single node"); - String::from("localhost") - } - }; +// let nodes = match env::var("SLURM_NODELIST") { +// Ok(val) => val, +// Err(_e) => { +// println!("[LAMELLAR] WARNING: currently only supports slurm envrionments, falling back to single node"); +// String::from("localhost") +// } +// }; - let split = nodes.split(']').collect::>()[0] - .split('[') - .collect::>(); +// let split = nodes.split(']').collect::>()[0] +// .split('[') +// .collect::>(); - let base = split[0]; - let mut pe_addrs: Vec<_> = Vec::new(); - let num_pe_node = num_pes / num_locales; +// let base = split[0]; +// let mut pe_addrs: Vec<_> = Vec::new(); +// let num_pe_node = num_pes / num_locales; - if split.len() > 1 { - for node_str in split[1].split(',') { - let node_range = node_str.split('-').collect::>(); - if node_range.len() > 1 { - for i in node_range[0].parse::().unwrap() - ..=node_range[1].parse::().unwrap() - //..= is inclusive range - { - let nn = format!("{:0width$}", i, width = node_range[0].len()); - let tmp = [base, &nn[..]].concat(); - if tmp == my_name { - // my_pe = pe_cnt; - } - for _i in 0..num_pe_node { - pe_addrs.push(tmp.clone()); - // pe_cnt += 1; - } - } - } else { - let tmp = [base, node_range[0]].concat(); - if tmp == my_name { - // my_pe = pe_cnt; - } - for _i in 0..num_pe_node { - pe_addrs.push(tmp.clone()); - // pe_cnt += 1; - } - } - } - } else { - for _i in 0..num_pe_node { - pe_addrs.push("localhost".to_string()); - } - } - Arch { - my_pe: my_pe, - num_pes: num_pes, - pe_addrs: pe_addrs, - job_id: job_id, - } -} +// if split.len() > 1 { +// for node_str in split[1].split(',') { +// let node_range = node_str.split('-').collect::>(); +// if node_range.len() > 1 { +// for i in node_range[0].parse::().unwrap() +// ..=node_range[1].parse::().unwrap() +// //..= is inclusive range +// { +// let nn = format!("{:0width$}", i, width = node_range[0].len()); +// let tmp = [base, &nn[..]].concat(); +// if tmp == my_name { +// // my_pe = pe_cnt; +// } +// for _i in 
0..num_pe_node { +// pe_addrs.push(tmp.clone()); +// // pe_cnt += 1; +// } +// } +// } else { +// let tmp = [base, node_range[0]].concat(); +// if tmp == my_name { +// // my_pe = pe_cnt; +// } +// for _i in 0..num_pe_node { +// pe_addrs.push(tmp.clone()); +// // pe_cnt += 1; +// } +// } +// } +// } else { +// for _i in 0..num_pe_node { +// pe_addrs.push("localhost".to_string()); +// } +// } +// Arch { +// my_pe: my_pe, +// num_pes: num_pes, +// pe_addrs: pe_addrs, +// job_id: job_id, +// } +// } -#[cfg(feature = "SocketsBackend")] -pub(crate) fn parse_localhost() -> Arch { - Arch { - my_pe: 0, - num_pes: 1, - pe_addrs: vec!["localhost".to_string()], - job_id: 0, - } -} +// #[cfg(feature = "SocketsBackend")] +// pub(crate) fn parse_localhost() -> Arch { +// Arch { +// my_pe: 0, +// num_pes: 1, +// pe_addrs: vec!["localhost".to_string()], +// job_id: 0, +// } +// } diff --git a/tests/array/arithmetic_ops/add_test.rs b/tests/array/arithmetic_ops/add_test.rs index 250c9e45..d8959006 100644 --- a/tests/array/arithmetic_ops/add_test.rs +++ b/tests/array/arithmetic_ops/add_test.rs @@ -464,7 +464,7 @@ macro_rules! input_test { println!("passed &UnsafeArray"); // ReadOnlyArray------------------------------ - let input_array = input_array.into_read_only(); + let input_array = input_array.into_read_only().block(); // array.add(input_array.clone(),1); // check_results!($array,array,num_pes,"ReadOnlyArray"); // ReadOnlyArray------------------------------ @@ -474,7 +474,7 @@ macro_rules! input_test { println!("passed &ReadOnlyArray"); // AtomicArray------------------------------ - let input_array = input_array.into_atomic(); + let input_array = input_array.into_atomic().block(); // array.add(input_array.clone(),1); // check_results!($array,array,num_pes,"AtomicArray"); // AtomicArray------------------------------ @@ -484,7 +484,7 @@ macro_rules! input_test { println!("passed &AtomicArray"); // LocalLockArray------------------------------ - let input_array = input_array.into_local_lock(); + let input_array = input_array.into_local_lock().block(); // array.add(input_array.clone(),1); // check_results!($array,array,num_pes,"LocalLockArray"); // LocalLockArray------------------------------ @@ -498,7 +498,7 @@ macro_rules! input_test { println!("passed &LocalLockArray"); // GlobalLockArray------------------------------ - let input_array = input_array.into_global_lock(); + let input_array = input_array.into_global_lock().block(); // array.add(input_array.clone(),1); // check_results!($array,array,num_pes,"GlobalLockArray"); // GlobalLockArray------------------------------ diff --git a/tests/array/arithmetic_ops/fetch_add_test.rs b/tests/array/arithmetic_ops/fetch_add_test.rs index fbded8e5..ebeadce8 100644 --- a/tests/array/arithmetic_ops/fetch_add_test.rs +++ b/tests/array/arithmetic_ops/fetch_add_test.rs @@ -111,214 +111,221 @@ macro_rules! buffered_onesided_iter { }; } -macro_rules! fetch_add_test{ - ($array:ident, $t:ty, $len:expr, $dist:ident) =>{ - { - let world = lamellar::LamellarWorldBuilder::new().build(); - let num_pes = world.num_pes(); - let _my_pe = world.my_pe(); - let array_total_len = $len; - - let mut rng = rand::thread_rng(); - let rand_idx = Uniform::from(0..array_total_len); +macro_rules! 
fetch_add_test { + ($array:ident, $t:ty, $len:expr, $dist:ident) => {{ + let world = lamellar::LamellarWorldBuilder::new().build(); + let num_pes = world.num_pes(); + let _my_pe = world.my_pe(); + let array_total_len = $len; + + let mut rng = rand::thread_rng(); + let rand_idx = Uniform::from(0..array_total_len); + #[allow(unused_mut)] + let mut success = true; + let array: $array<$t> = $array::<$t>::new(world.team(), array_total_len, $dist) + .block() + .into(); //convert into abstract LamellarArray, distributed len is total_len + + let pe_max_val: $t = 10 as $t; + let max_val = pe_max_val * num_pes as $t; + let init_val = 0 as $t; + initialize_array!($array, array, init_val); + array.wait_all(); + array.barrier(); + for idx in 0..array.len() { + let mut reqs = vec![]; + for _i in 0..(pe_max_val as usize) { + #[allow(unused_unsafe)] + reqs.push(unsafe { array.fetch_add(idx, 1 as $t).spawn() }); + } #[allow(unused_mut)] - let mut success = true; - let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).block().into(); //convert into abstract LamellarArray, distributed len is total_len - - let pe_max_val: $t = 10 as $t; - let max_val = pe_max_val * num_pes as $t; - let init_val = 0 as $t; - initialize_array!($array, array, init_val); - array.wait_all(); - array.barrier(); - for idx in 0..array.len(){ - let mut reqs = vec![]; - for _i in 0..(pe_max_val as usize){ - #[allow(unused_unsafe)] - reqs.push( unsafe{ array.fetch_add(idx,1 as $t).spawn()}); - } - #[allow(unused_mut)] - let mut prevs: std::collections::HashSet = std::collections::HashSet::new(); - for req in reqs{ - let val = world.block_on(req) as u128; - if ! insert_prev!($array,val,prevs){ - eprintln!("full 1: {:?} {:?}",val,prevs); - success = false; - } + let mut prevs: std::collections::HashSet = std::collections::HashSet::new(); + for req in reqs { + let val = world.block_on(req) as u128; + if !insert_prev!($array, val, prevs) { + eprintln!("full 1: {:?} {:?}", val, prevs); + success = false; } } - // array.wait_all(); - array.barrier(); - #[allow(unused_unsafe)] - for (i,elem) in unsafe{ onesided_iter!($array,array).into_iter().enumerate()}{ - let val = *elem; - check_val!($array,val,max_val,success); - if !success{ - eprintln!("full 2: {:?} {:?} {:?}",i,val,max_val); - } + } + // array.wait_all(); + array.barrier(); + #[allow(unused_unsafe)] + for (i, elem) in unsafe { onesided_iter!($array, array).into_iter().enumerate() } { + let val = *elem; + check_val!($array, val, max_val, success); + if !success { + eprintln!("full 2: {:?} {:?} {:?}", i, val, max_val); } - array.barrier(); - // println!("1------------"); - initialize_array!($array, array, init_val); - array.wait_all(); - array.barrier(); - let num_updates=max_updates!($t,num_pes); + } + array.barrier(); + // println!("1------------"); + initialize_array!($array, array, init_val); + array.wait_all(); + array.barrier(); + let num_updates = max_updates!($t, num_pes); + let mut reqs = vec![]; + // println!("2------------"); + for _i in 0..num_updates { + let idx = rand_idx.sample(&mut rng); + #[allow(unused_unsafe)] + reqs.push((unsafe { array.fetch_add(idx, 1 as $t) }, idx)) + } + for (req, _idx) in reqs { + let _val = world.block_on(req) as usize; + } + array.barrier(); + #[allow(unused_unsafe)] + let sum = unsafe { + onesided_iter!($array, array) + .into_iter() + .fold(0, |acc, x| acc + *x as usize) + }; + let tot_updates = num_updates * num_pes; + check_val!($array, sum, tot_updates, success); + if !success { + eprintln!("full 4: {:?} {:?}", 
sum, tot_updates); + } + world.wait_all(); + world.barrier(); + // println!("2------------"); + initialize_array!($array, array, init_val); + + let half_len = array_total_len / 2; + let start_i = half_len / 2; + let end_i = start_i + half_len; + let rand_idx = Uniform::from(0..half_len); + let sub_array = array.sub_array(start_i..end_i); + array.barrier(); + for idx in 0..sub_array.len() { let mut reqs = vec![]; - // println!("2------------"); - for _i in 0..num_updates{ - let idx = rand_idx.sample(&mut rng); + for _i in 0..(pe_max_val as usize) { #[allow(unused_unsafe)] - reqs.push(( unsafe{ array.fetch_add(idx,1 as $t)},idx)) + reqs.push(unsafe { sub_array.fetch_add(idx, 1 as $t) }); } - for (req,_idx) in reqs{ - let _val = world.block_on(req) as usize; + #[allow(unused_mut)] + let mut prevs: std::collections::HashSet = std::collections::HashSet::new(); + for req in reqs { + let val = world.block_on(req) as u128; + if !insert_prev!($array, val, prevs) { + eprintln!("half 1: {:?} {:?}", val, prevs); + success = false; + } } - array.barrier(); - #[allow(unused_unsafe)] - let sum = unsafe{onesided_iter!($array,array).into_iter().fold(0,|acc,x| acc+ *x as usize)}; - let tot_updates = num_updates * num_pes; - check_val!($array,sum,tot_updates,success); - if !success{ - eprintln!("full 4: {:?} {:?}",sum,tot_updates); + } + array.barrier(); + #[allow(unused_unsafe)] + for (i, elem) in unsafe { onesided_iter!($array, sub_array).into_iter().enumerate() } { + let val = *elem; + check_val!($array, val, max_val, success); + if !success { + eprintln!("half 2: {:?} {:?} {:?}", i, val, max_val); } - world.wait_all(); - world.barrier(); - // println!("2------------"); - initialize_array!($array, array, init_val); - - - - let half_len = array_total_len/2; - let start_i = half_len/2; - let end_i = start_i + half_len; - let rand_idx = Uniform::from(0..half_len); + } + array.barrier(); + // println!("3------------"); + initialize_array!($array, array, init_val); + array.wait_all(); + array.barrier(); + let num_updates = max_updates!($t, num_pes); + let mut reqs = vec![]; + for _i in 0..num_updates { + let idx = rand_idx.sample(&mut rng); + #[allow(unused_unsafe)] + reqs.push((unsafe { sub_array.fetch_add(idx, 1 as $t) }, idx)) + } + for (req, _idx) in reqs { + let _val = world.block_on(req) as usize; + } + array.barrier(); + #[allow(unused_unsafe)] + let sum = unsafe { + onesided_iter!($array, sub_array) + .into_iter() + .fold(0, |acc, x| acc + *x as usize) + }; + let tot_updates = num_updates * num_pes; + check_val!($array, sum, tot_updates, success); + if !success { + eprintln!("half 4: {:?} {:?}", sum, tot_updates); + } + array.wait_all(); + array.barrier(); + // println!("4------------"); + initialize_array!($array, array, init_val); + array.wait_all(); + array.barrier(); + + let pe_len = array_total_len / num_pes; + for pe in 0..num_pes { + let len = std::cmp::max(pe_len / 2, 1); + let start_i = (pe * pe_len) + len / 2; + let end_i = start_i + len; + let rand_idx = Uniform::from(0..len); let sub_array = array.sub_array(start_i..end_i); array.barrier(); - for idx in 0..sub_array.len(){ + for idx in 0..sub_array.len() { let mut reqs = vec![]; - for _i in 0..(pe_max_val as usize){ + for _i in 0..(pe_max_val as usize) { #[allow(unused_unsafe)] - reqs.push( unsafe{ sub_array.fetch_add(idx,1 as $t)}); + reqs.push(unsafe { sub_array.fetch_add(idx, 1 as $t) }); } #[allow(unused_mut)] let mut prevs: std::collections::HashSet = std::collections::HashSet::new(); - for req in reqs{ - let val = world.block_on(req) 
as u128; - if ! insert_prev!($array,val,prevs){ - eprintln!("half 1: {:?} {:?}",val,prevs); + for req in reqs { + let val = world.block_on(req) as u128; + if !insert_prev!($array, val, prevs) { + eprintln!("pe 1: {:?} {:?}", val, prevs); success = false; } } - } - array.barrier(); + sub_array.barrier(); #[allow(unused_unsafe)] - for (i,elem) in unsafe{ onesided_iter!($array,sub_array).into_iter().enumerate()} { + for (i, elem) in unsafe { onesided_iter!($array, sub_array).into_iter().enumerate() } { let val = *elem; - check_val!($array,val,max_val,success); - if !success{ - eprintln!("half 2: {:?} {:?} {:?}",i,val,max_val); + check_val!($array, val, max_val, success); + if !success { + eprintln!("pe 2 {:?} {:?} {:?}", i, val, max_val); } } - array.barrier(); - // println!("3------------"); + sub_array.barrier(); + // println!("5------------"); initialize_array!($array, array, init_val); - array.wait_all(); - array.barrier(); - let num_updates=max_updates!($t,num_pes); + sub_array.wait_all(); + sub_array.barrier(); + let num_updates = max_updates!($t, num_pes); let mut reqs = vec![]; - for _i in 0..num_updates{ + for _i in 0..num_updates { let idx = rand_idx.sample(&mut rng); #[allow(unused_unsafe)] - reqs.push(( unsafe{ sub_array.fetch_add(idx,1 as $t)},idx)) + reqs.push((unsafe { sub_array.fetch_add(idx, 1 as $t) }, idx)) } - for (req,_idx) in reqs{ - let _val = world.block_on(req) as usize; + for (req, _idx) in reqs { + let _val = world.block_on(req) as usize; } - array.barrier(); + sub_array.barrier(); #[allow(unused_unsafe)] - let sum = unsafe {onesided_iter!($array,sub_array).into_iter().fold(0,|acc,x| acc+ *x as usize)}; + let sum = unsafe { + onesided_iter!($array, sub_array) + .into_iter() + .fold(0, |acc, x| acc + *x as usize) + }; let tot_updates = num_updates * num_pes; - check_val!($array,sum,tot_updates,success); - if !success{ - eprintln!("half 4: {:?} {:?}",sum,tot_updates); + check_val!($array, sum, tot_updates, success); + if !success { + eprintln!("pe 4 {:?} {:?}", sum, tot_updates); } - array.wait_all(); - array.barrier(); - // println!("4------------"); + sub_array.wait_all(); + sub_array.barrier(); + // println!("6------------"); initialize_array!($array, array, init_val); - array.wait_all(); - array.barrier(); - - - let pe_len = array_total_len/num_pes; - for pe in 0..num_pes{ - let len = std::cmp::max(pe_len/2,1); - let start_i = (pe*pe_len)+ len/2; - let end_i = start_i+len; - let rand_idx = Uniform::from(0..len); - let sub_array = array.sub_array(start_i..end_i); - array.barrier(); - for idx in 0..sub_array.len(){ - let mut reqs = vec![]; - for _i in 0..(pe_max_val as usize){ - #[allow(unused_unsafe)] - reqs.push( unsafe{ sub_array.fetch_add(idx,1 as $t)}); - } - #[allow(unused_mut)] - let mut prevs: std::collections::HashSet = std::collections::HashSet::new(); - for req in reqs{ - let val = world.block_on(req) as u128; - if ! 
insert_prev!($array,val,prevs){ - eprintln!("pe 1: {:?} {:?}",val,prevs); - success = false; - } - } - - } - sub_array.barrier(); - #[allow(unused_unsafe)] - for (i,elem) in unsafe{onesided_iter!($array,sub_array).into_iter().enumerate()}{ - let val = *elem; - check_val!($array,val,max_val,success); - if !success{ - eprintln!("pe 2 {:?} {:?} {:?}",i,val,max_val); - } - } - sub_array.barrier(); - // println!("5------------"); - initialize_array!($array, array, init_val); - sub_array.wait_all(); - sub_array.barrier(); - let num_updates=max_updates!($t,num_pes); - let mut reqs = vec![]; - for _i in 0..num_updates{ - let idx = rand_idx.sample(&mut rng); - #[allow(unused_unsafe)] - reqs.push(( unsafe{ sub_array.fetch_add(idx,1 as $t)},idx)) - } - for (req,_idx) in reqs{ - let _val = world.block_on(req) as usize; - } - sub_array.barrier(); - #[allow(unused_unsafe)] - let sum = unsafe{onesided_iter!($array,sub_array).into_iter().fold(0,|acc,x| acc+ *x as usize)}; - let tot_updates = num_updates * num_pes; - check_val!($array,sum,tot_updates,success); - if !success{ - eprintln!("pe 4 {:?} {:?}",sum,tot_updates); - } - sub_array.wait_all(); - sub_array.barrier(); - // println!("6------------"); - initialize_array!($array, array, init_val); - } + } - if !success{ - eprintln!("failed"); - } + if !success { + eprintln!("failed"); } - } + }}; } macro_rules! initialize_array2 { @@ -425,188 +432,212 @@ macro_rules! check_results { }; } -macro_rules! input_test{ - ($array:ident, $len:expr, $dist:ident) =>{ - { - std::env::set_var("LAMELLAR_BATCH_OP_SIZE","10"); - let world = lamellar::LamellarWorldBuilder::new().build(); - let num_pes = world.num_pes(); - let _my_pe = world.my_pe(); - let array_total_len = $len; - - // let mut success = true; - let array: $array:: = $array::::new(world.team(), array_total_len, $dist).block().into(); //convert into abstract LamellarArray, distributed len is total_len - let input_array: UnsafeArray:: = UnsafeArray::::new(world.team(), array_total_len*num_pes, $dist).block().into(); //convert into abstract LamellarArray, distributed len is total_len - // let init_val=0; - initialize_array2!($array, array, init_val); - if $dist == lamellar::array::Distribution::Block{ - #[allow(unused_unsafe)] - unsafe { input_array.dist_iter_mut().enumerate().for_each(move |(i,x)| {/*println!("i: {:?}",i);*/ *x = i%array_total_len}).block()}; - } - else{ - #[allow(unused_unsafe)] - unsafe { input_array.dist_iter_mut().enumerate().for_each(move |(i,x)| {/*println!("i: {:?}",i);*/ *x = i/num_pes}).block()}; - } - array.barrier(); - //individual T------------------------------ - let mut reqs = vec![]; - for i in 0..array.len(){ - #[allow(unused_unsafe)] - reqs.push( unsafe{ array.batch_fetch_add(i,1).spawn()}); - } - check_results!($array,array,num_pes,reqs,"T"); - //individual T------------------------------ - let mut reqs = vec![]; - for i in 0..array.len(){ - #[allow(unused_unsafe)] - reqs.push( unsafe{ array.batch_fetch_add(&i,1).spawn()}); - } - check_results!($array,array,num_pes,reqs,"&T"); - //&[T]------------------------------ - // multi_idx single val - let idx=(0..array.len()).collect::>(); - let idx_slice = &idx[..]; - let vals=vec![1;array.len()]; - let vals_slice = &vals[..]; +macro_rules! 
input_test { + ($array:ident, $len:expr, $dist:ident) => {{ + std::env::set_var("LAMELLAR_BATCH_OP_SIZE", "10"); + let world = lamellar::LamellarWorldBuilder::new().build(); + let num_pes = world.num_pes(); + let _my_pe = world.my_pe(); + let array_total_len = $len; - let mut reqs = vec![]; + // let mut success = true; + let array: $array = $array::::new(world.team(), array_total_len, $dist) + .block() + .into(); //convert into abstract LamellarArray, distributed len is total_len + let input_array: UnsafeArray = + UnsafeArray::::new(world.team(), array_total_len * num_pes, $dist) + .block() + .into(); //convert into abstract LamellarArray, distributed len is total_len + // let init_val=0; + initialize_array2!($array, array, init_val); + if $dist == lamellar::array::Distribution::Block { #[allow(unused_unsafe)] - reqs.push( unsafe{ array.batch_fetch_add(idx_slice,1).spawn()}); - check_results!($array,array,num_pes,reqs,"&[T]"); - // single_idx multi_ val + unsafe { + input_array + .dist_iter_mut() + .enumerate() + .for_each(move |(i, x)| { + /*println!("i: {:?}",i);*/ + *x = i % array_total_len + }) + .block() + }; + } else { #[allow(unused_unsafe)] - reqs.push( unsafe{ array.batch_fetch_add(_my_pe,&vals).spawn()}); - let real_val = array.len(); - check_results!($array,array,num_pes, real_val,reqs,"&[T]"); - // multi_idx multi_ val + unsafe { + input_array + .dist_iter_mut() + .enumerate() + .for_each(move |(i, x)| { + /*println!("i: {:?}",i);*/ + *x = i / num_pes + }) + .block() + }; + } + array.barrier(); + //individual T------------------------------ + let mut reqs = vec![]; + for i in 0..array.len() { #[allow(unused_unsafe)] - reqs.push(unsafe{array.batch_fetch_add(idx_slice,vals_slice).spawn()}); - - check_results!($array,array,num_pes,reqs,"&[T]"); - //scoped &[T]------------------------------ - let mut reqs = vec![]; - { - let vec=(0..array.len()).collect::>(); - let slice = &vec[..]; - #[allow(unused_unsafe)] - reqs.push( unsafe{ array.batch_fetch_add(slice,1).spawn()}); - } - check_results!($array,array,num_pes,reqs,"scoped &[T]"); - // Vec------------------------------ - let vec=(0..array.len()).collect::>(); - let mut reqs = vec![]; + reqs.push(unsafe { array.batch_fetch_add(i, 1).spawn() }); + } + check_results!($array, array, num_pes, reqs, "T"); + //individual T------------------------------ + let mut reqs = vec![]; + for i in 0..array.len() { #[allow(unused_unsafe)] - reqs.push( unsafe{ array.batch_fetch_add(vec,1).spawn()}); - check_results!($array,array,num_pes,reqs,"Vec"); - // &Vec------------------------------ - let mut reqs = vec![]; - let vec=(0..array.len()).collect::>(); + reqs.push(unsafe { array.batch_fetch_add(&i, 1).spawn() }); + } + check_results!($array, array, num_pes, reqs, "&T"); + //&[T]------------------------------ + // multi_idx single val + let idx = (0..array.len()).collect::>(); + let idx_slice = &idx[..]; + let vals = vec![1; array.len()]; + let vals_slice = &vals[..]; + + let mut reqs = vec![]; + #[allow(unused_unsafe)] + reqs.push(unsafe { array.batch_fetch_add(idx_slice, 1).spawn() }); + check_results!($array, array, num_pes, reqs, "&[T]"); + // single_idx multi_ val + #[allow(unused_unsafe)] + reqs.push(unsafe { array.batch_fetch_add(_my_pe, &vals).spawn() }); + let real_val = array.len(); + check_results!($array, array, num_pes, real_val, reqs, "&[T]"); + // multi_idx multi_ val + #[allow(unused_unsafe)] + reqs.push(unsafe { array.batch_fetch_add(idx_slice, vals_slice).spawn() }); + + check_results!($array, array, num_pes, reqs, "&[T]"); + 
//scoped &[T]------------------------------ + let mut reqs = vec![]; + { + let vec = (0..array.len()).collect::>(); + let slice = &vec[..]; #[allow(unused_unsafe)] - reqs.push( unsafe{ array.batch_fetch_add(&vec,1).spawn()}); - check_results!($array,array,num_pes,reqs,"&Vec"); - // Scoped Vec------------------------------ - let mut reqs = vec![]; - { - let vec=(0..array.len()).collect::>(); - #[allow(unused_unsafe)] - reqs.push( unsafe{ array.batch_fetch_add(vec,1).spawn()}); - } - check_results!($array,array,num_pes,reqs,"scoped Vec"); - // Scoped &Vec------------------------------ - let mut reqs = vec![]; - { - let vec=(0..array.len()).collect::>(); - #[allow(unused_unsafe)] - reqs.push( unsafe{ array.batch_fetch_add(&vec,1).spawn()}); - } - check_results!($array,array,num_pes,reqs,"scoped &Vec"); + reqs.push(unsafe { array.batch_fetch_add(slice, 1).spawn() }); + } + check_results!($array, array, num_pes, reqs, "scoped &[T]"); + // Vec------------------------------ + let vec = (0..array.len()).collect::>(); + let mut reqs = vec![]; + #[allow(unused_unsafe)] + reqs.push(unsafe { array.batch_fetch_add(vec, 1).spawn() }); + check_results!($array, array, num_pes, reqs, "Vec"); + // &Vec------------------------------ + let mut reqs = vec![]; + let vec = (0..array.len()).collect::>(); + #[allow(unused_unsafe)] + reqs.push(unsafe { array.batch_fetch_add(&vec, 1).spawn() }); + check_results!($array, array, num_pes, reqs, "&Vec"); + // Scoped Vec------------------------------ + let mut reqs = vec![]; + { + let vec = (0..array.len()).collect::>(); + #[allow(unused_unsafe)] + reqs.push(unsafe { array.batch_fetch_add(vec, 1).spawn() }); + } + check_results!($array, array, num_pes, reqs, "scoped Vec"); + // Scoped &Vec------------------------------ + let mut reqs = vec![]; + { + let vec = (0..array.len()).collect::>(); + #[allow(unused_unsafe)] + reqs.push(unsafe { array.batch_fetch_add(&vec, 1).spawn() }); + } + check_results!($array, array, num_pes, reqs, "scoped &Vec"); - // scoped &LMR------------------------------ - let mut reqs = vec![]; - unsafe { - let lmr=world.alloc_one_sided_mem_region(array.len()); - let slice = lmr.as_mut_slice().unwrap(); - for i in 0..array.len(){ - slice[i]=i; - } - reqs.push(array.batch_fetch_add(slice,1).spawn()); - check_results!($array,array,num_pes,reqs,"scoped &LMR"); + // scoped &LMR------------------------------ + let mut reqs = vec![]; + unsafe { + let lmr = world.alloc_one_sided_mem_region(array.len()); + let slice = lmr.as_mut_slice().unwrap(); + for i in 0..array.len() { + slice[i] = i; } + reqs.push(array.batch_fetch_add(slice, 1).spawn()); + check_results!($array, array, num_pes, reqs, "scoped &LMR"); + } - // scoped SMR------------------------------ - let mut reqs = vec![]; - unsafe { - let smr=world.alloc_shared_mem_region(array.len()).block(); - let slice = smr.as_mut_slice().unwrap(); - for i in 0..array.len(){ - slice[i]=i; - } - - reqs.push(array.batch_fetch_add(slice,1).spawn()); - check_results!($array,array,num_pes,reqs,"scoped SMR"); + // scoped SMR------------------------------ + let mut reqs = vec![]; + unsafe { + let smr = world.alloc_shared_mem_region(array.len()).block(); + let slice = smr.as_mut_slice().unwrap(); + for i in 0..array.len() { + slice[i] = i; } - // UnsafeArray------------------------------ - // let mut reqs = vec![]; - // reqs.push(array.fetch_add(input_array.clone(),1)); - // check_results!($array,array,num_pes,reqs,"UnsafeArray"); - // UnsafeArray------------------------------ - let mut reqs = vec![]; - 
#[allow(unused_unsafe)] - reqs.push(unsafe{array.batch_fetch_add(input_array.local_data(),1).spawn()}); - check_results!($array,array,num_pes,reqs,"&UnsafeArray"); - - // ReadOnlyArray------------------------------ - // let mut reqs = vec![]; - let input_array = input_array.into_read_only(); - // println!("read only array len: {:?}", input_array.len()); - // reqs.push(array.fetch_add(input_array.clone(),1)); - // check_results!($array,array,num_pes,reqs,"ReadOnlyArray"); - // ReadOnlyArray------------------------------ - let mut reqs = vec![]; - #[allow(unused_unsafe)] - reqs.push(unsafe{array.batch_fetch_add(input_array.local_data(),1).spawn()}); - check_results!($array,array,num_pes,reqs,"&ReadOnlyArray"); - - // AtomicArray------------------------------ - // let mut reqs = vec![]; - let input_array = input_array.into_atomic(); - // println!("atomic array len: {:?}", input_array.len()); - // reqs.push(array.fetch_add(input_array.clone(),1)); - // check_results!($array,array,num_pes,reqs,"AtomicArray"); - // AtomicArray------------------------------ - let mut reqs = vec![]; - #[allow(unused_unsafe)] - reqs.push(unsafe{array.batch_fetch_add(&input_array.local_data(),1).spawn()}); - check_results!($array,array,num_pes,reqs,"&AtomicArray"); - - // LocalLockArray------------------------------ - // let mut reqs = vec![]; - let input_array = input_array.into_local_lock(); - // println!("local lock array len: {:?}", input_array.len()); - // reqs.push(array.fetch_add(input_array.clone(),1)); - // check_results!($array,array,num_pes,reqs,"LocalLockArray"); - // LocalLockArray------------------------------ - let mut reqs = vec![]; - let local_data = input_array.read_local_data().block(); - // println!("local lock array len: {:?}", local_data.deref()); - #[allow(unused_unsafe)] - reqs.push(unsafe{array.batch_fetch_add(&local_data,1).spawn()}); - drop(local_data); - check_results!($array,array,num_pes,reqs,"&LocalLockArray"); - - // GlobalLockArray------------------------------ - // let mut reqs = vec![]; - let input_array = input_array.into_global_lock(); - // println!("global lock array len: {:?}", input_array.len()); - // reqs.push(array.fetch_add(input_array.clone(),1)); - // check_results!($array,array,num_pes,reqs,"GlobalLockArray"); - // GlobalLockArray------------------------------ - let mut reqs = vec![]; - #[allow(unused_unsafe)] - reqs.push(unsafe{array.batch_fetch_add(&input_array.read_local_data().block(),1).spawn()}); - check_results!($array,array,num_pes,reqs,"&GlobalLockArray"); - } - } + + reqs.push(array.batch_fetch_add(slice, 1).spawn()); + check_results!($array, array, num_pes, reqs, "scoped SMR"); + } + // UnsafeArray------------------------------ + // let mut reqs = vec![]; + // reqs.push(array.fetch_add(input_array.clone(),1)); + // check_results!($array,array,num_pes,reqs,"UnsafeArray"); + // UnsafeArray------------------------------ + let mut reqs = vec![]; + #[allow(unused_unsafe)] + reqs.push(unsafe { array.batch_fetch_add(input_array.local_data(), 1).spawn() }); + check_results!($array, array, num_pes, reqs, "&UnsafeArray"); + + // ReadOnlyArray------------------------------ + // let mut reqs = vec![]; + let input_array = input_array.into_read_only().block(); + // println!("read only array len: {:?}", input_array.len()); + // reqs.push(array.fetch_add(input_array.clone(),1)); + // check_results!($array,array,num_pes,reqs,"ReadOnlyArray"); + // ReadOnlyArray------------------------------ + let mut reqs = vec![]; + #[allow(unused_unsafe)] + reqs.push(unsafe { 
array.batch_fetch_add(input_array.local_data(), 1).spawn() }); + check_results!($array, array, num_pes, reqs, "&ReadOnlyArray"); + + // AtomicArray------------------------------ + // let mut reqs = vec![]; + let input_array = input_array.into_atomic().block(); + // println!("atomic array len: {:?}", input_array.len()); + // reqs.push(array.fetch_add(input_array.clone(),1)); + // check_results!($array,array,num_pes,reqs,"AtomicArray"); + // AtomicArray------------------------------ + let mut reqs = vec![]; + #[allow(unused_unsafe)] + reqs.push(unsafe { array.batch_fetch_add(&input_array.local_data(), 1).spawn() }); + check_results!($array, array, num_pes, reqs, "&AtomicArray"); + + // LocalLockArray------------------------------ + // let mut reqs = vec![]; + let input_array = input_array.into_local_lock().block(); + // println!("local lock array len: {:?}", input_array.len()); + // reqs.push(array.fetch_add(input_array.clone(),1)); + // check_results!($array,array,num_pes,reqs,"LocalLockArray"); + // LocalLockArray------------------------------ + let mut reqs = vec![]; + let local_data = input_array.read_local_data().block(); + // println!("local lock array len: {:?}", local_data.deref()); + #[allow(unused_unsafe)] + reqs.push(unsafe { array.batch_fetch_add(&local_data, 1).spawn() }); + drop(local_data); + check_results!($array, array, num_pes, reqs, "&LocalLockArray"); + + // GlobalLockArray------------------------------ + // let mut reqs = vec![]; + let input_array = input_array.into_global_lock().block(); + // println!("global lock array len: {:?}", input_array.len()); + // reqs.push(array.fetch_add(input_array.clone(),1)); + // check_results!($array,array,num_pes,reqs,"GlobalLockArray"); + // GlobalLockArray------------------------------ + let mut reqs = vec![]; + #[allow(unused_unsafe)] + reqs.push(unsafe { + array + .batch_fetch_add(&input_array.read_local_data().block(), 1) + .spawn() + }); + check_results!($array, array, num_pes, reqs, "&GlobalLockArray"); + }}; } fn main() { diff --git a/tests/array/rdma/blocking_get_test.rs b/tests/array/rdma/blocking_get_test.rs index fe34f4fd..33dfd579 100644 --- a/tests/array/rdma/blocking_get_test.rs +++ b/tests/array/rdma/blocking_get_test.rs @@ -54,14 +54,14 @@ macro_rules! initialize_array { $array.wait_all(); }; (ReadOnlyArray,$array:ident,$t:ty) => { - let temp = $array.into_unsafe(); + let temp = $array.into_unsafe().block(); unsafe { temp.dist_iter_mut() .enumerate() .for_each(move |(i, x)| *x = i as $t) .block(); } - $array = temp.into_read_only(); + $array = temp.into_read_only().block(); }; } @@ -101,7 +101,7 @@ macro_rules! initialize_array_range { .block(); }}; (ReadOnlyArray,$array:ident,$t:ty,$range:expr) => {{ - let temp = $array.into_unsafe(); + let temp = $array.into_unsafe().block(); let subarray = temp.sub_array($range); unsafe { subarray @@ -111,7 +111,7 @@ macro_rules! initialize_array_range { .block(); } drop(subarray); - $array = temp.into_read_only(); + $array = temp.into_read_only().block(); }}; } diff --git a/tests/array/rdma/get_test.rs b/tests/array/rdma/get_test.rs index 261d53f2..074456a4 100644 --- a/tests/array/rdma/get_test.rs +++ b/tests/array/rdma/get_test.rs @@ -48,14 +48,14 @@ macro_rules! 
initialize_array { }; (ReadOnlyArray,$array:ident,$t:ty) => { // println!("into unsafe"); - let temp = $array.into_unsafe(); + let temp = $array.into_unsafe().block(); // println!("unsafe"); unsafe { temp.dist_iter_mut() .enumerate() .for_each(move |(i, x)| *x = i as $t) .block(); - $array = temp.into_read_only(); + $array = temp.into_read_only().block(); } }; } @@ -97,7 +97,7 @@ macro_rules! initialize_array_range { }}; (ReadOnlyArray,$array:ident,$t:ty,$range:expr) => {{ // println!("into unsafe"); - let temp = $array.into_unsafe(); + let temp = $array.into_unsafe().block(); // println!("unsafe"); unsafe { let subarray = temp.sub_array($range); @@ -109,7 +109,7 @@ macro_rules! initialize_array_range { drop(subarray); } println!("into read only"); - $array = temp.into_read_only(); + $array = temp.into_read_only().block(); println!("read only"); }}; } From 9d6010522f32797e3bb8411a1b5d7803c129f38b Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Mon, 11 Nov 2024 16:12:48 -0800 Subject: [PATCH 107/116] all test/doc test/examples compiling + passing --- src/array/generic_atomic.rs | 35 +++++++++------------------------ src/array/global_lock_atomic.rs | 2 +- src/array/local_lock_atomic.rs | 2 +- tests/add.rs | 2 +- tests/and.rs | 2 +- tests/blocking_get.rs | 2 +- tests/compare_exchange.rs | 2 +- tests/div.rs | 2 +- tests/fetch_add.rs | 4 ++-- tests/fetch_and.rs | 2 +- tests/fetch_div.rs | 2 +- tests/fetch_mul.rs | 2 +- tests/fetch_or.rs | 2 +- tests/fetch_rem.rs | 2 +- tests/fetch_sub.rs | 2 +- tests/fetch_xor.rs | 2 +- tests/get.rs | 2 +- tests/load_store.rs | 2 +- tests/mul.rs | 2 +- tests/or.rs | 2 +- tests/put.rs | 2 +- tests/rem.rs | 2 +- tests/sub.rs | 2 +- tests/swap.rs | 2 +- tests/xor.rs | 2 +- 25 files changed, 34 insertions(+), 51 deletions(-) diff --git a/src/array/generic_atomic.rs b/src/array/generic_atomic.rs index 96adb94f..64d69bb2 100644 --- a/src/array/generic_atomic.rs +++ b/src/array/generic_atomic.rs @@ -599,14 +599,14 @@ impl GenericAtomicArray { } } -impl TeamFrom<(Vec, Distribution)> for GenericAtomicArray { - fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { - let (vals, distribution) = input; - let input = (&vals, distribution); - let array: UnsafeArray = TeamInto::team_into(input, team); - array.into() - } -} +// impl TeamFrom<(Vec, Distribution)> for GenericAtomicArray { +// fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { +// let (vals, distribution) = input; +// let input = (&vals, distribution); +// let array: UnsafeArray = TeamInto::team_into(input, team); +// array.into() +// } +// } // #[async_trait] impl AsyncTeamFrom<(Vec, Distribution)> for GenericAtomicArray { @@ -616,23 +616,6 @@ impl AsyncTeamFrom<(Vec, Distribution)> for GenericAtomic } } -impl From> for GenericAtomicArray { - fn from(array: UnsafeArray) -> Self { - // println!("generic from unsafe array"); - array.block_on_outstanding(DarcMode::GenericAtomicArray); - let mut vec = vec![]; - for _i in 0..array.num_elems_local() { - vec.push(Mutex::new(())); - } - let locks = Darc::new(array.team_rt(), vec).block().unwrap(); - - GenericAtomicArray { - locks: locks, - array: array, - } - } -} - #[async_trait] impl AsyncFrom> for GenericAtomicArray { async fn async_from(array: UnsafeArray) -> Self { @@ -644,7 +627,7 @@ impl AsyncFrom> for GenericAtomicArray { for _i in 0..array.num_elems_local() { vec.push(Mutex::new(())); } - let locks = Darc::new(array.team_rt(), vec).block().unwrap(); + let locks = Darc::new(array.team_rt(), vec).await.expect("PE in team"); 
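        // Note (editorial comment, not part of the patch): this `async_from` constructor now
        // awaits the Darc construction handle instead of calling `.block().unwrap()`, so
        // building the per-element lock vector cannot stall the executor thread that is
        // driving the conversion; failure to join the team surfaces through `.expect(...)`.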
GenericAtomicArray { locks: locks, diff --git a/src/array/global_lock_atomic.rs b/src/array/global_lock_atomic.rs index dbd0058e..14b146d0 100644 --- a/src/array/global_lock_atomic.rs +++ b/src/array/global_lock_atomic.rs @@ -745,7 +745,7 @@ impl AsyncFrom> for GlobalLockArray { async fn async_from(array: UnsafeArray) -> Self { // println!("GlobalLock from unsafe"); array.await_on_outstanding(DarcMode::GlobalLockArray).await; - let lock = GlobalRwDarc::new(array.team_rt(), ()).block().unwrap(); + let lock = GlobalRwDarc::new(array.team_rt(), ()).await.expect("PE in team"); GlobalLockArray { lock: lock, diff --git a/src/array/local_lock_atomic.rs b/src/array/local_lock_atomic.rs index cd236a16..21a8f7c9 100644 --- a/src/array/local_lock_atomic.rs +++ b/src/array/local_lock_atomic.rs @@ -713,7 +713,7 @@ impl AsyncFrom> for LocalLockArray { async fn async_from(array: UnsafeArray) -> Self { // println!("locallock from unsafe"); array.await_on_outstanding(DarcMode::LocalLockArray).await; - let lock = LocalRwDarc::new(array.team_rt(), ()).block().unwrap(); + let lock = LocalRwDarc::new(array.team_rt(), ()).await.expect("PE in team"); LocalLockArray { lock: lock, diff --git a/tests/add.rs b/tests/add.rs index 15995de0..598b916b 100644 --- a/tests/add.rs +++ b/tests/add.rs @@ -35,7 +35,7 @@ macro_rules! create_test { .arg("-T=4") .arg("./target/release/examples/add_test") .arg(stringify!($array)) - .arg($dist).block(); + .arg($dist) .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); diff --git a/tests/and.rs b/tests/and.rs index 7ff22aba..33f1fc9b 100644 --- a/tests/and.rs +++ b/tests/and.rs @@ -16,7 +16,7 @@ macro_rules! create_test { .arg("-T=4") .arg("./target/release/examples/and_test") .arg(stringify!($array)) - .arg($dist).block(); + .arg($dist) .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); diff --git a/tests/blocking_get.rs b/tests/blocking_get.rs index 60636b46..63a04e75 100644 --- a/tests/blocking_get.rs +++ b/tests/blocking_get.rs @@ -35,7 +35,7 @@ macro_rules! create_test { .arg("-T=4") .arg("./target/release/examples/blocking_get_test") .arg(stringify!($array)) - .arg($dist).block(); + .arg($dist) .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); diff --git a/tests/compare_exchange.rs b/tests/compare_exchange.rs index bed1c09e..e8bbaf27 100644 --- a/tests/compare_exchange.rs +++ b/tests/compare_exchange.rs @@ -16,7 +16,7 @@ macro_rules! create_test { .arg("-T=4") .arg("./target/release/examples/compare_exchange_test") .arg(stringify!($array)) - .arg($dist).block(); + .arg($dist) .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); diff --git a/tests/div.rs b/tests/div.rs index f397d026..e436a16f 100644 --- a/tests/div.rs +++ b/tests/div.rs @@ -35,7 +35,7 @@ macro_rules! create_test { .arg("-T=4") .arg("./target/release/examples/div_test") .arg(stringify!($array)) - .arg($dist).block(); + .arg($dist) .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); diff --git a/tests/fetch_add.rs b/tests/fetch_add.rs index 678571c1..9bf729ff 100644 --- a/tests/fetch_add.rs +++ b/tests/fetch_add.rs @@ -17,7 +17,7 @@ macro_rules! create_test { .arg("-T=4") .arg("./target/release/examples/fetch_add_test") .arg(stringify!($array)) - .arg($dist).block(); + .arg($dist) .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); @@ -39,7 +39,7 @@ macro_rules! 
create_test { .arg("--mpi=pmi2") .arg("./target/release/examples/fetch_add_test") .arg(stringify!($array)) - .arg($dist).block(); + .arg($dist) .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); diff --git a/tests/fetch_and.rs b/tests/fetch_and.rs index e773e402..ac4ec00e 100644 --- a/tests/fetch_and.rs +++ b/tests/fetch_and.rs @@ -16,7 +16,7 @@ macro_rules! create_test { .arg("-T=4") .arg("./target/release/examples/fetch_and_test") .arg(stringify!($array)) - .arg($dist).block(); + .arg($dist) .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); diff --git a/tests/fetch_div.rs b/tests/fetch_div.rs index 8634d580..2aaaea76 100644 --- a/tests/fetch_div.rs +++ b/tests/fetch_div.rs @@ -35,7 +35,7 @@ macro_rules! create_test { .arg("-T=4") .arg("./target/release/examples/fetch_div_test") .arg(stringify!($array)) - .arg($dist).block(); + .arg($dist) .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); diff --git a/tests/fetch_mul.rs b/tests/fetch_mul.rs index 1922185b..3414d4b0 100644 --- a/tests/fetch_mul.rs +++ b/tests/fetch_mul.rs @@ -35,7 +35,7 @@ macro_rules! create_test { .arg("-T=4") .arg("./target/release/examples/fetch_mul_test") .arg(stringify!($array)) - .arg($dist).block(); + .arg($dist) .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); diff --git a/tests/fetch_or.rs b/tests/fetch_or.rs index c39ce267..61990a7f 100644 --- a/tests/fetch_or.rs +++ b/tests/fetch_or.rs @@ -16,7 +16,7 @@ macro_rules! create_test { .arg("-T=4") .arg("./target/release/examples/fetch_or_test") .arg(stringify!($array)) - .arg($dist).block(); + .arg($dist) .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); diff --git a/tests/fetch_rem.rs b/tests/fetch_rem.rs index 5d6794a4..6b73b1a1 100644 --- a/tests/fetch_rem.rs +++ b/tests/fetch_rem.rs @@ -35,7 +35,7 @@ macro_rules! create_test { .arg("-T=4") .arg("./target/release/examples/fetch_rem_test") .arg(stringify!($array)) - .arg($dist).block(); + .arg($dist) .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); diff --git a/tests/fetch_sub.rs b/tests/fetch_sub.rs index 814369ad..72b8754f 100644 --- a/tests/fetch_sub.rs +++ b/tests/fetch_sub.rs @@ -35,7 +35,7 @@ macro_rules! create_test { .arg("-T=4") .arg("./target/release/examples/fetch_sub_test") .arg(stringify!($array)) - .arg($dist).block(); + .arg($dist) .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); diff --git a/tests/fetch_xor.rs b/tests/fetch_xor.rs index d4c40471..bc935831 100644 --- a/tests/fetch_xor.rs +++ b/tests/fetch_xor.rs @@ -16,7 +16,7 @@ macro_rules! create_test { .arg("-T=4") .arg("./target/release/examples/fetch_xor_test") .arg(stringify!($array)) - .arg($dist).block(); + .arg($dist) .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); diff --git a/tests/get.rs b/tests/get.rs index 6b96dd8d..42042f22 100644 --- a/tests/get.rs +++ b/tests/get.rs @@ -16,7 +16,7 @@ macro_rules! create_test { .arg("-T=4") .arg("./target/release/examples/get_test") .arg(stringify!($array)) - .arg($dist).block(); + .arg($dist) .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); diff --git a/tests/load_store.rs b/tests/load_store.rs index 252f65c6..6ec8e0d6 100644 --- a/tests/load_store.rs +++ b/tests/load_store.rs @@ -16,7 +16,7 @@ macro_rules! 
create_test { .arg("-T=4") .arg("./target/release/examples/load_store_test") .arg(stringify!($array)) - .arg($dist).block(); + .arg($dist) .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); diff --git a/tests/mul.rs b/tests/mul.rs index a16af4fa..4a8bd5e7 100644 --- a/tests/mul.rs +++ b/tests/mul.rs @@ -35,7 +35,7 @@ macro_rules! create_test { .arg("-T=4") .arg("./target/release/examples/mul_test") .arg(stringify!($array)) - .arg($dist).block(); + .arg($dist) .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); diff --git a/tests/or.rs b/tests/or.rs index 796ee59b..59226374 100644 --- a/tests/or.rs +++ b/tests/or.rs @@ -16,7 +16,7 @@ macro_rules! create_test { .arg("-T=4") .arg("./target/release/examples/or_test") .arg(stringify!($array)) - .arg($dist).block(); + .arg($dist) .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); diff --git a/tests/put.rs b/tests/put.rs index b67e4d99..5408561d 100644 --- a/tests/put.rs +++ b/tests/put.rs @@ -35,7 +35,7 @@ macro_rules! create_test { .arg("-T=4") .arg("./target/release/examples/put_test") .arg(stringify!($array)) - .arg($dist).block(); + .arg($dist) .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); diff --git a/tests/rem.rs b/tests/rem.rs index 9644494b..3867436d 100644 --- a/tests/rem.rs +++ b/tests/rem.rs @@ -35,7 +35,7 @@ macro_rules! create_test { .arg("-T=4") .arg("./target/release/examples/rem_test") .arg(stringify!($array)) - .arg($dist).block(); + .arg($dist) .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); diff --git a/tests/sub.rs b/tests/sub.rs index 71777929..509f76a5 100644 --- a/tests/sub.rs +++ b/tests/sub.rs @@ -35,7 +35,7 @@ macro_rules! create_test { .arg("-T=4") .arg("./target/release/examples/sub_test") .arg(stringify!($array)) - .arg($dist).block(); + .arg($dist) .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); diff --git a/tests/swap.rs b/tests/swap.rs index e9db89d5..e77160c6 100644 --- a/tests/swap.rs +++ b/tests/swap.rs @@ -16,7 +16,7 @@ macro_rules! create_test { .arg("-T=4") .arg("./target/release/examples/swap_test") .arg(stringify!($array)) - .arg($dist).block(); + .arg($dist) .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); diff --git a/tests/xor.rs b/tests/xor.rs index 758bae6c..25bb66cc 100644 --- a/tests/xor.rs +++ b/tests/xor.rs @@ -16,7 +16,7 @@ macro_rules! create_test { .arg("-T=4") .arg("./target/release/examples/xor_test") .arg(stringify!($array)) - .arg($dist).block(); + .arg($dist) .arg(stringify!($elem)) .arg(stringify!($len)) .assert(); From 89c43acb8828ebb1b761211dba7a2e405e5de2f7 Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Tue, 12 Nov 2024 22:26:27 -0800 Subject: [PATCH 108/116] clean up warnings --- src/active_messaging/prelude.rs | 2 +- src/array.rs | 201 ++++++++++++------ src/array/atomic.rs | 61 +----- src/array/atomic/iteration.rs | 5 +- src/array/generic_atomic.rs | 19 +- src/array/generic_atomic/handle.rs | 19 -- src/array/generic_atomic/rdma.rs | 6 +- src/array/global_lock_atomic.rs | 48 +---- src/array/global_lock_atomic/rdma.rs | 6 +- src/array/iterator/distributed_iterator.rs | 5 +- .../distributed_iterator/consumer/collect.rs | 3 +- src/array/iterator/local_iterator.rs | 5 +- .../local_iterator/consumer/collect.rs | 3 +- .../iterator/one_sided_iterator/chunks.rs | 10 +- src/array/local_lock_atomic.rs | 49 +---- src/array/local_lock_atomic/rdma.rs | 6 +- src/array/local_only.rs | 2 +- src/array/native_atomic.rs | 33 +-- src/array/native_atomic/handle.rs | 19 -- src/array/native_atomic/rdma.rs | 6 +- src/array/operations/access.rs | 9 +- src/array/operations/arithmetic.rs | 9 +- src/array/operations/bitwise.rs | 9 +- src/array/operations/compare_exchange.rs | 18 +- src/array/operations/read_only.rs | 5 +- src/array/operations/shift.rs | 41 +++- src/array/prelude.rs | 1 - src/array/read_only.rs | 62 +----- src/array/unsafe.rs | 30 +-- src/array/unsafe/iteration/consumer.rs | 2 +- src/array/unsafe/iteration/distributed.rs | 8 +- src/array/unsafe/iteration/local.rs | 9 +- src/array/unsafe/rdma.rs | 10 +- src/darc.rs | 60 ------ src/darc/prelude.rs | 1 - src/lamellae/command_queues.rs | 18 +- src/lamellar_alloc.rs | 3 + src/lamellar_team.rs | 38 +--- src/lib.rs | 3 +- src/memregion.rs | 16 +- src/memregion/one_sided.rs | 16 +- src/memregion/prelude.rs | 1 - src/memregion/shared.rs | 10 +- src/warnings.rs | 1 + 44 files changed, 325 insertions(+), 563 deletions(-) diff --git a/src/active_messaging/prelude.rs b/src/active_messaging/prelude.rs index 0dcab32a..6cfb237f 100644 --- a/src/active_messaging/prelude.rs +++ b/src/active_messaging/prelude.rs @@ -15,7 +15,7 @@ pub use crate::inventory; pub use crate::lamellar_arch::*; pub use crate::lamellar_team::LamellarTeam; //#[doc(hidden)] -pub use crate::lamellar_team::{IntoLamellarTeam, LamellarTeamRT}; +pub use crate::lamellar_team::{IntoLamellarTeam}; pub use crate::lamellar_world::LamellarWorld; pub use crate::lamellar_world::LamellarWorldBuilder; pub use crate::LamellarEnv; diff --git a/src/array.rs b/src/array.rs index d5f30a93..ffa71295 100644 --- a/src/array.rs +++ b/src/array.rs @@ -304,8 +304,8 @@ impl LamellarRead for &[T] {} impl TeamFrom<&T> for LamellarArrayRdmaInput { /// Constructs a single element [OneSidedMemoryRegion] and copies `val` into it - fn team_from(val: &T, team: &Pin>) -> Self { - let buf: OneSidedMemoryRegion = team.alloc_one_sided_mem_region(1); + fn team_from(val: &T, team: &Arc) -> Self { + let buf: OneSidedMemoryRegion = team.team.alloc_one_sided_mem_region(1); unsafe { buf.as_mut_slice().expect("Data should exist on PE")[0] = val.clone(); } @@ -315,8 +315,8 @@ impl TeamFrom<&T> for LamellarArrayRdmaInput { impl TeamFrom for LamellarArrayRdmaInput { /// Constructs a single element [OneSidedMemoryRegion] and copies `val` into it - fn team_from(val: T, team: &Pin>) -> Self { - let buf: OneSidedMemoryRegion = team.alloc_one_sided_mem_region(1); + fn team_from(val: T, team: &Arc) -> Self { + let buf: OneSidedMemoryRegion = team.team.alloc_one_sided_mem_region(1); unsafe { buf.as_mut_slice().expect("Data should exist on PE")[0] = val; } @@ -326,8 +326,8 @@ impl TeamFrom for LamellarArrayRdmaInput { impl TeamFrom> 
for LamellarArrayRdmaInput { /// Constructs a [OneSidedMemoryRegion] equal in length to `vals` and copies `vals` into it - fn team_from(vals: Vec, team: &Pin>) -> Self { - let buf: OneSidedMemoryRegion = team.alloc_one_sided_mem_region(vals.len()); + fn team_from(vals: Vec, team: &Arc) -> Self { + let buf: OneSidedMemoryRegion = team.team.alloc_one_sided_mem_region(vals.len()); unsafe { std::ptr::copy_nonoverlapping( vals.as_ptr(), @@ -340,8 +340,8 @@ impl TeamFrom> for LamellarArrayRdmaInput { } impl TeamFrom<&Vec> for LamellarArrayRdmaInput { /// Constructs a [OneSidedMemoryRegion] equal in length to `vals` and copies `vals` into it - fn team_from(vals: &Vec, team: &Pin>) -> Self { - let buf: OneSidedMemoryRegion = team.alloc_one_sided_mem_region(vals.len()); + fn team_from(vals: &Vec, team: &Arc) -> Self { + let buf: OneSidedMemoryRegion = team.team.alloc_one_sided_mem_region(vals.len()); unsafe { std::ptr::copy_nonoverlapping( vals.as_ptr(), @@ -354,8 +354,8 @@ impl TeamFrom<&Vec> for LamellarArrayRdmaInput { } impl TeamFrom<&[T]> for LamellarArrayRdmaInput { /// Constructs a [OneSidedMemoryRegion] equal in length to `vals` and copies `vals` into it - fn team_from(vals: &[T], team: &Pin>) -> Self { - let buf: OneSidedMemoryRegion = team.alloc_one_sided_mem_region(vals.len()); + fn team_from(vals: &[T], team: &Arc) -> Self { + let buf: OneSidedMemoryRegion = team.team.alloc_one_sided_mem_region(vals.len()); unsafe { std::ptr::copy_nonoverlapping( vals.as_ptr(), @@ -368,43 +368,43 @@ impl TeamFrom<&[T]> for LamellarArrayRdmaInput { } impl TeamFrom<&LamellarArrayRdmaInput> for LamellarArrayRdmaInput { - fn team_from(lai: &LamellarArrayRdmaInput, _team: &Pin>) -> Self { + fn team_from(lai: &LamellarArrayRdmaInput, _team: &Arc) -> Self { lai.clone() } } impl TeamFrom<&LamellarArrayRdmaOutput> for LamellarArrayRdmaOutput { - fn team_from(lao: &LamellarArrayRdmaOutput, _team: &Pin>) -> Self { + fn team_from(lao: &LamellarArrayRdmaOutput, _team: &Arc) -> Self { lao.clone() } } impl TeamFrom<(&Vec, Distribution)> for Vec { - fn team_from(vals: (&Vec, Distribution), _team: &Pin>) -> Self { + fn team_from(vals: (&Vec, Distribution), _team: &Arc) -> Self { vals.0.to_vec() } } impl TeamFrom<(Vec, Distribution)> for Vec { - fn team_from(vals: (Vec, Distribution), _team: &Pin>) -> Self { + fn team_from(vals: (Vec, Distribution), _team: &Arc) -> Self { vals.0.to_vec() } } impl TeamTryFrom<&T> for LamellarArrayRdmaInput { - fn team_try_from(val: &T, team: &Pin>) -> Result { + fn team_try_from(val: &T, team: &Arc) -> Result { Ok(LamellarArrayRdmaInput::team_from(val, team)) } } impl TeamTryFrom for LamellarArrayRdmaInput { - fn team_try_from(val: T, team: &Pin>) -> Result { + fn team_try_from(val: T, team: &Arc) -> Result { Ok(LamellarArrayRdmaInput::team_from(val, team)) } } impl TeamTryFrom> for LamellarArrayRdmaInput { - fn team_try_from(val: Vec, team: &Pin>) -> Result { + fn team_try_from(val: Vec, team: &Arc) -> Result { if val.len() == 0 { Err(anyhow::anyhow!( "Trying to create an empty LamellarArrayRdmaInput" @@ -416,7 +416,7 @@ impl TeamTryFrom> for LamellarArrayRdmaInput { } impl TeamTryFrom<&Vec> for LamellarArrayRdmaInput { - fn team_try_from(val: &Vec, team: &Pin>) -> Result { + fn team_try_from(val: &Vec, team: &Arc) -> Result { if val.len() == 0 { Err(anyhow::anyhow!( "Trying to create an empty LamellarArrayRdmaInput" @@ -428,7 +428,7 @@ impl TeamTryFrom<&Vec> for LamellarArrayRdmaInput { } impl TeamTryFrom<&[T]> for LamellarArrayRdmaInput { - fn team_try_from(val: &[T], team: &Pin>) -> 
Result { + fn team_try_from(val: &[T], team: &Arc) -> Result { if val.len() == 0 { Err(anyhow::anyhow!( "Trying to create an empty LamellarArrayRdmaInput" @@ -442,7 +442,7 @@ impl TeamTryFrom<&[T]> for LamellarArrayRdmaInput { impl TeamTryFrom<&LamellarArrayRdmaInput> for LamellarArrayRdmaInput { fn team_try_from( lai: &LamellarArrayRdmaInput, - _team: &Pin>, + _team: &Arc, ) -> Result { Ok(lai.clone()) } @@ -451,7 +451,7 @@ impl TeamTryFrom<&LamellarArrayRdmaInput> for LamellarArrayRdmaInput impl TeamTryFrom<&LamellarArrayRdmaOutput> for LamellarArrayRdmaOutput { fn team_try_from( lao: &LamellarArrayRdmaOutput, - _team: &Pin>, + _team: &Arc, ) -> Result { Ok(lao.clone()) } @@ -460,7 +460,7 @@ impl TeamTryFrom<&LamellarArrayRdmaOutput> for LamellarArrayRdmaOutp impl TeamTryFrom<(&Vec, Distribution)> for Vec { fn team_try_from( vals: (&Vec, Distribution), - _team: &Pin>, + _team: &Arc, ) -> Result { Ok(vals.0.to_vec()) } @@ -468,14 +468,14 @@ impl TeamTryFrom<(&Vec, Distribution)> for Vec { // #[async_trait] // impl AsyncTeamFrom<(&Vec, Distribution)> for Vec { -// async fn team_from(vals: (&Vec, Distribution), _team: &Pin>) -> Self { +// async fn team_from(vals: (&Vec, Distribution), _team: &Arc) -> Self { // vals.0.to_vec() // } // } // #[async_trait] impl AsyncTeamFrom<(Vec, Distribution)> for Vec { - async fn team_from(input: (Vec, Distribution), _team: &Pin>) -> Self { + async fn team_from(input: (Vec, Distribution), _team: &Arc) -> Self { input.0 } } @@ -527,7 +527,7 @@ where /// Provides the same abstraction as the `From` trait in the standard language, but with a `team` parameter so that lamellar memory regions can be allocated pub trait TeamFrom { /// Converts to this type from the input type - fn team_from(val: T, team: &Pin>) -> Self; + fn team_from(val: T, team: &Arc) -> Self; } // #[async_trait] @@ -536,41 +536,41 @@ pub trait TeamFrom { // pub trait AsyncTeamFrom: TeamFrom + Sized { pub trait AsyncTeamFrom: Sized { /// Converts to this type from the input type - fn team_from(val: T, team: &Pin>) -> impl Future + Send; + fn team_from(val: T, team: &Arc) -> impl Future + Send; } /// Provides the same abstraction as the `TryFrom` trait in the standard language, but with a `team` parameter so that lamellar memory regions can be allocated pub trait TeamTryFrom { /// Trys to convert to this type from the input type - fn team_try_from(val: T, team: &Pin>) -> Result + fn team_try_from(val: T, team: &Arc) -> Result where Self: Sized; } /// Provides the same abstraction as the `Into` trait in the standard language, but with a `team` parameter so that lamellar memory regions can be allocated pub trait TeamInto { /// converts this type into the (usually inferred) input type - fn team_into(self, team: &Pin>) -> T; + fn team_into(self, team: &Arc) -> T; } /// Provides the same abstraction as the `Into` trait in the standard language, but with a `team` parameter so that lamellar memory regions can be allocated to be used within an async context #[async_trait] pub trait AsyncTeamInto { /// converts this type into the (usually inferred) input type - async fn team_into(self, team: &Pin>) -> T; + async fn team_into(self, team: &Arc) -> T; } /// Provides the same abstraction as the `TryInto` trait in the standard language, but with a `team` parameter so that lamellar memory regions can be allocated pub trait TeamTryInto: Sized { /// Trys to convert this type into the (usually inferred) input type - fn team_try_into(self, team: &Pin>) -> Result; + fn team_try_into(self, team: &Arc) -> Result; } 
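Editorial aside, not part of the patch: the TeamFrom / TeamInto / TeamTryFrom traits being touched here mirror the standard library's From / Into / TryFrom, except that a team handle is threaded through every conversion so team-scoped memory can be allocated, and this commit changes that handle parameter from a pinned runtime-team pointer to an Arc-wrapped LamellarTeam. A minimal, self-contained sketch of the pattern, using toy stand-in types (Team, TeamBuffer) rather than Lamellar's real API, looks like this:

use std::sync::Arc;

// Stand-in for a team handle; the real Lamellar type carries far more state.
struct Team {
    id: usize,
}

// `From`-like conversion that also receives the team performing the allocation.
trait TeamFrom<T> {
    fn team_from(val: T, team: &Arc<Team>) -> Self;
}

// `Into`-like counterpart, provided automatically by the blanket impl below,
// mirroring how std wires From to Into.
trait TeamInto<U> {
    fn team_into(self, team: &Arc<Team>) -> U;
}

impl<T, U> TeamInto<U> for T
where
    U: TeamFrom<T>,
{
    fn team_into(self, team: &Arc<Team>) -> U {
        U::team_from(self, team)
    }
}

// Toy target type standing in for a team-allocated buffer such as a
// one-sided memory region.
struct TeamBuffer {
    data: Vec<u8>,
    team_id: usize,
}

impl TeamFrom<Vec<u8>> for TeamBuffer {
    fn team_from(val: Vec<u8>, team: &Arc<Team>) -> Self {
        // A real implementation would allocate memory registered with `team`
        // and copy `val` into it; here we just record which team was used.
        TeamBuffer {
            data: val,
            team_id: team.id,
        }
    }
}

fn main() {
    let team = Arc::new(Team { id: 0 });
    // Callers only ever see the ergonomic `team_into` side of the pair.
    let buf: TeamBuffer = vec![1u8, 2, 3].team_into(&team);
    assert_eq!((buf.team_id, buf.data.len()), (0, 3));
}

The blanket impl that follows in the actual diff is the library's own version of the same wiring, so implementors only ever write the TeamFrom side.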
impl TeamInto for T where U: TeamFrom, { - fn team_into(self, team: &Pin>) -> U { + fn team_into(self, team: &Arc) -> U { U::team_from(self, team) } } @@ -580,7 +580,7 @@ impl AsyncTeamInto for T where U: AsyncTeamFrom, { - async fn team_into(self, team: &Pin>) -> U { + async fn team_into(self, team: &Arc) -> U { >::team_from(self, team).await } } @@ -589,7 +589,7 @@ impl TeamTryInto for T where U: TeamTryFrom, { - fn team_try_into(self, team: &Pin>) -> Result { + fn team_try_into(self, team: &Arc) -> Result { U::team_try_from(self, team) } } @@ -660,29 +660,6 @@ impl LamellarByteArray { LamellarByteArray::GlobalLockArray(array) => array.array.inner.data.team(), } } - pub(crate) fn dec_outstanding(&self, num: usize) { - match self { - LamellarByteArray::UnsafeArray(array) => { - array.inner.data.array_counters.dec_outstanding(num) - } - LamellarByteArray::ReadOnlyArray(array) => { - array.array.inner.data.array_counters.dec_outstanding(num) - } - LamellarByteArray::AtomicArray(array) => array.dec_outstanding(num), - LamellarByteArray::NativeAtomicArray(array) => { - array.array.inner.data.array_counters.dec_outstanding(num) - } - LamellarByteArray::GenericAtomicArray(array) => { - array.array.inner.data.array_counters.dec_outstanding(num) - } - LamellarByteArray::LocalLockArray(array) => { - array.array.inner.data.array_counters.dec_outstanding(num) - } - LamellarByteArray::GlobalLockArray(array) => { - array.array.inner.data.array_counters.dec_outstanding(num) - } - } - } } impl crate::active_messaging::DarcSerde for LamellarReadArray { @@ -823,6 +800,58 @@ impl ActiveMessaging for LamellarReadArray { } } +impl LamellarEnv for LamellarReadArray { + fn my_pe(&self) -> usize { + match self { + LamellarReadArray::UnsafeArray(array) => array.my_pe(), + LamellarReadArray::ReadOnlyArray(array) => array.my_pe(), + LamellarReadArray::AtomicArray(array) => array.my_pe(), + LamellarReadArray::LocalLockArray(array) => array.my_pe(), + LamellarReadArray::GlobalLockArray(array) => array.my_pe(), + } + } + + fn num_pes(&self) -> usize { + match self { + LamellarReadArray::UnsafeArray(array) => array.num_pes(), + LamellarReadArray::ReadOnlyArray(array) => array.num_pes(), + LamellarReadArray::AtomicArray(array) => array.num_pes(), + LamellarReadArray::LocalLockArray(array) => array.num_pes(), + LamellarReadArray::GlobalLockArray(array) => array.num_pes(), + } + } + + fn num_threads_per_pe(&self) -> usize { + match self { + LamellarReadArray::UnsafeArray(array) => array.num_threads_per_pe(), + LamellarReadArray::ReadOnlyArray(array) => array.num_threads_per_pe(), + LamellarReadArray::AtomicArray(array) => array.num_threads_per_pe(), + LamellarReadArray::LocalLockArray(array) => array.num_threads_per_pe(), + LamellarReadArray::GlobalLockArray(array) => array.num_threads_per_pe(), + } + } + + fn world(&self) -> Arc { + match self { + LamellarReadArray::UnsafeArray(array) => array.world(), + LamellarReadArray::ReadOnlyArray(array) => array.world(), + LamellarReadArray::AtomicArray(array) => array.world(), + LamellarReadArray::LocalLockArray(array) => array.world(), + LamellarReadArray::GlobalLockArray(array) => array.world(), + } + } + + fn team(&self) -> Arc { + match self { + LamellarReadArray::UnsafeArray(array) => array.team(), + LamellarReadArray::ReadOnlyArray(array) => array.team(), + LamellarReadArray::AtomicArray(array) => array.team(), + LamellarReadArray::LocalLockArray(array) => array.team(), + LamellarReadArray::GlobalLockArray(array) => array.team(), + } + } +} + /// Represents the array types 
that allow write operations #[enum_dispatch] #[derive(serde::Serialize, serde::Deserialize, Clone)] @@ -964,6 +993,50 @@ impl ActiveMessaging for LamellarWriteArray { } } +impl LamellarEnv for LamellarWriteArray { + fn my_pe(&self) -> usize { + match self { + LamellarWriteArray::UnsafeArray(array) => array.my_pe(), + LamellarWriteArray::AtomicArray(array) => array.my_pe(), + LamellarWriteArray::LocalLockArray(array) => array.my_pe(), + LamellarWriteArray::GlobalLockArray(array) => array.my_pe(), + } + } + fn num_pes(&self) -> usize { + match self { + LamellarWriteArray::UnsafeArray(array) => array.num_pes(), + LamellarWriteArray::AtomicArray(array) => array.num_pes(), + LamellarWriteArray::LocalLockArray(array) => array.num_pes(), + LamellarWriteArray::GlobalLockArray(array) => array.num_pes(), + } + } + fn num_threads_per_pe(&self) -> usize { + match self { + LamellarWriteArray::UnsafeArray(array) => array.num_threads_per_pe(), + LamellarWriteArray::AtomicArray(array) => array.num_threads_per_pe(), + LamellarWriteArray::LocalLockArray(array) => array.num_threads_per_pe(), + LamellarWriteArray::GlobalLockArray(array) => array.num_threads_per_pe(), + } + } + fn world(&self) -> Arc { + match self { + LamellarWriteArray::UnsafeArray(array) => array.world(), + LamellarWriteArray::AtomicArray(array) => array.world(), + LamellarWriteArray::LocalLockArray(array) => array.world(), + LamellarWriteArray::GlobalLockArray(array) => array.world(), + } + } + fn team(&self) -> Arc { + match self { + LamellarWriteArray::UnsafeArray(array) => array.team(), + LamellarWriteArray::AtomicArray(array) => array.team(), + LamellarWriteArray::LocalLockArray(array) => array.team(), + LamellarWriteArray::GlobalLockArray(array) => array.team(), + } + } +} + + // private sealed trait #[doc(hidden)] pub trait InnerArray: Sized { @@ -997,26 +1070,26 @@ pub(crate) mod private { //#[doc(hidden)] #[enum_dispatch(LamellarReadArray,LamellarWriteArray)] pub(crate) trait ArrayExecAm { - fn team(&self) -> Pin>; + fn team_rt(&self) -> Pin>; fn team_counters(&self) -> Arc; fn exec_am_local_tg(&self, am: F) -> LocalAmHandle where F: LamellarActiveMessage + LocalAM + 'static, { - self.team().exec_am_local_tg(am, Some(self.team_counters())) + self.team_rt().exec_am_local_tg(am, Some(self.team_counters())) } fn exec_am_pe_tg(&self, pe: usize, am: F) -> AmHandle where F: RemoteActiveMessage + LamellarAM + AmDist, { - self.team() + self.team_rt() .exec_am_pe_tg(pe, am, Some(self.team_counters())) } fn spawn_am_pe_tg(&self, pe: usize, am: F) -> AmHandle where F: RemoteActiveMessage + LamellarAM + AmDist, { - self.team() + self.team_rt() .spawn_am_pe_tg(pe, am, Some(self.team_counters())) } // fn exec_arc_am_pe(&self, pe: usize, am: LamellarArcAm) -> AmHandle @@ -1030,7 +1103,7 @@ pub(crate) mod private { where F: RemoteActiveMessage + LamellarAM + AmDist, { - self.team().exec_am_all_tg(am, Some(self.team_counters())) + self.team_rt().exec_am_all_tg(am, Some(self.team_counters())) } } } @@ -1038,8 +1111,8 @@ pub(crate) mod private { /// Represents a distributed array, providing some convenience functions for getting simple information about the array. /// This is mostly intended for use within the runtime (specifically for use in Proc Macros) but the available functions may be useful to endusers as well. 
#[enum_dispatch(LamellarReadArray,LamellarWriteArray)] -pub trait LamellarArray: private::LamellarArrayPrivate + ActiveMessaging { - #[doc(alias("One-sided", "onesided"))] +pub trait LamellarArray: private::LamellarArrayPrivate + ActiveMessaging + LamellarEnv { + // #[doc(alias("One-sided", "onesided"))] /// Returns the team used to construct this array, the PEs in the team represent the same PEs which have a slice of data of the array /// /// # One-sided Operation @@ -1053,7 +1126,7 @@ pub trait LamellarArray: private::LamellarArrayPrivate + ActiveMessa /// /// let a_team = array.team(); ///``` - fn team_rt(&self) -> Pin>; //todo turn this into Arc + // fn team(&self) -> Arc; //todo turn this into Arc #[doc(alias("One-sided", "onesided"))] /// Return the total number of elements in this array diff --git a/src/array/atomic.rs b/src/array/atomic.rs index 9d8c5504..9aa3041a 100644 --- a/src/array/atomic.rs +++ b/src/array/atomic.rs @@ -10,7 +10,6 @@ use crate::array::generic_atomic::{GenericAtomicElement, LocalGenericAtomicEleme use crate::array::iterator::distributed_iterator::DistIteratorLauncher; use crate::array::iterator::local_iterator::LocalIteratorLauncher; use crate::array::native_atomic::NativeAtomicElement; -use crate::array::private::LamellarArrayPrivate; use crate::array::*; // use crate::darc::{Darc, DarcMode}; use crate::barrier::BarrierHandle; @@ -678,17 +677,6 @@ impl AtomicByteArray { AtomicByteArray::GenericAtomicByteArray(array) => array.array.inner.data.team(), } } - - pub(crate) fn dec_outstanding(&self, num: usize) { - match self { - AtomicByteArray::NativeAtomicByteArray(array) => { - array.array.inner.data.array_counters.dec_outstanding(num) - } - AtomicByteArray::GenericAtomicByteArray(array) => { - array.array.inner.data.array_counters.dec_outstanding(num) - } - } - } } impl crate::active_messaging::DarcSerde for AtomicByteArray { @@ -1178,34 +1166,16 @@ impl AtomicArray { } } -// impl TeamFrom<(Vec, Distribution)> for AtomicArray { -// fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { -// let (vals, distribution) = input; -// let input = (&vals, distribution); -// let array: UnsafeArray = TeamInto::team_into(input, team); -// array.into() -// } -// } + // #[async_trait] impl AsyncTeamFrom<(Vec, Distribution)> for AtomicArray { - async fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { + async fn team_from(input: (Vec, Distribution), team: &Arc) -> Self { let array: UnsafeArray = AsyncTeamInto::team_into(input, team).await; array.async_into().await } } -// impl From> for AtomicArray { -// fn from(array: UnsafeArray) -> Self { -// // println!("Converting from UnsafeArray to AtomicArray"); -// if NATIVE_ATOMICS.contains(&TypeId::of::()) { -// NativeAtomicArray::from(array).into() -// } else { -// GenericAtomicArray::from(array).into() -// } -// } -// } - #[async_trait] impl AsyncFrom> for AtomicArray { async fn async_from(array: UnsafeArray) -> Self { @@ -1218,33 +1188,6 @@ impl AsyncFrom> for AtomicArray { } } -// impl From> for AtomicArray { -// fn from(array: LocalOnlyArray) -> Self { -// // println!("Converting from LocalOnlyArray to AtomicArray"); -// unsafe { array.into_inner().into() } -// } -// } - -// impl From> for AtomicArray { -// fn from(array: ReadOnlyArray) -> Self { -// // println!("Converting from ReadOnlyArray to AtomicArray"); -// unsafe { array.into_inner().into() } -// } -// } -// impl From> for AtomicArray { -// fn from(array: LocalLockArray) -> Self { -// // println!("Converting from LocalLockArray to AtomicArray"); 
-// unsafe { array.into_inner().into() } -// } -// } - -// impl From> for AtomicArray { -// fn from(array: GlobalLockArray) -> Self { -// // println!("Converting from GlobalLockArray to AtomicArray"); -// unsafe { array.into_inner().into() } -// } -// } - impl From> for AtomicByteArray { fn from(array: AtomicArray) -> Self { match array { diff --git a/src/array/atomic/iteration.rs b/src/array/atomic/iteration.rs index e32f1052..b9ab104b 100644 --- a/src/array/atomic/iteration.rs +++ b/src/array/atomic/iteration.rs @@ -9,6 +9,7 @@ use crate::array::iterator::{ }; use crate::array::r#unsafe::private::UnsafeArrayInner; use crate::array::*; +use crate::array::private::ArrayExecAm; use crate::memregion::Dist; use self::iterator::IterLockFuture; @@ -208,13 +209,13 @@ impl LamellarArrayIterators for AtomicArray { } fn onesided_iter(&self) -> Self::OnesidedIter { - OneSidedIter::new(self.clone(), LamellarArray::team_rt(self).clone(), 1) + OneSidedIter::new(self.clone(), self.team_rt(), 1) } fn buffered_onesided_iter(&self, buf_size: usize) -> Self::OnesidedIter { OneSidedIter::new( self.clone(), - LamellarArray::team_rt(self).clone(), + self.team_rt(), std::cmp::min(buf_size, self.len()), ) } diff --git a/src/array/generic_atomic.rs b/src/array/generic_atomic.rs index 64d69bb2..18a3cf27 100644 --- a/src/array/generic_atomic.rs +++ b/src/array/generic_atomic.rs @@ -599,18 +599,9 @@ impl GenericAtomicArray { } } -// impl TeamFrom<(Vec, Distribution)> for GenericAtomicArray { -// fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { -// let (vals, distribution) = input; -// let input = (&vals, distribution); -// let array: UnsafeArray = TeamInto::team_into(input, team); -// array.into() -// } -// } - // #[async_trait] impl AsyncTeamFrom<(Vec, Distribution)> for GenericAtomicArray { - async fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { + async fn team_from(input: (Vec, Distribution), team: &Arc) -> Self { let array: UnsafeArray = AsyncTeamInto::team_into(input, team).await; array.async_into().await } @@ -691,7 +682,7 @@ impl From for AtomicArray { } impl private::ArrayExecAm for GenericAtomicArray { - fn team(&self) -> Pin> { + fn team_rt(&self) -> Pin> { self.array.team_rt() } fn team_counters(&self) -> Arc { @@ -778,9 +769,9 @@ impl ActiveMessaging for GenericAtomicArray { } impl LamellarArray for GenericAtomicArray { - fn team_rt(&self) -> Pin> { - self.array.team_rt() - } + // fn team_rt(&self) -> Pin> { + // self.array.team_rt() + // } // fn my_pe(&self) -> usize { // LamellarArray::my_pe(&self.array) // } diff --git a/src/array/generic_atomic/handle.rs b/src/array/generic_atomic/handle.rs index f451d0b6..7635a64a 100644 --- a/src/array/generic_atomic/handle.rs +++ b/src/array/generic_atomic/handle.rs @@ -3,7 +3,6 @@ use std::sync::Arc; use std::task::{Context, Poll}; use super::{ArrayOps, GenericAtomicArray}; -use crate::scheduler::LamellarTask; use crate::warnings::RuntimeWarning; use crate::{Dist, LamellarTeamRT}; @@ -29,24 +28,6 @@ impl PinnedDrop for GenericAtomicArrayHandle { } } -impl GenericAtomicArrayHandle { - pub(crate) fn block(mut self) -> GenericAtomicArray { - self.launched = true; - RuntimeWarning::BlockingCall( - "GenericAtomicArrayHandle::block", - ".spawn() or.await", - ) - .print(); - self.team.clone().block_on(self) - } - - #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. 
Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] - pub(crate) fn spawn(mut self) -> LamellarTask> { - self.launched = true; - self.team.clone().spawn(self) - } -} - impl Future for GenericAtomicArrayHandle { type Output = GenericAtomicArray; fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { diff --git a/src/array/generic_atomic/rdma.rs b/src/array/generic_atomic/rdma.rs index 1243e754..d1307840 100644 --- a/src/array/generic_atomic/rdma.rs +++ b/src/array/generic_atomic/rdma.rs @@ -45,7 +45,7 @@ impl LamellarArrayGet for GenericAtomicArray { index: usize, buf: U, ) -> ArrayRdmaHandle { - match buf.team_try_into(&self.array.team_rt()) { + match buf.team_try_into(&self.array.team()) { Ok(buf) => self.internal_get(index, buf), Err(_) => ArrayRdmaHandle { array: self.as_lamellar_byte_array(), @@ -84,7 +84,7 @@ impl LamellarArrayPut for GenericAtomicArray { index: usize, buf: U, ) -> ArrayRdmaHandle { - match buf.team_try_into(&self.array.team_rt()) { + match buf.team_try_into(&self.array.team()) { Ok(buf) => self.internal_put(index, buf), Err(_) => ArrayRdmaHandle { array: self.as_lamellar_byte_array(), @@ -261,7 +261,7 @@ impl LamellarAm for InitPutAm { } } Distribution::Cyclic => { - let num_pes = ArrayExecAm::team(&self.array).num_pes(); + let num_pes = ArrayExecAm::team_rt(&self.array).num_pes(); let mut pe_u8_vecs: HashMap> = HashMap::new(); let mut pe_t_slices: HashMap = HashMap::new(); let buf_slice = self.buf.as_slice().expect("array data should exist on PE"); diff --git a/src/array/global_lock_atomic.rs b/src/array/global_lock_atomic.rs index 14b146d0..ad18e847 100644 --- a/src/array/global_lock_atomic.rs +++ b/src/array/global_lock_atomic.rs @@ -7,7 +7,6 @@ mod iteration; pub(crate) mod operations; mod rdma; use crate::array::private::ArrayExecAm; -use crate::array::private::LamellarArrayPrivate; use crate::array::r#unsafe::{UnsafeByteArray, UnsafeByteArrayWeak}; use crate::array::*; use crate::barrier::BarrierHandle; @@ -723,18 +722,8 @@ impl GlobalLockArray { } } -// impl TeamFrom<(Vec, Distribution)> for GlobalLockArray { -// fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { -// let (vals, distribution) = input; -// let input = (&vals, distribution); -// let array: UnsafeArray = TeamInto::team_into(input, team); -// array.into() -// } -// } - -// #[async_trait] impl AsyncTeamFrom<(Vec, Distribution)> for GlobalLockArray { - async fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { + async fn team_from(input: (Vec, Distribution), team: &Arc) -> Self { let array: UnsafeArray = AsyncTeamInto::team_into(input, team).await; array.async_into().await } @@ -754,33 +743,6 @@ impl AsyncFrom> for GlobalLockArray { } } -// impl From> for GlobalLockArray { -// fn from(array: LocalOnlyArray) -> Self { -// // println!("GlobalLock from localonly"); -// unsafe { array.into_inner().into() } -// } -// } - -// impl From> for GlobalLockArray { -// fn from(array: AtomicArray) -> Self { -// // println!("GlobalLock from atomic"); -// unsafe { array.into_inner().into() } -// } -// } - -// impl From> for GlobalLockArray { -// fn from(array: ReadOnlyArray) -> Self { -// // println!("GlobalLock from readonly"); -// unsafe { array.into_inner().into() } -// } -// } - -// impl From> for GlobalLockArray { -// fn from(array: LocalLockArray) -> Self { -// // println!("GlobalLock from LocalLockArray"); -// unsafe { array.into_inner().into() } -// } -// } impl From> for GlobalLockByteArray { fn from(array: GlobalLockArray) -> Self { 
@@ -819,7 +781,7 @@ impl From for GlobalLockArray { } impl private::ArrayExecAm for GlobalLockArray { - fn team(&self) -> Pin> { + fn team_rt(&self) -> Pin> { self.array.team_rt() } fn team_counters(&self) -> Arc { @@ -906,9 +868,9 @@ impl ActiveMessaging for GlobalLockArray { } impl LamellarArray for GlobalLockArray { - fn team_rt(&self) -> Pin> { - self.array.team_rt() - } + // fn team(&self) -> Arc { + // self.array.team() + // } // fn my_pe(&self) -> usize { // LamellarArray::my_pe(&self.array) // } diff --git a/src/array/global_lock_atomic/rdma.rs b/src/array/global_lock_atomic/rdma.rs index dddbefcc..66e06b7e 100644 --- a/src/array/global_lock_atomic/rdma.rs +++ b/src/array/global_lock_atomic/rdma.rs @@ -55,7 +55,7 @@ impl LamellarArrayGet for GlobalLockArray { index: usize, buf: U, ) -> ArrayRdmaHandle { - match buf.team_try_into(&self.array.team_rt()) { + match buf.team_try_into(&self.array.team()) { Ok(buf) => self.internal_get(index, buf), Err(_) => ArrayRdmaHandle { array: self.as_lamellar_byte_array(), @@ -94,7 +94,7 @@ impl LamellarArrayPut for GlobalLockArray { index: usize, buf: U, ) -> ArrayRdmaHandle { - match buf.team_try_into(&self.array.team_rt()) { + match buf.team_try_into(&self.array.team()) { Ok(buf) => self.internal_put(index, buf), Err(_) => ArrayRdmaHandle { array: self.as_lamellar_byte_array(), @@ -257,7 +257,7 @@ impl LamellarAm for InitPutAm { } Distribution::Cyclic => { //TODO think about optimized put similar to Unsafe - let num_pes = ArrayExecAm::team(&self.array).num_pes(); + let num_pes = ArrayExecAm::team_rt(&self.array).num_pes(); let mut pe_u8_vecs: HashMap> = HashMap::new(); let mut pe_t_slices: HashMap = HashMap::new(); let buf_slice = self.buf.as_slice().expect("array data should be on PE"); diff --git a/src/array/iterator/distributed_iterator.rs b/src/array/iterator/distributed_iterator.rs index b14526db..74003464 100644 --- a/src/array/iterator/distributed_iterator.rs +++ b/src/array/iterator/distributed_iterator.rs @@ -40,12 +40,11 @@ use crate::active_messaging::SyncSend; use crate::array::iterator::{private::*, Schedule}; use crate::array::{operations::ArrayOps, AsyncTeamFrom, Distribution, InnerArray, LamellarArray}; use crate::memregion::Dist; -use crate::LamellarTeamRT; +use crate::LamellarTeam; use futures_util::Future; use paste::paste; use std::marker::PhantomData; -use std::pin::Pin; use std::sync::Arc; use super::IterLockFuture; @@ -140,7 +139,7 @@ pub trait DistIteratorLauncher: InnerArray { } //#[doc(hidden)] - fn team(&self) -> Pin> { + fn team(&self) -> Arc { self.as_inner().team() } } diff --git a/src/array/iterator/distributed_iterator/consumer/collect.rs b/src/array/iterator/distributed_iterator/consumer/collect.rs index 12afce3f..3e97f68d 100644 --- a/src/array/iterator/distributed_iterator/consumer/collect.rs +++ b/src/array/iterator/distributed_iterator/consumer/collect.rs @@ -12,6 +12,7 @@ use crate::lamellar_team::LamellarTeamRT; use crate::memregion::Dist; use crate::scheduler::LamellarTask; use crate::warnings::RuntimeWarning; +use crate::lamellar_env::LamellarEnv; use core::marker::PhantomData; use futures_util::{ready, Future}; @@ -189,7 +190,7 @@ impl, Distribution)> + SyncSend + ' team: Pin>, ) -> A { let input = (local_vals, dist); - let array: A = AsyncTeamInto::team_into(input, &team).await; + let array: A = AsyncTeamInto::team_into(input, &team.team()).await; array } diff --git a/src/array/iterator/local_iterator.rs b/src/array/iterator/local_iterator.rs index 1cc6194b..3aec1e1c 100644 --- 
a/src/array/iterator/local_iterator.rs +++ b/src/array/iterator/local_iterator.rs @@ -37,7 +37,7 @@ pub(crate) use consumer::*; use crate::array::iterator::{private::*, Schedule}; use crate::array::{operations::ArrayOps, AsyncTeamFrom, Distribution, InnerArray, LamellarArray}; use crate::memregion::Dist; -use crate::LamellarTeamRT; +use crate::LamellarTeam; use crate::active_messaging::SyncSend; @@ -45,7 +45,6 @@ use enum_dispatch::enum_dispatch; use futures_util::Future; use paste::paste; use std::marker::PhantomData; -use std::pin::Pin; use std::sync::Arc; use super::IterLockFuture; @@ -139,7 +138,7 @@ pub trait LocalIteratorLauncher: InnerArray { } //#[doc(hidden)] - fn team(&self) -> Pin> { + fn team(&self) -> Arc { self.as_inner().team() } } diff --git a/src/array/iterator/local_iterator/consumer/collect.rs b/src/array/iterator/local_iterator/consumer/collect.rs index 84eb41d1..f060533e 100644 --- a/src/array/iterator/local_iterator/consumer/collect.rs +++ b/src/array/iterator/local_iterator/consumer/collect.rs @@ -11,6 +11,7 @@ use crate::lamellar_team::LamellarTeamRT; use crate::memregion::Dist; use crate::scheduler::LamellarTask; use crate::warnings::RuntimeWarning; +use crate::lamellar_env::LamellarEnv; use core::marker::PhantomData; use futures_util::{ready, Future}; @@ -190,7 +191,7 @@ impl, Distribution)> + SyncSend + ' team: Pin>, ) -> A { let input = (local_vals, dist); - AsyncTeamInto::team_into(input, &team).await + AsyncTeamInto::team_into(input, &team.team()).await } // fn create_array(&self, local_vals: Vec) -> A { // let input = (local_vals, self.distribution); diff --git a/src/array/iterator/one_sided_iterator/chunks.rs b/src/array/iterator/one_sided_iterator/chunks.rs index 40f53660..4ae204f6 100644 --- a/src/array/iterator/one_sided_iterator/chunks.rs +++ b/src/array/iterator/one_sided_iterator/chunks.rs @@ -1,14 +1,12 @@ use crate::array::iterator::one_sided_iterator::{private::*, *}; - use crate::array::ArrayRdmaHandle; use crate::lamellar_request::LamellarRequest; -// use crate::array::LamellarArrayRequest; -// use crate::LamellarArray; use crate::memregion::OneSidedMemoryRegion; +use crate::lamellar_env::LamellarEnv; + use pin_project::pin_project; -// use async_trait::async_trait; -// use futures_util::Future; + #[pin_project] pub struct Chunks where @@ -50,7 +48,7 @@ where ) -> (OneSidedMemoryRegion, ArrayRdmaHandle) { // println!(" get chunk of len: {:?}", size); let mem_region: OneSidedMemoryRegion = - array.team_rt().alloc_one_sided_mem_region(size); + array.team().team.alloc_one_sided_mem_region(size); // potentially unsafe depending on the array type (i.e. 
UnsafeArray - which requries unsafe to construct an iterator), // but safe with respect to the mem_region as this is the only reference let mut req = unsafe { array.internal_get(index, &mem_region) }; diff --git a/src/array/local_lock_atomic.rs b/src/array/local_lock_atomic.rs index 21a8f7c9..56fedf5a 100644 --- a/src/array/local_lock_atomic.rs +++ b/src/array/local_lock_atomic.rs @@ -10,7 +10,6 @@ use handle::{ pub(crate) mod operations; mod rdma; use crate::array::private::ArrayExecAm; -use crate::array::private::LamellarArrayPrivate; use crate::array::r#unsafe::{UnsafeByteArray, UnsafeByteArrayWeak}; use crate::array::AsyncFrom; use crate::array::*; @@ -691,18 +690,8 @@ impl LocalLockArray { } } -// impl TeamFrom<(Vec, Distribution)> for LocalLockArray { -// fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { -// let (vals, distribution) = input; -// let input = (&vals, distribution); -// let array: UnsafeArray = TeamInto::team_into(input, team); -// array.into() -// } -// } - -// #[async_trait] impl AsyncTeamFrom<(Vec, Distribution)> for LocalLockArray { - async fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { + async fn team_from(input: (Vec, Distribution), team: &Arc) -> Self { let array: UnsafeArray = AsyncTeamInto::team_into(input, team).await; array.async_into().await } @@ -722,34 +711,6 @@ impl AsyncFrom> for LocalLockArray { } } -// impl From> for LocalLockArray { -// fn from(array: LocalOnlyArray) -> Self { -// // println!("locallock from localonly"); -// unsafe { array.into_inner().into() } -// } -// } - -// impl From> for LocalLockArray { -// fn from(array: AtomicArray) -> Self { -// // println!("locallock from atomic"); -// unsafe { array.into_inner().into() } -// } -// } - -// impl From> for LocalLockArray { -// fn from(array: ReadOnlyArray) -> Self { -// // println!("locallock from readonly"); -// unsafe { array.into_inner().into() } -// } -// } - -// impl From> for LocalLockArray { -// fn from(array: GlobalLockArray) -> Self { -// // println!("LocalLockArray from GlobalLockArray"); -// unsafe { array.into_inner().into() } -// } -// } - impl From> for LocalLockByteArray { fn from(array: LocalLockArray) -> Self { LocalLockByteArray { @@ -787,7 +748,7 @@ impl From for LocalLockArray { } impl private::ArrayExecAm for LocalLockArray { - fn team(&self) -> Pin> { + fn team_rt(&self) -> Pin> { self.array.team_rt() } fn team_counters(&self) -> Arc { @@ -874,9 +835,9 @@ impl ActiveMessaging for LocalLockArray { } impl LamellarArray for LocalLockArray { - fn team_rt(&self) -> Pin> { - self.array.team_rt() - } + // fn team_rt(&self) -> Pin> { + // self.array.team_rt() + // } // fn my_pe(&self) -> usize { // LamellarArray::my_pe(&self.array) // } diff --git a/src/array/local_lock_atomic/rdma.rs b/src/array/local_lock_atomic/rdma.rs index 3b3cf9d1..04ed74a4 100644 --- a/src/array/local_lock_atomic/rdma.rs +++ b/src/array/local_lock_atomic/rdma.rs @@ -48,7 +48,7 @@ impl LamellarArrayGet for LocalLockArray { index: usize, buf: U, ) -> ArrayRdmaHandle { - match buf.team_try_into(&self.array.team_rt()) { + match buf.team_try_into(&self.array.team()) { Ok(buf) => self.internal_get(index, buf), Err(_) => ArrayRdmaHandle { array: self.as_lamellar_byte_array(), @@ -87,7 +87,7 @@ impl LamellarArrayPut for LocalLockArray { index: usize, buf: U, ) -> ArrayRdmaHandle { - match buf.team_try_into(&self.array.team_rt()) { + match buf.team_try_into(&self.array.team()) { Ok(buf) => self.internal_put(index, buf), Err(_) => ArrayRdmaHandle { array: 
self.as_lamellar_byte_array(), @@ -243,7 +243,7 @@ impl LamellarAm for InitPutAm { } } Distribution::Cyclic => { - let num_pes = ArrayExecAm::team(&self.array).num_pes(); + let num_pes = ArrayExecAm::team_rt(&self.array).num_pes(); let mut pe_u8_vecs: HashMap> = HashMap::new(); let mut pe_t_slices: HashMap = HashMap::new(); let buf_slice = self.buf.as_slice().expect("array data should be on PE"); diff --git a/src/array/local_only.rs b/src/array/local_only.rs index 258d7856..001f577d 100644 --- a/src/array/local_only.rs +++ b/src/array/local_only.rs @@ -135,7 +135,7 @@ impl From> for LocalOnlyArray { } impl private::ArrayExecAm for LocalOnlyArray { - fn team(&self) -> Pin> { + fn team_rt(&self) -> Pin> { self.array.team().clone() } fn team_counters(&self) -> Arc { diff --git a/src/array/native_atomic.rs b/src/array/native_atomic.rs index 7e035adf..2080e8ea 100644 --- a/src/array/native_atomic.rs +++ b/src/array/native_atomic.rs @@ -1018,35 +1018,14 @@ impl NativeAtomicArray { } } -// impl TeamFrom<(Vec, Distribution)> for NativeAtomicArray { -// fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { -// let (vals, distribution) = input; -// let input = (&vals, distribution); -// let array: UnsafeArray = TeamInto::team_into(input, team); -// array.into() -// } -// } - -// #[async_trait] + impl AsyncTeamFrom<(Vec, Distribution)> for NativeAtomicArray { - async fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { + async fn team_from(input: (Vec, Distribution), team: &Arc) -> Self { let array: UnsafeArray = AsyncTeamInto::team_into(input, team).await; array.async_into().await } } -//#[doc(hidden)] -// impl From> for NativeAtomicArray { -// fn from(array: UnsafeArray) -> Self { -// // println!("native from unsafe"); -// array.block_on_outstanding(DarcMode::NativeAtomicArray); - -// NativeAtomicArray { -// array: array, -// orig_t: NativeAtomicType::of::(), -// } -// } -// } //#[doc(hidden)] #[async_trait] @@ -1128,7 +1107,7 @@ impl From for AtomicArray { // //#[doc(hidden)] impl private::ArrayExecAm for NativeAtomicArray { - fn team(&self) -> Pin> { + fn team_rt(&self) -> Pin> { self.array.team_rt() } fn team_counters(&self) -> Arc { @@ -1217,9 +1196,9 @@ impl ActiveMessaging for NativeAtomicArray { //#[doc(hidden)] impl LamellarArray for NativeAtomicArray { - fn team_rt(&self) -> Pin> { - self.array.team_rt() - } + // fn team_rt(&self) -> Pin> { + // self.array.team_rt() + // } // fn my_pe(&self) -> usize { // LamellarArray::my_pe(&self.array) // } diff --git a/src/array/native_atomic/handle.rs b/src/array/native_atomic/handle.rs index b049348e..90b45225 100644 --- a/src/array/native_atomic/handle.rs +++ b/src/array/native_atomic/handle.rs @@ -3,7 +3,6 @@ use std::sync::Arc; use std::task::{Context, Poll}; use super::{ArrayOps, NativeAtomicArray, NativeAtomicType}; -use crate::scheduler::LamellarTask; use crate::warnings::RuntimeWarning; use crate::{Dist, LamellarTeamRT, UnsafeArray}; @@ -29,24 +28,6 @@ impl PinnedDrop for NativeAtomicArrayHandle { } } -impl NativeAtomicArrayHandle { - pub(crate) fn block(mut self) -> NativeAtomicArray { - self.launched = true; - RuntimeWarning::BlockingCall( - "NativeAtomicArrayHandle::block", - ".spawn() or.await", - ) - .print(); - self.team.clone().block_on(self) - } - - - #[must_use = "this function returns a future [LamellarTask] used to poll for completion. Call '.await' on the returned future in an async context or '.block()' in a non async context. 
Alternatively it may be acceptable to call '.block()' instead of 'spawn()' on this handle"] - pub(crate) fn spawn(mut self) -> LamellarTask> { - self.launched = true; - self.team.clone().spawn(self) - } -} impl Future for NativeAtomicArrayHandle { type Output = NativeAtomicArray; diff --git a/src/array/native_atomic/rdma.rs b/src/array/native_atomic/rdma.rs index 84553794..b5887d51 100644 --- a/src/array/native_atomic/rdma.rs +++ b/src/array/native_atomic/rdma.rs @@ -44,7 +44,7 @@ impl LamellarArrayGet for NativeAtomicArray { index: usize, buf: U, ) -> ArrayRdmaHandle { - match buf.team_try_into(&self.array.team_rt()) { + match buf.team_try_into(&self.array.team()) { Ok(buf) => self.internal_get(index, buf), Err(_) => ArrayRdmaHandle { array: self.as_lamellar_byte_array(), @@ -83,7 +83,7 @@ impl LamellarArrayPut for NativeAtomicArray { index: usize, buf: U, ) -> ArrayRdmaHandle { - match buf.team_try_into(&self.array.team_rt()) { + match buf.team_try_into(&self.array.team()) { Ok(buf) => self.internal_put(index, buf), Err(_) => ArrayRdmaHandle { array: self.as_lamellar_byte_array(), @@ -242,7 +242,7 @@ impl LamellarAm for InitPutAm { } } Distribution::Cyclic => { - let num_pes = ArrayExecAm::team(&self.array).num_pes(); + let num_pes = ArrayExecAm::team_rt(&self.array).num_pes(); let mut pe_u8_vecs: HashMap> = HashMap::new(); let mut pe_t_slices: HashMap = HashMap::new(); let buf_slice = self.buf.as_slice().expect("array data should be on PE"); diff --git a/src/array/operations/access.rs b/src/array/operations/access.rs index a143169e..68bf7ccd 100644 --- a/src/array/operations/access.rs +++ b/src/array/operations/access.rs @@ -7,7 +7,8 @@ use super::handle::{ #[doc(alias("One-sided", "onesided"))] /// The interface for remotely writing elements /// -/// These operations can be performed using any [LamellarWriteArray] type +/// These operations can be performed using any 'safe' [LamellarWriteArray] type +/// For UnsafeArrays please see [UnsafeAccessOps] /// /// Both single element operations and batched element operations are provided /// @@ -211,9 +212,7 @@ pub trait AccessOps: private::LamellarArrayPrivate { } #[doc(alias("One-sided", "onesided"))] -/// The interface for remotely writing elements -/// -/// These operations can be performed using any [LamellarWriteArray] type +/// The interface for remotely writing elements on [UnsafeArray]s /// /// Both single element operations and batched element operations are provided /// @@ -232,7 +231,7 @@ pub trait AccessOps: private::LamellarArrayPrivate { /// For Ops that return results, the result will only be available on the calling PE. /// /// # Note -/// For both single index and batched operations there are no guarantees to the order in which individual operations occur (an individal operation is guaranteed to be atomic though). +/// For both single index and batched operations there are no guarantees to the order in which individual operations occur. 
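A minimal usage sketch of the safe write interface the reworded docs above describe, assuming the `store`/`batch_store` entry points of the safe AccessOps trait and the handle-driving patterns (`.block()` on constructors, `block_on` on operation handles) used elsewhere in these doc examples; the UnsafeAccessOps variant referenced above is assumed to expose the same shape of API for UnsafeArray.
```
use lamellar::array::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();
    // AtomicArray implements the safe op traits; UnsafeArray goes through the Unsafe* variants
    let array = AtomicArray::<usize>::new(&world, 100, Distribution::Block).block();

    // single-element store: each call is treated as an individual request
    array.block_on(array.store(17, 3));

    // batched store: the whole index list is aggregated into one runtime request
    let indices = vec![3, 54, 12, 88, 29, 68];
    array.block_on(array.batch_store(indices, 3));
}
```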
/// /// # Batched Types /// Three types of batched operations can be performed diff --git a/src/array/operations/arithmetic.rs b/src/array/operations/arithmetic.rs index 608b6a47..a9123716 100644 --- a/src/array/operations/arithmetic.rs +++ b/src/array/operations/arithmetic.rs @@ -35,7 +35,8 @@ pub trait ElementArithmeticOps: #[doc(alias("One-sided", "onesided"))] /// The interface for performing remote arithmetic operations on array elements /// -/// These operations can be performed using any [LamellarWriteArray] type +/// These operations can be performed using any safe [LamellarWriteArray] type +/// for UnsafeArrays please see [UnsafeArithmeticOps] instead. /// /// Both single element operations and batched element operations are provided /// @@ -844,9 +845,7 @@ pub trait ArithmeticOps: private::LamellarArrayP } #[doc(alias("One-sided", "onesided"))] -/// The interface for performing remote arithmetic operations on array elements -/// -/// These operations can be performed using any [LamellarWriteArray] type +/// The interface for performing remote arithmetic operations on [UnsafeArray] elements /// /// Both single element operations and batched element operations are provided /// @@ -865,7 +864,7 @@ pub trait ArithmeticOps: private::LamellarArrayP /// For Ops that return results, the result will only be available on the calling PE. /// /// # Note -/// For both single index and batched operations there are no guarantees to the order in which individual operations occur (an individal operation is guaranteed to be atomic though). +/// For both single index and batched operations there are no guarantees to the order in which individual operations occur. /// /// # Batched Types /// Three types of batched operations can be performed diff --git a/src/array/operations/bitwise.rs b/src/array/operations/bitwise.rs index 96b09df7..123883c6 100644 --- a/src/array/operations/bitwise.rs +++ b/src/array/operations/bitwise.rs @@ -22,7 +22,8 @@ pub trait ElementBitWiseOps: #[doc(alias("One-sided", "onesided"))] /// The interface for performing remote bitwise operations on array elements /// -/// These operations can be performed using any [LamellarWriteArray] type +/// These operations can be performed using any safe [LamellarWriteArray] type +/// for UnsafeArrays please see [UnsafeBitWiseOps] /// /// Both single element operations and batched element operations are provided /// @@ -533,9 +534,7 @@ pub trait BitWiseOps: private::LamellarArrayPrivate { } #[doc(alias("One-sided", "onesided"))] -/// The interface for performing remote bitwise operations on array elements -/// -/// These operations can be performed using any [LamellarWriteArray] type +/// The interface for performing remote bitwise operations on [UnsafeArray] elements /// /// Both single element operations and batched element operations are provided /// @@ -554,7 +553,7 @@ pub trait BitWiseOps: private::LamellarArrayPrivate { /// For Ops that return results, the result will only be available on the calling PE. /// /// # Note -/// For both single index and batched operations there are no guarantees to the order in which individual operations occur (an individal operation is guaranteed to be atomic though). +/// For both single index and batched operations there are no guarantees to the order in which individual operations occur. 
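A minimal sketch of the single-request vs. batched-request distinction the arithmetic and bitwise docs above draw, assuming the `add`/`batch_fetch_add` methods of ArithmeticOps and `fetch_bit_or` of BitWiseOps; batched results are assumed to come back as a `Vec` in input-index order, as the text states.
```
use lamellar::array::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();
    let array = AtomicArray::<usize>::new(&world, 100, Distribution::Block).block();

    // one request per call
    array.block_on(array.add(10, 2));

    // a single batched request; previous values are returned in the same order as `indices`
    let indices = vec![3, 54, 12, 88, 29, 68];
    let old_vals = array.block_on(array.batch_fetch_add(indices, 1));
    println!("previous values: {:?}", old_vals);

    // bitwise flavour of the same pattern
    let prev = array.block_on(array.fetch_bit_or(42, 0b1010));
    println!("previous bits: {:?}", prev);
}
```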
/// /// # Batched Types /// Three types of batched operations can be performed diff --git a/src/array/operations/compare_exchange.rs b/src/array/operations/compare_exchange.rs index 364b0c92..8eaa5e85 100644 --- a/src/array/operations/compare_exchange.rs +++ b/src/array/operations/compare_exchange.rs @@ -25,7 +25,8 @@ pub trait ElementComparePartialEqOps: #[doc(alias("One-sided", "onesided"))] /// The interface for performing remote compare and exchange operations on array elements /// -/// These operations can be performed using any [LamellarWriteArray] type +/// These operations can be performed using any safe [LamellarWriteArray] type +/// for UnsafeArrays please see [UnsafeCompareExchangeOps] instead. /// /// Both single element operations and batched element operations are provided /// @@ -181,7 +182,8 @@ pub trait CompareExchangeOps: private::LamellarArrayPriv /// /// Useful for element types that only impl [PartialEq][std::cmp::PartialEq] instead of [Eq][std::cmp::Eq] (e.g `f32`,`f64`). /// -/// These operations can be performed using any [LamellarWriteArray] type +/// These operations can be performed using any safe [LamellarWriteArray] type +/// for UnsafeArrays please see [UnsafeCompareExchangeOps] instead. /// /// Both single element operations and batched element operations are provided /// @@ -350,9 +352,7 @@ pub trait CompareExchangeEpsilonOps: } #[doc(alias("One-sided", "onesided"))] -/// The interface for performing remote compare and exchange operations on array elements -/// -/// These operations can be performed using any [LamellarWriteArray] type +/// The interface for performing remote compare and exchange operations on [UnsafeArray] elements /// /// Both single element operations and batched element operations are provided /// @@ -371,7 +371,7 @@ pub trait CompareExchangeEpsilonOps: /// For Ops that return results, the result will only be available on the calling PE. /// /// # Note -/// For both single index and batched operations there are no guarantees to the order in which individual operations occur (an individal operation is guaranteed to be atomic though) +/// For both single index and batched operations there are no guarantees to the order in which individual operations occur /// /// # Batched Types /// Three types of batched operations can be performed @@ -511,12 +511,10 @@ pub trait UnsafeCompareExchangeOps: } #[doc(alias("One-sided", "onesided"))] -/// The interface for performing remote compare and exchange operations within a given epsilon on array elements +/// The interface for performing remote compare and exchange operations within a given epsilon on [UnsafeArray] elements /// /// Useful for element types that only impl [PartialEq][std::cmp::PartialEq] instead of [Eq][std::cmp::Eq] (e.g `f32`,`f64`). /// -/// These operations can be performed using any [LamellarWriteArray] type -/// /// Both single element operations and batched element operations are provided /// /// Generally if you are performing a large number of operations it will be better to @@ -534,7 +532,7 @@ pub trait UnsafeCompareExchangeOps: /// For Ops that return results, the result will only be available on the calling PE. /// /// # Note -/// For both single index and batched operations there are no guarantees to the order in which individual operations occur (an individal operation is guaranteed to be atomic though). +/// For both single index and batched operations there are no guarantees to the order in which individual operations occur. 
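A minimal sketch of the two compare-and-exchange flavours distinguished above: `compare_exchange` from CompareExchangeOps for `Eq` element types, and `compare_exchange_epsilon` from CompareExchangeEpsilonOps for `PartialEq`-only types such as `f64`; the 1e-4 tolerance here is an arbitrary illustration, not a recommended value.
```
use lamellar::array::prelude::*;

fn main() {
    let world = LamellarWorldBuilder::new().build();
    let ints = AtomicArray::<usize>::new(&world, 100, Distribution::Block).block();
    let floats = AtomicArray::<f64>::new(&world, 100, Distribution::Block).block();

    // swap element 7 to 42 only if it currently holds 0; the returned Result
    // reports whether the exchange took place
    let res = ints.block_on(ints.compare_exchange(7, 0, 42));
    println!("exact compare_exchange: {:?}", res);

    // same idea for floats: "current" only has to match within +/- 1e-4
    let fres = floats.block_on(floats.compare_exchange_epsilon(7, 0.0, 4.2, 1e-4));
    println!("epsilon compare_exchange: {:?}", fres);
}
```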
/// /// # Batched Types /// Three types of batched operations can be performed diff --git a/src/array/operations/read_only.rs b/src/array/operations/read_only.rs index ccf516bd..56eda921 100644 --- a/src/array/operations/read_only.rs +++ b/src/array/operations/read_only.rs @@ -5,7 +5,8 @@ use super::handle::{ArrayFetchBatchOpHandle, ArrayFetchOpHandle}; #[doc(alias("One-sided", "onesided"))] /// The interface for remotely reading elements /// -/// These operations can be performed using any LamellarArray type. +/// These operations can be performed using any safe LamellarArray type. +/// For UnsafeArrays please see [UnsafeReadOnlyOps] /// /// Both single element operations and batched element operations are provided /// @@ -168,7 +169,7 @@ pub trait ReadOnlyOps: private::LamellarArrayPrivate { /// } ///``` pub trait UnsafeReadOnlyOps: private::LamellarArrayPrivate { - /// This call returns the value of the element at the specified index + /// This call returns the value of the element at the specified index for an [UnsafeArray] /// /// A future is returned as the result of this call, which is used to retrieve /// the result after the (possibly remote) operation as finished. diff --git a/src/array/operations/shift.rs b/src/array/operations/shift.rs index 0b3b392b..44945950 100644 --- a/src/array/operations/shift.rs +++ b/src/array/operations/shift.rs @@ -16,8 +16,9 @@ pub trait ElementShiftOps: std::ops::ShlAssign + std::ops::ShrAssign + Dist + Si #[doc(alias("One-sided", "onesided"))] /// The interface for performing remote Shift operations on array elements /// -/// These operations can be performed using any [LamellarWriteArray] type -/// +/// These operations can be performed using any safe [LamellarWriteArray] type +/// for UnsafeArrays please see [UnsafeShiftOps] instead. +/// /// Both single element operations and batched element operations are provided /// /// Generally if you are performing a large number of operations it will be better to @@ -350,6 +351,42 @@ pub trait ShiftOps: private::LamellarArrayPrivate { } } + +#[doc(alias("One-sided", "onesided"))] +/// The interface for performing remote Shift operations on [UnsafeArray] elements +/// +/// Both single element operations and batched element operations are provided +/// +/// Generally if you are performing a large number of operations it will be better to +/// use a batched version instead of multiple single element opertations. While the +/// Runtime internally performs message aggregation for both single element and batched +/// operations, single element operates have to be treated as individual requests, resulting +/// in allocation and bookkeeping overheads. A single batched call on the other hand is treated +/// as a single request by the runtime. (See [ReadOnlyOps] for an example comparing single vs batched load operations of a list of indices) +/// +/// The results of a batched operation are returned to the user in the same order as the input indices. +/// +/// # One-sided Operation +/// performing either single or batched operations are both one-sided, with the calling PE performing any necessary work to +/// initate and execute active messages that are sent to remote PEs. +/// For Ops that return results, the result will only be available on the calling PE. 
+/// +/// # Note +/// For both single index and batched operations there are no guarantees to the order in which individual operations occur +/// +/// # Batched Types +/// One type of batched operation can be performed +/// ## One Value - Many Indicies +/// In this type, the same value will be applied to the provided indices +///``` +/// use lamellar::array::prelude::*; +/// +/// let world = LamellarWorldBuilder::new().build(); +/// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); +/// +/// let indices = vec![3,54,12,88,29,68]; +/// array.block_on(array.batch_fetch_shl(indices,2)); +///``` pub trait UnsafeShiftOps: private::LamellarArrayPrivate { /// This call performs an in place left shift of `val` bits on the element specified by `index`. /// diff --git a/src/array/prelude.rs b/src/array/prelude.rs index 0823236a..0d401297 100644 --- a/src/array/prelude.rs +++ b/src/array/prelude.rs @@ -43,7 +43,6 @@ pub use crate::active_messaging::ActiveMessaging; pub use crate::lamellar_arch::*; pub use crate::lamellar_team::LamellarTeam; //#[doc(hidden)] -pub use crate::lamellar_team::LamellarTeamRT; pub use crate::lamellar_world::LamellarWorld; pub use crate::lamellar_world::LamellarWorldBuilder; pub use crate::memregion::{Dist, RegisteredMemoryRegion}; diff --git a/src/array/read_only.rs b/src/array/read_only.rs index b0969e2f..b6b1e90e 100644 --- a/src/array/read_only.rs +++ b/src/array/read_only.rs @@ -6,7 +6,6 @@ pub(crate) mod local_chunks; pub use local_chunks::ReadOnlyLocalChunks; mod rdma; use crate::array::private::ArrayExecAm; -use crate::array::private::LamellarArrayPrivate; use crate::array::*; use crate::barrier::BarrierHandle; use crate::darc::DarcMode; @@ -346,38 +345,15 @@ impl ReadOnlyArray { } } -// impl TeamFrom<(Vec, Distribution)> for ReadOnlyArray { -// fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { -// let (vals, distribution) = input; -// let input = (&vals, distribution); -// let array: UnsafeArray = TeamInto::team_into(input, team); -// array.into() -// } -// } // #[async_trait] impl AsyncTeamFrom<(Vec, Distribution)> for ReadOnlyArray { - async fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { + async fn team_from(input: (Vec, Distribution), team: &Arc) -> Self { let array: UnsafeArray = AsyncTeamInto::team_into(input, team).await; array.async_into().await } } -// impl TeamFrom<(&Vec, Distribution)> for ReadOnlyArray { -// fn team_from(input: (&Vec, Distribution), team: &Pin>) -> Self { -// let array: UnsafeArray = TeamInto::team_into(input, team); -// array.into() -// } -// } - -// impl From> for ReadOnlyArray { -// fn from(array: UnsafeArray) -> Self { -// // println!("readonly from UnsafeArray"); -// array.block_on_outstanding(DarcMode::ReadOnlyArray); - -// ReadOnlyArray { array: array } -// } -// } #[async_trait] impl AsyncFrom> for ReadOnlyArray { @@ -389,34 +365,6 @@ impl AsyncFrom> for ReadOnlyArray { } } -// impl From> for ReadOnlyArray { -// fn from(array: LocalOnlyArray) -> Self { -// // println!("readonly from LocalOnlyArray"); -// unsafe { array.into_inner().into() } -// } -// } - -// impl From> for ReadOnlyArray { -// fn from(array: AtomicArray) -> Self { -// // println!("readonly from AtomicArray"); -// unsafe { array.into_inner().into() } -// } -// } - -// impl From> for ReadOnlyArray { -// fn from(array: LocalLockArray) -> Self { -// // println!("readonly from LocalLockArray"); -// unsafe { array.into_inner().into() } -// } -// } - -// impl From> for ReadOnlyArray { -// fn from(array: GlobalLockArray) 
-> Self { -// // println!("readonly from GlobalLockArray"); -// unsafe { array.into_inner().into() } -// } -// } - impl From> for ReadOnlyByteArray { fn from(array: ReadOnlyArray) -> Self { ReadOnlyByteArray { @@ -603,7 +551,7 @@ impl ReadOnlyArray { } impl private::ArrayExecAm for ReadOnlyArray { - fn team(&self) -> Pin> { + fn team_rt(&self) -> Pin> { self.array.team_rt() } fn team_counters(&self) -> Arc { @@ -690,9 +638,9 @@ impl ActiveMessaging for ReadOnlyArray { } impl LamellarArray for ReadOnlyArray { - fn team_rt(&self) -> Pin> { - self.array.team_rt() - } + // fn team_rt(&self) -> Pin> { + // self.array.team_rt() + // } // fn my_pe(&self) -> usize { // LamellarArray::my_pe(&self.array) // } diff --git a/src/array/unsafe.rs b/src/array/unsafe.rs index cd0cb4b3..8705c3f0 100644 --- a/src/array/unsafe.rs +++ b/src/array/unsafe.rs @@ -489,16 +489,6 @@ impl UnsafeArray { // println!("done in wait all {:?}",std::time::SystemTime::now()); } - pub(crate) fn block_on_outstanding(&self, mode: DarcMode) { - self.wait_all(); - // println!("block on outstanding"); - // self.inner.data.print(); - // let the_array: UnsafeArray = self.clone(); - let array_darc = self.inner.data.clone(); - self.team_rt() - .block_on(array_darc.block_on_outstanding(mode, 1)); //one for this instance of the array - } - pub(crate) async fn await_on_outstanding(&self, mode: DarcMode) { self.await_all().await; // println!("block on outstanding"); @@ -660,10 +650,6 @@ impl UnsafeArray { } } - pub(crate) fn tasking_barrier(&self) { - self.inner.data.team.tasking_barrier(); - } - pub(crate) fn async_barrier(&self) -> BarrierHandle { self.inner.data.team.async_barrier() } @@ -730,7 +716,7 @@ impl UnsafeArray { // } impl TeamFrom<(Vec, Distribution)> for UnsafeArray { - fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { + fn team_from(input: (Vec, Distribution), team: &Arc) -> Self { let (vals, distribution) = input; let input = (&vals, distribution); TeamInto::team_into(input, team) @@ -739,8 +725,9 @@ impl TeamFrom<(Vec, Distribution)> for UnsafeArray { // #[async_trait] impl AsyncTeamFrom<(Vec, Distribution)> for UnsafeArray { - async fn team_from(input: (Vec, Distribution), team: &Pin>) -> Self { + async fn team_from(input: (Vec, Distribution), team: &Arc) -> Self { let (local_vals, distribution) = input; + let team = team.team.clone(); // println!("local_vals len: {:?}", local_vals.len()); team.async_barrier().await; let local_sizes = UnsafeArray::::async_new( @@ -788,8 +775,9 @@ impl AsyncTeamFrom<(Vec, Distribution)> for UnsafeArray TeamFrom<(&Vec, Distribution)> for UnsafeArray { - fn team_from(input: (&Vec, Distribution), team: &Pin>) -> Self { + fn team_from(input: (&Vec, Distribution), team: &Arc) -> Self { let (local_vals, distribution) = input; + let team = team.team.clone(); // println!("local_vals len: {:?}", local_vals.len()); // team.tasking_barrier(); let local_sizes = @@ -945,7 +933,7 @@ impl From for UnsafeArray { } impl ArrayExecAm for UnsafeArray { - fn team(&self) -> Pin> { + fn team_rt(&self) -> Pin> { self.team_rt() } fn team_counters(&self) -> Arc { @@ -1141,9 +1129,9 @@ impl ActiveMessaging for UnsafeArray { } impl LamellarArray for UnsafeArray { - fn team_rt(&self) -> Pin> { - self.inner.data.team.clone() - } + // fn team_rt(&self) -> Pin> { + // self.inner.data.team.clone() + // } // fn my_pe(&self) -> usize { // self.inner.data.my_pe diff --git a/src/array/unsafe/iteration/consumer.rs b/src/array/unsafe/iteration/consumer.rs index 3fa4332f..237a01ae 100644 --- 
a/src/array/unsafe/iteration/consumer.rs +++ b/src/array/unsafe/iteration/consumer.rs @@ -36,7 +36,7 @@ impl UnsafeArrayInner { worker += 1; } } - cons.create_handle(self.data.team.clone(), reqs) + cons.create_handle(self.data.team().clone(), reqs) } pub(crate) fn sched_dynamic(&self, cons: C) -> C::Handle diff --git a/src/array/unsafe/iteration/distributed.rs b/src/array/unsafe/iteration/distributed.rs index d09ecd19..6ac4c458 100644 --- a/src/array/unsafe/iteration/distributed.rs +++ b/src/array/unsafe/iteration/distributed.rs @@ -4,13 +4,13 @@ use crate::array::iterator::private::Sealed; use crate::array::iterator::Schedule; use crate::array::r#unsafe::{UnsafeArray, UnsafeArrayInner}; use crate::array::{ArrayOps, AsyncTeamFrom, Distribution, InnerArray}; -use crate::lamellar_team::LamellarTeamRT; use crate::memregion::Dist; +use crate::LamellarTeam; +use crate::lamellar_env::LamellarEnv; use core::marker::PhantomData; use futures_util::Future; use paste::paste; -use std::pin::Pin; use std::sync::Arc; impl InnerArray for UnsafeArray { @@ -190,7 +190,7 @@ impl DistIteratorLauncher for UnsafeArrayInner { [iter.lock_if_needed(Sealed)] ); - fn team(&self) -> Pin> { - self.data.team.clone() + fn team(&self) -> Arc { + self.data.team.team() } } diff --git a/src/array/unsafe/iteration/local.rs b/src/array/unsafe/iteration/local.rs index fd722813..bccd3b83 100644 --- a/src/array/unsafe/iteration/local.rs +++ b/src/array/unsafe/iteration/local.rs @@ -3,15 +3,14 @@ use crate::array::iterator::local_iterator::*; use crate::array::iterator::private::*; use crate::array::r#unsafe::{UnsafeArray, UnsafeArrayInner}; use crate::array::{ArrayOps, AsyncTeamFrom, Distribution}; - +use crate::LamellarTeam; use crate::array::iterator::Schedule; -use crate::lamellar_team::LamellarTeamRT; use crate::memregion::Dist; +use crate::lamellar_env::LamellarEnv; use core::marker::PhantomData; use futures_util::Future; use paste::paste; -use std::pin::Pin; use std::sync::Arc; impl LocalIteratorLauncher for UnsafeArray {} @@ -170,7 +169,7 @@ impl LocalIteratorLauncher for UnsafeArrayInner { [iter.lock_if_needed(Sealed)] ); - fn team(&self) -> Pin> { - self.data.team.clone() + fn team(&self) -> Arc { + self.data.team.team() } } diff --git a/src/array/unsafe/rdma.rs b/src/array/unsafe/rdma.rs index 6254bf28..0eeb7cad 100644 --- a/src/array/unsafe/rdma.rs +++ b/src/array/unsafe/rdma.rs @@ -416,7 +416,7 @@ impl UnsafeArray { index: usize, buf: U, ) { - match buf.team_try_into(&self.inner.data.team) { + match buf.team_try_into(&self.inner.data.team.team()) { Ok(buf) => match self.inner.distribution { Distribution::Block => { self.block_op(ArrayRdmaCmd::Put, index, buf); @@ -493,7 +493,7 @@ impl UnsafeArray { index: usize, buf: U, ) { - match buf.team_try_into(&self.inner.data.team) { + match buf.team_try_into(&self.inner.data.team.team()) { Ok(buf) => match self.inner.distribution { Distribution::Block => { self.block_op(ArrayRdmaCmd::Get(false), index, buf); @@ -564,7 +564,7 @@ impl UnsafeArray { buf: U, ) { // println!("unsafe iget {:?}",index); - if let Ok(buf) = buf.team_try_into(&self.inner.data.team) { + if let Ok(buf) = buf.team_try_into(&self.inner.data.team.team()) { match self.inner.distribution { Distribution::Block => self.block_op(ArrayRdmaCmd::Get(true), index, buf), Distribution::Cyclic => self.cyclic_op(ArrayRdmaCmd::Get(true), index, buf), @@ -628,7 +628,7 @@ impl UnsafeArray { where U: TeamTryInto>, { - match buf.team_try_into(&self.team_rt()) { + match buf.team_try_into(&self.inner.data.team.team()) { 
Ok(buf) => self.internal_get(index, buf), Err(_) => ArrayRdmaHandle { array: self.as_lamellar_byte_array(), @@ -771,7 +771,7 @@ impl LamellarArrayPut for UnsafeArray { index: usize, buf: U, ) -> ArrayRdmaHandle { - match buf.team_try_into(&self.team_rt()) { + match buf.team_try_into(&self.inner.data.team.team()) { Ok(buf) => self.internal_put(index, buf), Err(_) => ArrayRdmaHandle { array: self.as_lamellar_byte_array(), diff --git a/src/darc.rs b/src/darc.rs index 072592c3..24746832 100644 --- a/src/darc.rs +++ b/src/darc.rs @@ -888,66 +888,6 @@ impl DarcInner { // self.debug_print(); } - pub(crate) fn wait_all(&self) { - // println!("wait_all called on pe: {}", self.world_pe); - - RuntimeWarning::BlockingCall("wait_all", "await_all().await").print(); - let am_counters = self.am_counters(); - - let mut temp_now = Instant::now(); - let mut orig_reqs = am_counters.send_req_cnt.load(Ordering::SeqCst); - let mut orig_launched = am_counters.launched_req_cnt.load(Ordering::SeqCst); - - // println!( - // "in team wait_all mype: {:?} cnt: {:?} {:?}", - // self.world_pe, - // self.am_counters.send_req_cnt.load(Ordering::SeqCst), - // self.am_counters.outstanding_reqs.load(Ordering::SeqCst), - // ); - while self.team().panic.load(Ordering::SeqCst) == 0 - && (am_counters.outstanding_reqs.load(Ordering::SeqCst) > 0 - || orig_reqs != am_counters.send_req_cnt.load(Ordering::SeqCst) - || orig_launched != am_counters.launched_req_cnt.load(Ordering::SeqCst)) - { - orig_reqs = am_counters.send_req_cnt.load(Ordering::SeqCst); - orig_launched = am_counters.launched_req_cnt.load(Ordering::SeqCst); - // std::thread::yield_now(); - // self.flush(); - if std::thread::current().id() != *crate::MAIN_THREAD { - self.team().scheduler.exec_task() - }; //mmight as well do useful work while we wait } - if temp_now.elapsed().as_secs_f64() > config().deadlock_timeout { - println!( - "in team wait_all mype: {:?} cnt: {:?} {:?}", - self.team().world_pe, - am_counters.send_req_cnt.load(Ordering::SeqCst), - am_counters.outstanding_reqs.load(Ordering::SeqCst), - ); - temp_now = Instant::now(); - } - } - if am_counters.send_req_cnt.load(Ordering::SeqCst) - != am_counters.launched_req_cnt.load(Ordering::SeqCst) - { - println!( - "in team wait_all mype: {:?} cnt: {:?} {:?} {:?}", - self.team().world_pe, - am_counters.send_req_cnt.load(Ordering::SeqCst), - am_counters.outstanding_reqs.load(Ordering::SeqCst), - am_counters.launched_req_cnt.load(Ordering::SeqCst) - ); - RuntimeWarning::UnspawnedTask( - "`wait_all` before all tasks/active messages have been spawned", - ) - .print(); - } - // println!( - // "in team wait_all mype: {:?} cnt: {:?} {:?}", - // self.world_pe, - // self.am_counters.send_req_cnt.load(Ordering::SeqCst), - // self.am_counters.outstanding_reqs.load(Ordering::SeqCst), - // ); - } pub(crate) async fn await_all(&self) { let mut temp_now = Instant::now(); let am_counters = self.am_counters(); diff --git a/src/darc/prelude.rs b/src/darc/prelude.rs index e9ca88fa..df92f853 100644 --- a/src/darc/prelude.rs +++ b/src/darc/prelude.rs @@ -10,7 +10,6 @@ pub use crate::active_messaging::ActiveMessaging; pub use crate::lamellar_arch::*; pub use crate::lamellar_team::LamellarTeam; //#[doc(hidden)] -pub use crate::lamellar_team::LamellarTeamRT; pub use crate::lamellar_world::LamellarWorld; pub use crate::lamellar_world::LamellarWorldBuilder; pub use crate::LamellarEnv; diff --git a/src/lamellae/command_queues.rs b/src/lamellae/command_queues.rs index a93f0297..b34ef295 100644 --- a/src/lamellae/command_queues.rs +++ 
b/src/lamellae/command_queues.rs @@ -223,7 +223,7 @@ impl Drop for CmdBuf { fn drop(&mut self) { // println!("dropping cmd buf"); let old = std::mem::take(&mut self.buf); - Box::into_raw(old); + let _ = Box::into_raw(old); // println!("dropped cmd buf"); } } @@ -1136,19 +1136,19 @@ impl Drop for InnerCQ { // println!("dropping InnerCQ"); let mut send_buf = self.send_buffer.lock(); let old = std::mem::take(&mut *send_buf); - Box::into_raw(old); + let _ = Box::into_raw(old); let mut recv_buf = self.recv_buffer.lock(); let old = std::mem::take(&mut *recv_buf); - Box::into_raw(old); + let _ = Box::into_raw(old); let mut free_buf = self.free_buffer.lock(); let old = std::mem::take(&mut *free_buf); - Box::into_raw(old); + let _ = Box::into_raw(old); let mut alloc_buffer = self.alloc_buffer.lock(); let old = std::mem::take(&mut *alloc_buffer); - Box::into_raw(old); + let _ = Box::into_raw(old); let mut panic_buffer = self.panic_buffer.lock(); let old = std::mem::take(&mut *panic_buffer); - Box::into_raw(old); + let _ = Box::into_raw(old); let old = std::mem::replace( Arc::get_mut(&mut self.release_cmd).unwrap(), Box::new(CmdMsg { @@ -1159,7 +1159,7 @@ impl Drop for InnerCQ { cmd_hash: 0, }), ); - Box::into_raw(old); + let _ = Box::into_raw(old); let old = std::mem::replace( Arc::get_mut(&mut self.clear_cmd).unwrap(), Box::new(CmdMsg { @@ -1170,7 +1170,7 @@ impl Drop for InnerCQ { cmd_hash: 0, }), ); - Box::into_raw(old); + let _ = Box::into_raw(old); let old = std::mem::replace( Arc::get_mut(&mut self.free_cmd).unwrap(), Box::new(CmdMsg { @@ -1181,7 +1181,7 @@ impl Drop for InnerCQ { cmd_hash: 0, }), ); - Box::into_raw(old); + let _ = Box::into_raw(old); self.cmd_buffers.clear(); // println!("dropped InnerCQ"); } diff --git a/src/lamellar_alloc.rs b/src/lamellar_alloc.rs index dd16c019..10d686d9 100644 --- a/src/lamellar_alloc.rs +++ b/src/lamellar_alloc.rs @@ -21,6 +21,7 @@ pub(crate) trait LamellarAlloc { } #[derive(Debug)] +#[allow(dead_code)] struct Vma { addr: usize, padding: usize, @@ -28,6 +29,7 @@ struct Vma { } #[derive(Clone)] +#[allow(dead_code)] pub(crate) struct LinearAlloc { entries: Arc<(Mutex>, Condvar)>, start_addr: usize, @@ -504,6 +506,7 @@ impl LamellarAlloc for BTreeAlloc { } #[derive(Clone)] +#[allow(dead_code)] pub(crate) struct ObjAlloc { free_entries: Arc<(Mutex>, Condvar)>, start_addr: usize, diff --git a/src/lamellar_team.rs b/src/lamellar_team.rs index 93c4c7f0..09938528 100644 --- a/src/lamellar_team.rs +++ b/src/lamellar_team.rs @@ -769,7 +769,7 @@ impl From>> for LamellarTeamRemotePtr { /// Internal Runtime handle to a lamellar team /// this is typically used by proc macros (hence why it is public) /// end users should never use this directly and should instead use the [LamellarTeam] and/or [LamellarWorld] struct -pub struct LamellarTeamRT { +pub(crate) struct LamellarTeamRT { #[allow(dead_code)] pub(crate) world: Option>>, parent: Option>>, @@ -1061,20 +1061,16 @@ impl LamellarTeamRT { // println!("team destroyed") } #[allow(dead_code)] - pub fn get_pes(&self) -> Vec { + pub(crate) fn get_pes(&self) -> Vec { self.arch.team_iter().collect::>() } - pub fn world_pe_id(&self) -> usize { - self.world_pe - } - - pub fn team_pe_id(&self) -> Result { + pub(crate) fn team_pe_id(&self) -> Result { self.arch.team_pe(self.world_pe) } //#[tracing::instrument(skip_all)] - pub fn create_subteam_from_arch( + pub(crate) fn create_subteam_from_arch( world: Pin>, parent: Pin>, arch: L, @@ -1219,12 +1215,12 @@ impl LamellarTeamRT { } //#[tracing::instrument(skip_all)] - pub fn 
num_pes(&self) -> usize { + pub(crate) fn num_pes(&self) -> usize { self.arch.num_pes() } //#[tracing::instrument(skip_all)] - pub fn num_threads(&self) -> usize { + pub(crate) fn num_threads(&self) -> usize { self.scheduler.num_workers() + 1 // plus one for the main thread } @@ -1360,7 +1356,7 @@ impl LamellarTeamRT { } //#[tracing::instrument(skip_all)] - pub fn print_arch(&self) { + pub(crate) fn print_arch(&self) { println!("-----mapping of team pe ids to parent pe ids-----"); let mut parent = format!(""); let mut team = format!(""); @@ -1568,7 +1564,7 @@ impl LamellarTeamRT { } //#[tracing::instrument(skip_all)] - pub fn exec_am_all(self: &Pin>, am: F) -> MultiAmHandle + pub(crate) fn exec_am_all(self: &Pin>, am: F) -> MultiAmHandle where F: RemoteActiveMessage + LamellarAM + AmDist, { @@ -1714,7 +1710,7 @@ impl LamellarTeamRT { } //#[tracing::instrument(skip_all)] - pub fn exec_am_pe(self: &Pin>, pe: usize, am: F) -> AmHandle + pub(crate) fn exec_am_pe(self: &Pin>, pe: usize, am: F) -> AmHandle where F: RemoteActiveMessage + LamellarAM + AmDist, { @@ -2124,7 +2120,7 @@ impl LamellarTeamRT { } //#[tracing::instrument(skip_all)] - pub fn exec_am_local(self: &Pin>, am: F) -> LocalAmHandle + pub(crate) fn exec_am_local(self: &Pin>, am: F) -> LocalAmHandle where F: LamellarActiveMessage + LocalAM + 'static, { @@ -2215,20 +2211,6 @@ impl LamellarTeamRT { // mr // } - /// allocate a local memory region from the asymmetric heap - /// - /// # Arguments - /// - /// * `size` - number of elements of T to allocate a memory region for -- (not size in bytes) - /// - //#[tracing::instrument(skip_all)] - pub fn try_alloc_one_sided_mem_region( - self: &Pin>, - size: usize, - ) -> Result, anyhow::Error> { - OneSidedMemoryRegion::try_new(size, self, self.lamellae.clone()) - } - /// allocate a local memory region from the asymmetric heap /// /// # Arguments diff --git a/src/lib.rs b/src/lib.rs index 8b76141a..179e65c4 100755 --- a/src/lib.rs +++ b/src/lib.rs @@ -286,7 +286,8 @@ pub use crate::lamellar_task_group::{ }; pub use crate::lamellar_team::LamellarTeam; // //#[doc(hidden)] -pub use crate::lamellar_team::{ArcLamellarTeam, LamellarTeamRT}; +pub use crate::lamellar_team::{ArcLamellarTeam}; +pub(crate) use crate::lamellar_team::{LamellarTeamRT}; pub use crate::lamellar_world::*; pub use crate::scheduler::ExecutorType; diff --git a/src/memregion.rs b/src/memregion.rs index 5851ed23..9faa1f23 100644 --- a/src/memregion.rs +++ b/src/memregion.rs @@ -193,14 +193,14 @@ impl From<&LamellarMemoryRegion> for LamellarArrayRdmaInput { impl TeamFrom<&LamellarMemoryRegion> for LamellarArrayRdmaInput { //#[tracing::instrument(skip_all)] - fn team_from(mr: &LamellarMemoryRegion, _team: &std::pin::Pin>) -> Self { + fn team_from(mr: &LamellarMemoryRegion, _team: &Arc) -> Self { LamellarArrayRdmaInput::LamellarMemRegion(mr.clone()) } } impl TeamFrom> for LamellarArrayRdmaInput { //#[tracing::instrument(skip_all)] - fn team_from(mr: LamellarMemoryRegion, _team: &std::pin::Pin>) -> Self { + fn team_from(mr: LamellarMemoryRegion, _team: &Arc) -> Self { LamellarArrayRdmaInput::LamellarMemRegion(mr) } } @@ -209,7 +209,7 @@ impl TeamTryFrom<&LamellarMemoryRegion> for LamellarArrayRdmaInput, - _team: &std::pin::Pin>, + _team: &Arc, ) -> Result { Ok(LamellarArrayRdmaInput::LamellarMemRegion(mr.clone())) } @@ -219,7 +219,7 @@ impl TeamTryFrom> for LamellarArrayRdmaInput //#[tracing::instrument(skip_all)] fn team_try_from( mr: LamellarMemoryRegion, - _team: &std::pin::Pin>, + _team: &Arc, ) -> Result { 
Ok(LamellarArrayRdmaInput::LamellarMemRegion(mr)) } @@ -234,14 +234,14 @@ impl From<&LamellarMemoryRegion> for LamellarArrayRdmaOutput { impl TeamFrom<&LamellarMemoryRegion> for LamellarArrayRdmaOutput { //#[tracing::instrument(skip_all)] - fn team_from(mr: &LamellarMemoryRegion, _team: &std::pin::Pin>) -> Self { + fn team_from(mr: &LamellarMemoryRegion, _team: &Arc) -> Self { LamellarArrayRdmaOutput::LamellarMemRegion(mr.clone()) } } impl TeamFrom> for LamellarArrayRdmaOutput { //#[tracing::instrument(skip_all)] - fn team_from(mr: LamellarMemoryRegion, _team: &std::pin::Pin>) -> Self { + fn team_from(mr: LamellarMemoryRegion, _team: &Arc) -> Self { LamellarArrayRdmaOutput::LamellarMemRegion(mr) } } @@ -250,7 +250,7 @@ impl TeamTryFrom<&LamellarMemoryRegion> for LamellarArrayRdmaOutput< //#[tracing::instrument(skip_all)] fn team_try_from( mr: &LamellarMemoryRegion, - _team: &std::pin::Pin>, + _team: &Arc, ) -> Result { Ok(LamellarArrayRdmaOutput::LamellarMemRegion(mr.clone())) } @@ -260,7 +260,7 @@ impl TeamTryFrom> for LamellarArrayRdmaOutput, - _team: &std::pin::Pin>, + _team: &Arc, ) -> Result { Ok(LamellarArrayRdmaOutput::LamellarMemRegion(mr)) } diff --git a/src/memregion/one_sided.rs b/src/memregion/one_sided.rs index a669be87..2c9924de 100644 --- a/src/memregion/one_sided.rs +++ b/src/memregion/one_sided.rs @@ -987,13 +987,13 @@ impl From<&OneSidedMemoryRegion> for LamellarArrayRdmaInput { } impl TeamFrom<&OneSidedMemoryRegion> for LamellarArrayRdmaInput { - fn team_from(smr: &OneSidedMemoryRegion, _team: &Pin>) -> Self { + fn team_from(smr: &OneSidedMemoryRegion, _team: &Arc) -> Self { LamellarArrayRdmaInput::LocalMemRegion(smr.clone()) } } impl TeamFrom> for LamellarArrayRdmaInput { - fn team_from(smr: OneSidedMemoryRegion, _team: &Pin>) -> Self { + fn team_from(smr: OneSidedMemoryRegion, _team: &Arc) -> Self { LamellarArrayRdmaInput::LocalMemRegion(smr) } } @@ -1005,13 +1005,13 @@ impl From<&OneSidedMemoryRegion> for LamellarArrayRdmaOutput { } impl TeamFrom<&OneSidedMemoryRegion> for LamellarArrayRdmaOutput { - fn team_from(smr: &OneSidedMemoryRegion, _team: &Pin>) -> Self { + fn team_from(smr: &OneSidedMemoryRegion, _team: &Arc) -> Self { LamellarArrayRdmaOutput::LocalMemRegion(smr.clone()) } } impl TeamFrom> for LamellarArrayRdmaOutput { - fn team_from(smr: OneSidedMemoryRegion, _team: &Pin>) -> Self { + fn team_from(smr: OneSidedMemoryRegion, _team: &Arc) -> Self { LamellarArrayRdmaOutput::LocalMemRegion(smr) } } @@ -1019,7 +1019,7 @@ impl TeamFrom> for LamellarArrayRdmaOutput { impl TeamTryFrom<&OneSidedMemoryRegion> for LamellarArrayRdmaInput { fn team_try_from( smr: &OneSidedMemoryRegion, - _team: &Pin>, + _team: &Arc, ) -> Result { Ok(LamellarArrayRdmaInput::LocalMemRegion(smr.clone())) } @@ -1028,7 +1028,7 @@ impl TeamTryFrom<&OneSidedMemoryRegion> for LamellarArrayRdmaInput TeamTryFrom> for LamellarArrayRdmaInput { fn team_try_from( smr: OneSidedMemoryRegion, - _team: &Pin>, + _team: &Arc, ) -> Result { Ok(LamellarArrayRdmaInput::LocalMemRegion(smr)) } @@ -1037,7 +1037,7 @@ impl TeamTryFrom> for LamellarArrayRdmaInput impl TeamTryFrom<&OneSidedMemoryRegion> for LamellarArrayRdmaOutput { fn team_try_from( smr: &OneSidedMemoryRegion, - _team: &Pin>, + _team: &Arc, ) -> Result { Ok(LamellarArrayRdmaOutput::LocalMemRegion(smr.clone())) } @@ -1046,7 +1046,7 @@ impl TeamTryFrom<&OneSidedMemoryRegion> for LamellarArrayRdmaOutput< impl TeamTryFrom> for LamellarArrayRdmaOutput { fn team_try_from( smr: OneSidedMemoryRegion, - _team: &Pin>, + _team: &Arc, ) -> Result { 
Ok(LamellarArrayRdmaOutput::LocalMemRegion(smr)) } diff --git a/src/memregion/prelude.rs b/src/memregion/prelude.rs index a941b3ab..3ec4a88a 100644 --- a/src/memregion/prelude.rs +++ b/src/memregion/prelude.rs @@ -6,7 +6,6 @@ pub use crate::memregion::{ pub use crate::active_messaging::ActiveMessaging; pub use crate::lamellar_team::LamellarTeam; //#[doc(hidden)] -pub use crate::lamellar_team::LamellarTeamRT; pub use crate::lamellar_world::LamellarWorld; pub use crate::lamellar_world::LamellarWorldBuilder; pub use crate::LamellarEnv; diff --git a/src/memregion/shared.rs b/src/memregion/shared.rs index f43fd2d9..ee84bb74 100644 --- a/src/memregion/shared.rs +++ b/src/memregion/shared.rs @@ -107,7 +107,7 @@ impl SharedMemoryRegion { team.async_barrier().await; let mut mr_t = MemoryRegion::::try_new(size, team.lamellae.clone(), alloc.clone()); - while let Err(e) = mr_t { + while let Err(_e) = mr_t { async_std::task::yield_now().await; team.lamellae.alloc_pool(size * std::mem::size_of::()); mr_t = MemoryRegion::try_new(size, team.lamellae.clone(), alloc.clone()); @@ -342,7 +342,7 @@ impl From<&SharedMemoryRegion> for LamellarArrayRdmaOutput { } impl TeamFrom<&SharedMemoryRegion> for LamellarArrayRdmaOutput { - fn team_from(smr: &SharedMemoryRegion, _team: &std::pin::Pin>) -> Self { + fn team_from(smr: &SharedMemoryRegion, _team: &Arc) -> Self { LamellarArrayRdmaOutput::SharedMemRegion(smr.clone()) } } @@ -355,7 +355,7 @@ impl From<&SharedMemoryRegion> for LamellarArrayRdmaInput { } impl TeamFrom<&SharedMemoryRegion> for LamellarArrayRdmaInput { - fn team_from(smr: &SharedMemoryRegion, _team: &std::pin::Pin>) -> Self { + fn team_from(smr: &SharedMemoryRegion, _team: &Arc) -> Self { LamellarArrayRdmaInput::SharedMemRegion(smr.clone()) } } @@ -363,7 +363,7 @@ impl TeamFrom<&SharedMemoryRegion> for LamellarArrayRdmaInput { impl TeamTryFrom<&SharedMemoryRegion> for LamellarArrayRdmaOutput { fn team_try_from( smr: &SharedMemoryRegion, - _team: &std::pin::Pin>, + _team: &Arc, ) -> Result { Ok(LamellarArrayRdmaOutput::SharedMemRegion(smr.clone())) } @@ -372,7 +372,7 @@ impl TeamTryFrom<&SharedMemoryRegion> for LamellarArrayRdmaOutput impl TeamTryFrom<&SharedMemoryRegion> for LamellarArrayRdmaInput { fn team_try_from( smr: &SharedMemoryRegion, - _team: &std::pin::Pin>, + _team: &Arc, ) -> Result { Ok(LamellarArrayRdmaInput::SharedMemRegion(smr.clone())) } diff --git a/src/warnings.rs b/src/warnings.rs index 76627492..06e3ca8e 100644 --- a/src/warnings.rs +++ b/src/warnings.rs @@ -51,6 +51,7 @@ impl<'a> RuntimeWarning<'a> { } } + #[cfg(feature = "runtime-warnings-panic")] fn panic(&self, msg: &str) { match self { RuntimeWarning::BarrierTimeout(_) => {} From 74e736027af12e29db28702502ca931868582dc0 Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Tue, 12 Nov 2024 22:41:05 -0800 Subject: [PATCH 109/116] formatting --- examples/array_examples/array_ops.rs | 12 +- examples/array_examples/array_put_get.rs | 11 +- .../array_examples/atomic_compare_exchange.rs | 3 +- examples/bandwidths/am_bw_get.rs | 16 +- examples/bandwidths/get_bw.rs | 4 +- examples/bandwidths/put_bw.rs | 8 +- examples/bandwidths/unsafe_array_get_bw.rs | 3 +- .../unsafe_array_get_unchecked_bw.rs | 3 +- examples/bandwidths/unsafe_array_put_bw.rs | 3 +- .../unsafe_array_put_unchecked_bw.rs | 3 +- examples/bandwidths/unsafe_array_store_bw.rs | 7 +- examples/kernels/cached_am_gemm.rs | 17 +- examples/rdma_examples/rdma_am.rs | 9 +- examples/rdma_examples/rdma_get.rs | 5 +- src/active_messaging/prelude.rs | 2 +- src/array.rs | 11 +- src/array/atomic.rs | 2 - src/array/atomic/iteration.rs | 2 +- src/array/global_lock_atomic.rs | 5 +- src/array/iterator/consumer.rs | 1 - .../distributed_iterator/consumer/collect.rs | 2 +- .../iterator/distributed_iterator/skip.rs | 6 +- .../iterator/distributed_iterator/step_by.rs | 6 +- .../local_iterator/consumer/collect.rs | 2 +- src/array/iterator/local_iterator/map.rs | 8 +- .../iterator/one_sided_iterator/chunks.rs | 3 +- src/array/local_lock_atomic.rs | 4 +- src/array/native_atomic.rs | 2 - src/array/native_atomic/handle.rs | 1 - src/array/operations/shift.rs | 1 - src/array/prelude.rs | 14 +- src/array/read_only.rs | 2 - src/array/unsafe/iteration/distributed.rs | 2 +- src/array/unsafe/iteration/local.rs | 6 +- src/lamellar_team.rs | 15 +- src/lamellar_world.rs | 16 +- src/lib.rs | 4 +- src/memregion.rs | 9 +- src/memregion/handle.rs | 5 +- src/memregion/shared.rs | 7 +- tests/array/arithmetic_ops/div_test.rs | 154 ++++---- tests/array/arithmetic_ops/fetch_div_test.rs | 206 ++++++----- tests/array/arithmetic_ops/fetch_mul_test.rs | 203 ++++++----- tests/array/arithmetic_ops/fetch_sub_test.rs | 337 +++++++++--------- tests/array/arithmetic_ops/mul_test.rs | 152 ++++---- tests/array/arithmetic_ops/sub_test.rs | 277 +++++++------- tests/array/atomic_ops/swap_test.rs | 227 ++++++------ 47 files changed, 898 insertions(+), 900 deletions(-) diff --git a/examples/array_examples/array_ops.rs b/examples/array_examples/array_ops.rs index 1ce43af2..a3b2328a 100644 --- a/examples/array_examples/array_ops.rs +++ b/examples/array_examples/array_ops.rs @@ -460,11 +460,15 @@ fn main() { let world = lamellar::LamellarWorldBuilder::new().build(); let num_pes = world.num_pes(); let my_pe = world.my_pe(); - let array_f64 = AtomicArray::::new(world.clone(), num_pes * 10, Distribution::Block).block(); //non intrinsic atomic, non bitwise + let array_f64 = + AtomicArray::::new(world.clone(), num_pes * 10, Distribution::Block).block(); //non intrinsic atomic, non bitwise let array_u8 = AtomicArray::::new(world.clone(), num_pes * 10, Distribution::Block).block(); //intrinsic atomic, bitwise - let array_i128 = AtomicArray::::new(world.clone(), num_pes * 10, Distribution::Block).block(); //non intrinsic atomic, bitwise - let array_custom = AtomicArray::::new(world.clone(), num_pes * 10, Distribution::Block).block(); //non intrinsic atomic, non bitwise - let _array_bool = AtomicArray::::new(world.clone(), num_pes * 10, Distribution::Block).block(); + let array_i128 = + AtomicArray::::new(world.clone(), num_pes * 10, Distribution::Block).block(); //non intrinsic atomic, bitwise + let array_custom = + AtomicArray::::new(world.clone(), num_pes * 10, Distribution::Block).block(); //non intrinsic atomic, non bitwise + let _array_bool = + 
AtomicArray::::new(world.clone(), num_pes * 10, Distribution::Block).block(); println!("ADD-----------------------"); test_add(array_f64.clone(), 0.0, 1.0); diff --git a/examples/array_examples/array_put_get.rs b/examples/array_examples/array_put_get.rs index b76ff8b2..6d774eaf 100644 --- a/examples/array_examples/array_put_get.rs +++ b/examples/array_examples/array_put_get.rs @@ -31,15 +31,8 @@ fn main() { UnsafeArray::::new(world.team(), total_len, Distribution::Block).await; let cyclic_array = UnsafeArray::::new(world.team(), total_len, Distribution::Cyclic).await; - let shared_mem_region = world - .alloc_shared_mem_region(total_len) - .await - - .into(); //Convert into abstract LamellarMemoryRegion - let local_mem_region = world - .alloc_one_sided_mem_region(total_len) - - .into(); + let shared_mem_region = world.alloc_shared_mem_region(total_len).await.into(); //Convert into abstract LamellarMemoryRegion + let local_mem_region = world.alloc_one_sided_mem_region(total_len).into(); initialize_array(&block_array).await; initialize_array(&cyclic_array).await; initialize_mem_region(&shared_mem_region); diff --git a/examples/array_examples/atomic_compare_exchange.rs b/examples/array_examples/atomic_compare_exchange.rs index 0aab9b41..45e9d4e0 100644 --- a/examples/array_examples/atomic_compare_exchange.rs +++ b/examples/array_examples/atomic_compare_exchange.rs @@ -44,7 +44,8 @@ fn main() { array.barrier(); array.print(); - let array_2 = AtomicArray::::new(world.team(), num_pes * 100000, Distribution::Cyclic).block(); + let array_2 = + AtomicArray::::new(world.team(), num_pes * 100000, Distribution::Cyclic).block(); array_2.dist_iter_mut().for_each(|x| x.store(0.0)).block(); array_2.barrier(); diff --git a/examples/bandwidths/am_bw_get.rs b/examples/bandwidths/am_bw_get.rs index aa26857e..4f7b24d0 100644 --- a/examples/bandwidths/am_bw_get.rs +++ b/examples/bandwidths/am_bw_get.rs @@ -76,12 +76,16 @@ fn main() { if my_pe == num_pes - 1 { for _j in (0..(2_u64.pow(exp))).step_by(num_bytes as usize) { let sub_timer = Instant::now(); - let _ = world.exec_am_pe(0, - DataAM { - array: array.clone(), - index: 0 as usize, - length: num_bytes as usize, - },).spawn(); + let _ = world + .exec_am_pe( + 0, + DataAM { + array: array.clone(), + index: 0 as usize, + length: num_bytes as usize, + }, + ) + .spawn(); sub_time += sub_timer.elapsed().as_secs_f64(); sum += num_bytes * 1 as u64; cnt += 1; diff --git a/examples/bandwidths/get_bw.rs b/examples/bandwidths/get_bw.rs index 2cd427b7..bc666633 100644 --- a/examples/bandwidths/get_bw.rs +++ b/examples/bandwidths/get_bw.rs @@ -12,9 +12,7 @@ fn main() { let world = lamellar::LamellarWorldBuilder::new().build(); let my_pe = world.my_pe(); let num_pes = world.num_pes(); - let mem_reg = world - .alloc_shared_mem_region::(MEMREG_LEN) - .block(); + let mem_reg = world.alloc_shared_mem_region::(MEMREG_LEN).block(); let data = world.alloc_one_sided_mem_region::(MEMREG_LEN); for j in 0..MEMREG_LEN as usize { unsafe { diff --git a/examples/bandwidths/put_bw.rs b/examples/bandwidths/put_bw.rs index 8cc2bed2..0f1b05aa 100644 --- a/examples/bandwidths/put_bw.rs +++ b/examples/bandwidths/put_bw.rs @@ -11,12 +11,8 @@ fn main() { let world = lamellar::LamellarWorldBuilder::new().build(); let my_pe = world.my_pe(); let num_pes = world.num_pes(); - let array = world - .alloc_shared_mem_region::(ARRAY_LEN) - .block(); - let data = world - .alloc_one_sided_mem_region::(ARRAY_LEN) - ; + let array = world.alloc_shared_mem_region::(ARRAY_LEN).block(); + let data = 
world.alloc_one_sided_mem_region::(ARRAY_LEN); unsafe { for i in data.as_mut_slice().unwrap() { *i = my_pe as u8; diff --git a/examples/bandwidths/unsafe_array_get_bw.rs b/examples/bandwidths/unsafe_array_get_bw.rs index e925dcc0..846862d3 100644 --- a/examples/bandwidths/unsafe_array_get_bw.rs +++ b/examples/bandwidths/unsafe_array_get_bw.rs @@ -13,7 +13,8 @@ fn main() { let world = lamellar::LamellarWorldBuilder::new().build(); let my_pe = world.my_pe(); let num_pes = world.num_pes(); - let array: UnsafeArray = UnsafeArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block).block(); + let array: UnsafeArray = + UnsafeArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block).block(); let data = world.alloc_one_sided_mem_region::(ARRAY_LEN); unsafe { for i in data.as_mut_slice().unwrap() { diff --git a/examples/bandwidths/unsafe_array_get_unchecked_bw.rs b/examples/bandwidths/unsafe_array_get_unchecked_bw.rs index cacf9910..84a52b36 100644 --- a/examples/bandwidths/unsafe_array_get_unchecked_bw.rs +++ b/examples/bandwidths/unsafe_array_get_unchecked_bw.rs @@ -12,7 +12,8 @@ fn main() { let world = lamellar::LamellarWorldBuilder::new().build(); let my_pe = world.my_pe(); let num_pes = world.num_pes(); - let array: UnsafeArray = UnsafeArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block).block(); + let array: UnsafeArray = + UnsafeArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block).block(); let data = world.alloc_one_sided_mem_region::(ARRAY_LEN); unsafe { for i in data.as_mut_slice().unwrap() { diff --git a/examples/bandwidths/unsafe_array_put_bw.rs b/examples/bandwidths/unsafe_array_put_bw.rs index 97fc60b3..13aac3b6 100644 --- a/examples/bandwidths/unsafe_array_put_bw.rs +++ b/examples/bandwidths/unsafe_array_put_bw.rs @@ -12,7 +12,8 @@ fn main() { let world = lamellar::LamellarWorldBuilder::new().build(); let my_pe = world.my_pe(); let num_pes = world.num_pes(); - let array: UnsafeArray = UnsafeArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block).block(); + let array: UnsafeArray = + UnsafeArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block).block(); let data = world.alloc_one_sided_mem_region::(ARRAY_LEN); unsafe { for i in data.as_mut_slice().unwrap() { diff --git a/examples/bandwidths/unsafe_array_put_unchecked_bw.rs b/examples/bandwidths/unsafe_array_put_unchecked_bw.rs index 20735c8b..82516ef3 100644 --- a/examples/bandwidths/unsafe_array_put_unchecked_bw.rs +++ b/examples/bandwidths/unsafe_array_put_unchecked_bw.rs @@ -12,7 +12,8 @@ fn main() { let world = lamellar::LamellarWorldBuilder::new().build(); let my_pe = world.my_pe(); let num_pes = world.num_pes(); - let array: UnsafeArray = UnsafeArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block).block(); + let array: UnsafeArray = + UnsafeArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block).block(); let data = world.alloc_one_sided_mem_region::(ARRAY_LEN); unsafe { for i in data.as_mut_slice().unwrap() { diff --git a/examples/bandwidths/unsafe_array_store_bw.rs b/examples/bandwidths/unsafe_array_store_bw.rs index 0316c4fc..1e643243 100644 --- a/examples/bandwidths/unsafe_array_store_bw.rs +++ b/examples/bandwidths/unsafe_array_store_bw.rs @@ -13,7 +13,8 @@ fn main() { let world = lamellar::LamellarWorldBuilder::new().build(); let my_pe = world.my_pe(); let num_pes = world.num_pes(); - let array: UnsafeArray = UnsafeArray::new(&world, ARRAY_LEN * num_pes, Distribution::Block).block(); + let array: UnsafeArray = + UnsafeArray::new(&world, ARRAY_LEN * num_pes, 
Distribution::Block).block(); let data = world.alloc_one_sided_mem_region::(ARRAY_LEN); unsafe { for i in data.as_mut_slice().unwrap() { @@ -58,7 +59,9 @@ fn main() { // array.get(ARRAY_LEN * (num_pes - 1), &sub_reg); let _ = unsafe { - array.batch_store(ARRAY_LEN * (num_pes - 1), sub_reg.as_slice().unwrap()).spawn() + array + .batch_store(ARRAY_LEN * (num_pes - 1), sub_reg.as_slice().unwrap()) + .spawn() }; sub_time += sub_timer.elapsed().as_secs_f64(); sum += num_bytes * 1 as u64; diff --git a/examples/kernels/cached_am_gemm.rs b/examples/kernels/cached_am_gemm.rs index f784e82c..7f5ce88b 100644 --- a/examples/kernels/cached_am_gemm.rs +++ b/examples/kernels/cached_am_gemm.rs @@ -104,8 +104,7 @@ struct MatMulAM { impl LamellarAM for MatMulAM { async fn exec() { let b = lamellar::world - .alloc_one_sided_mem_region::(self.b.block_size * self.b.block_size) - ; + .alloc_one_sided_mem_region::(self.b.block_size * self.b.block_size); get_sub_mat(&self.b, &b).await; // we dont actually want to alloc a shared memory region as there is an implicit barrier here // introduces sync point and potential for deadlock @@ -120,9 +119,8 @@ impl LamellarAM for MatMulAM { a.row_block = row; let mut c = self.c.clone(); c.row_block = row; - let sub_a = lamellar::world - .alloc_one_sided_mem_region::(a.block_size * a.block_size) - ; + let sub_a = + lamellar::world.alloc_one_sided_mem_region::(a.block_size * a.block_size); get_sub_mat(&a, &sub_a).await; //this should be local copy so returns immediately do_gemm(&sub_a, &b, c, self.block_size); } @@ -178,16 +176,13 @@ fn main() { let a = world .alloc_shared_mem_region::((m * n) / num_pes) - .block() - ; + .block(); let b = world .alloc_shared_mem_region::((n * p) / num_pes) - .block() - ; + .block(); let c = world .alloc_shared_mem_region::((m * p) / num_pes) - .block() - ; + .block(); // let c2 = world.alloc_shared_mem_region::((m * p) / num_pes); unsafe { let mut cnt = my_pe as f32 * ((m * n) / num_pes) as f32; diff --git a/examples/rdma_examples/rdma_am.rs b/examples/rdma_examples/rdma_am.rs index e35159f8..0da7201f 100644 --- a/examples/rdma_examples/rdma_am.rs +++ b/examples/rdma_examples/rdma_am.rs @@ -32,9 +32,7 @@ impl LamellarAM for RdmaAM { } //get the original nodes data - let local = lamellar::world - .alloc_one_sided_mem_region::(ARRAY_LEN) - ; + let local = lamellar::world.alloc_one_sided_mem_region::(ARRAY_LEN); let local_slice = unsafe { local.as_mut_slice().unwrap() }; local_slice[ARRAY_LEN - 1] = lamellar::num_pes as u8; unsafe { @@ -107,10 +105,7 @@ fn main() { let world = lamellar::LamellarWorldBuilder::new().build(); let my_pe = world.my_pe(); let num_pes = world.num_pes(); - let array = world - .alloc_shared_mem_region::(ARRAY_LEN) - .block() - ; + let array = world.alloc_shared_mem_region::(ARRAY_LEN).block(); let local_array = world.alloc_one_sided_mem_region::(ARRAY_LEN); unsafe { for i in array.as_mut_slice().unwrap() { diff --git a/examples/rdma_examples/rdma_get.rs b/examples/rdma_examples/rdma_get.rs index 17ced9e1..a9d80169 100644 --- a/examples/rdma_examples/rdma_get.rs +++ b/examples/rdma_examples/rdma_get.rs @@ -19,10 +19,7 @@ fn main() { if num_pes > 1 { // instatiates a shared memory region on every PE in world // all other pes can put/get into this region - let array = world - .alloc_shared_mem_region::(ARRAY_LEN) - .block() - ; + let array = world.alloc_shared_mem_region::(ARRAY_LEN).block(); let array_slice = unsafe { array.as_slice().unwrap() }; //we can unwrap because we know array is local // instatiates a local array 
whos memory is registered with // the underlying network device, so that it can be used diff --git a/src/active_messaging/prelude.rs b/src/active_messaging/prelude.rs index 6cfb237f..0c55163a 100644 --- a/src/active_messaging/prelude.rs +++ b/src/active_messaging/prelude.rs @@ -15,7 +15,7 @@ pub use crate::inventory; pub use crate::lamellar_arch::*; pub use crate::lamellar_team::LamellarTeam; //#[doc(hidden)] -pub use crate::lamellar_team::{IntoLamellarTeam}; +pub use crate::lamellar_team::IntoLamellarTeam; pub use crate::lamellar_world::LamellarWorld; pub use crate::lamellar_world::LamellarWorldBuilder; pub use crate::LamellarEnv; diff --git a/src/array.rs b/src/array.rs index ffa71295..3b5a0bb9 100644 --- a/src/array.rs +++ b/src/array.rs @@ -1036,7 +1036,6 @@ impl LamellarEnv for LamellarWriteArray { } } - // private sealed trait #[doc(hidden)] pub trait InnerArray: Sized { @@ -1076,7 +1075,8 @@ pub(crate) mod private { where F: LamellarActiveMessage + LocalAM + 'static, { - self.team_rt().exec_am_local_tg(am, Some(self.team_counters())) + self.team_rt() + .exec_am_local_tg(am, Some(self.team_counters())) } fn exec_am_pe_tg(&self, pe: usize, am: F) -> AmHandle where @@ -1103,7 +1103,8 @@ pub(crate) mod private { where F: RemoteActiveMessage + LamellarAM + AmDist, { - self.team_rt().exec_am_all_tg(am, Some(self.team_counters())) + self.team_rt() + .exec_am_all_tg(am, Some(self.team_counters())) } } } @@ -1111,7 +1112,9 @@ pub(crate) mod private { /// Represents a distributed array, providing some convenience functions for getting simple information about the array. /// This is mostly intended for use within the runtime (specifically for use in Proc Macros) but the available functions may be useful to endusers as well. #[enum_dispatch(LamellarReadArray,LamellarWriteArray)] -pub trait LamellarArray: private::LamellarArrayPrivate + ActiveMessaging + LamellarEnv { +pub trait LamellarArray: + private::LamellarArrayPrivate + ActiveMessaging + LamellarEnv +{ // #[doc(alias("One-sided", "onesided"))] /// Returns the team used to construct this array, the PEs in the team represent the same PEs which have a slice of data of the array /// diff --git a/src/array/atomic.rs b/src/array/atomic.rs index 9aa3041a..02b5511f 100644 --- a/src/array/atomic.rs +++ b/src/array/atomic.rs @@ -1166,8 +1166,6 @@ impl AtomicArray { } } - - // #[async_trait] impl AsyncTeamFrom<(Vec, Distribution)> for AtomicArray { async fn team_from(input: (Vec, Distribution), team: &Arc) -> Self { diff --git a/src/array/atomic/iteration.rs b/src/array/atomic/iteration.rs index b9ab104b..1bc7b463 100644 --- a/src/array/atomic/iteration.rs +++ b/src/array/atomic/iteration.rs @@ -7,9 +7,9 @@ use crate::array::iterator::{ private::{InnerIter, Sealed}, LamellarArrayIterators, LamellarArrayMutIterators, }; +use crate::array::private::ArrayExecAm; use crate::array::r#unsafe::private::UnsafeArrayInner; use crate::array::*; -use crate::array::private::ArrayExecAm; use crate::memregion::Dist; use self::iterator::IterLockFuture; diff --git a/src/array/global_lock_atomic.rs b/src/array/global_lock_atomic.rs index ad18e847..fba19706 100644 --- a/src/array/global_lock_atomic.rs +++ b/src/array/global_lock_atomic.rs @@ -734,7 +734,9 @@ impl AsyncFrom> for GlobalLockArray { async fn async_from(array: UnsafeArray) -> Self { // println!("GlobalLock from unsafe"); array.await_on_outstanding(DarcMode::GlobalLockArray).await; - let lock = GlobalRwDarc::new(array.team_rt(), ()).await.expect("PE in team"); + let lock = GlobalRwDarc::new(array.team_rt(), 
()) + .await + .expect("PE in team"); GlobalLockArray { lock: lock, @@ -743,7 +745,6 @@ impl AsyncFrom> for GlobalLockArray { } } - impl From> for GlobalLockByteArray { fn from(array: GlobalLockArray) -> Self { GlobalLockByteArray { diff --git a/src/array/iterator/consumer.rs b/src/array/iterator/consumer.rs index cd11f599..f5b25997 100644 --- a/src/array/iterator/consumer.rs +++ b/src/array/iterator/consumer.rs @@ -14,7 +14,6 @@ use std::pin::Pin; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; - // trait Consumer{ // type Item; // fn init(&self, start: usize, cnt: usize, _s: Sealed) -> Self; diff --git a/src/array/iterator/distributed_iterator/consumer/collect.rs b/src/array/iterator/distributed_iterator/consumer/collect.rs index 3e97f68d..bf6420a5 100644 --- a/src/array/iterator/distributed_iterator/consumer/collect.rs +++ b/src/array/iterator/distributed_iterator/consumer/collect.rs @@ -6,13 +6,13 @@ use crate::array::operations::ArrayOps; use crate::array::r#unsafe::private::UnsafeArrayInner; use crate::array::{AsyncTeamFrom, AsyncTeamInto, Distribution}; use crate::barrier::BarrierHandle; +use crate::lamellar_env::LamellarEnv; use crate::lamellar_request::LamellarRequest; use crate::lamellar_task_group::TaskGroupLocalAmHandle; use crate::lamellar_team::LamellarTeamRT; use crate::memregion::Dist; use crate::scheduler::LamellarTask; use crate::warnings::RuntimeWarning; -use crate::lamellar_env::LamellarEnv; use core::marker::PhantomData; use futures_util::{ready, Future}; diff --git a/src/array/iterator/distributed_iterator/skip.rs b/src/array/iterator/distributed_iterator/skip.rs index ca3aff9c..8368b3a9 100644 --- a/src/array/iterator/distributed_iterator/skip.rs +++ b/src/array/iterator/distributed_iterator/skip.rs @@ -9,9 +9,9 @@ pub struct Skip { } impl InnerIter for Skip { -fn lock_if_needed(&self, _s: Sealed) -> Option { - None - } + fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } fn iter_clone(&self, _s: Sealed) -> Self { Skip { iter: self.iter.iter_clone(Sealed), diff --git a/src/array/iterator/distributed_iterator/step_by.rs b/src/array/iterator/distributed_iterator/step_by.rs index 4692bb47..d7b928ab 100644 --- a/src/array/iterator/distributed_iterator/step_by.rs +++ b/src/array/iterator/distributed_iterator/step_by.rs @@ -9,9 +9,9 @@ pub struct StepBy { } impl InnerIter for StepBy { -fn lock_if_needed(&self, _s: Sealed) -> Option { - None - } + fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } fn iter_clone(&self, _s: Sealed) -> Self { StepBy { iter: self.iter.iter_clone(Sealed), diff --git a/src/array/iterator/local_iterator/consumer/collect.rs b/src/array/iterator/local_iterator/consumer/collect.rs index f060533e..85c78b35 100644 --- a/src/array/iterator/local_iterator/consumer/collect.rs +++ b/src/array/iterator/local_iterator/consumer/collect.rs @@ -5,13 +5,13 @@ use crate::array::iterator::{consumer::*, IterLockFuture}; use crate::array::operations::ArrayOps; use crate::array::r#unsafe::private::UnsafeArrayInner; use crate::array::{AsyncTeamFrom, AsyncTeamInto, Distribution}; +use crate::lamellar_env::LamellarEnv; use crate::lamellar_request::LamellarRequest; use crate::lamellar_task_group::TaskGroupLocalAmHandle; use crate::lamellar_team::LamellarTeamRT; use crate::memregion::Dist; use crate::scheduler::LamellarTask; use crate::warnings::RuntimeWarning; -use crate::lamellar_env::LamellarEnv; use core::marker::PhantomData; use futures_util::{ready, Future}; diff --git a/src/array/iterator/local_iterator/map.rs 
b/src/array/iterator/local_iterator/map.rs index 079ba86b..62ff4302 100644 --- a/src/array/iterator/local_iterator/map.rs +++ b/src/array/iterator/local_iterator/map.rs @@ -7,9 +7,9 @@ pub struct Map { } impl InnerIter for Map { -fn lock_if_needed(&self, _s: Sealed) -> Option { - None - } + fn lock_if_needed(&self, _s: Sealed) -> Option { + None + } fn iter_clone(&self, _s: Sealed) -> Self { Map { iter: self.iter.iter_clone(Sealed), @@ -36,7 +36,7 @@ where type Item = B; type Array = ::Array; fn init(&self, start_i: usize, cnt: usize, _s: Sealed) -> Map { - Map::new(self.iter.init(start_i, cnt,_s), self.f.clone()) + Map::new(self.iter.init(start_i, cnt, _s), self.f.clone()) } fn array(&self) -> Self::Array { self.iter.array() diff --git a/src/array/iterator/one_sided_iterator/chunks.rs b/src/array/iterator/one_sided_iterator/chunks.rs index 4ae204f6..78806725 100644 --- a/src/array/iterator/one_sided_iterator/chunks.rs +++ b/src/array/iterator/one_sided_iterator/chunks.rs @@ -1,12 +1,11 @@ use crate::array::iterator::one_sided_iterator::{private::*, *}; use crate::array::ArrayRdmaHandle; +use crate::lamellar_env::LamellarEnv; use crate::lamellar_request::LamellarRequest; use crate::memregion::OneSidedMemoryRegion; -use crate::lamellar_env::LamellarEnv; use pin_project::pin_project; - #[pin_project] pub struct Chunks where diff --git a/src/array/local_lock_atomic.rs b/src/array/local_lock_atomic.rs index 56fedf5a..eb362f5e 100644 --- a/src/array/local_lock_atomic.rs +++ b/src/array/local_lock_atomic.rs @@ -702,7 +702,9 @@ impl AsyncFrom> for LocalLockArray { async fn async_from(array: UnsafeArray) -> Self { // println!("locallock from unsafe"); array.await_on_outstanding(DarcMode::LocalLockArray).await; - let lock = LocalRwDarc::new(array.team_rt(), ()).await.expect("PE in team"); + let lock = LocalRwDarc::new(array.team_rt(), ()) + .await + .expect("PE in team"); LocalLockArray { lock: lock, diff --git a/src/array/native_atomic.rs b/src/array/native_atomic.rs index 2080e8ea..ea4d68c7 100644 --- a/src/array/native_atomic.rs +++ b/src/array/native_atomic.rs @@ -1018,7 +1018,6 @@ impl NativeAtomicArray { } } - impl AsyncTeamFrom<(Vec, Distribution)> for NativeAtomicArray { async fn team_from(input: (Vec, Distribution), team: &Arc) -> Self { let array: UnsafeArray = AsyncTeamInto::team_into(input, team).await; @@ -1026,7 +1025,6 @@ impl AsyncTeamFrom<(Vec, Distribution)> for NativeAtomicA } } - //#[doc(hidden)] #[async_trait] impl AsyncFrom> for NativeAtomicArray { diff --git a/src/array/native_atomic/handle.rs b/src/array/native_atomic/handle.rs index 90b45225..edb8d63e 100644 --- a/src/array/native_atomic/handle.rs +++ b/src/array/native_atomic/handle.rs @@ -28,7 +28,6 @@ impl PinnedDrop for NativeAtomicArrayHandle { } } - impl Future for NativeAtomicArrayHandle { type Output = NativeAtomicArray; fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { diff --git a/src/array/operations/shift.rs b/src/array/operations/shift.rs index 44945950..afe5e99e 100644 --- a/src/array/operations/shift.rs +++ b/src/array/operations/shift.rs @@ -351,7 +351,6 @@ pub trait ShiftOps: private::LamellarArrayPrivate { } } - #[doc(alias("One-sided", "onesided"))] /// The interface for performing remote Shift operations on [UnsafeArray] elements /// diff --git a/src/array/prelude.rs b/src/array/prelude.rs index 0d401297..a8174d1c 100644 --- a/src/array/prelude.rs +++ b/src/array/prelude.rs @@ -1,10 +1,10 @@ -pub use crate::array::atomic::{AtomicArray,AtomicArrayHandle}; -pub use 
crate::array::generic_atomic::{GenericAtomicArray}; -pub use crate::array::global_lock_atomic::{GlobalLockArray,handle::GlobalLockArrayHandle}; -pub use crate::array::local_lock_atomic::{LocalLockArray,handle::LocalLockArrayHandle}; -pub use crate::array::native_atomic::{NativeAtomicArray}; -pub use crate::array::r#unsafe::{UnsafeArray,UnsafeArrayHandle}; -pub use crate::array::read_only::{ReadOnlyArray,ReadOnlyArrayHandle}; +pub use crate::array::atomic::{AtomicArray, AtomicArrayHandle}; +pub use crate::array::generic_atomic::GenericAtomicArray; +pub use crate::array::global_lock_atomic::{handle::GlobalLockArrayHandle, GlobalLockArray}; +pub use crate::array::local_lock_atomic::{handle::LocalLockArrayHandle, LocalLockArray}; +pub use crate::array::native_atomic::NativeAtomicArray; +pub use crate::array::r#unsafe::{UnsafeArray, UnsafeArrayHandle}; +pub use crate::array::read_only::{ReadOnlyArray, ReadOnlyArrayHandle}; //#[doc(hidden)] pub use crate::array::{ register_reduction, diff --git a/src/array/read_only.rs b/src/array/read_only.rs index b6b1e90e..4e624070 100644 --- a/src/array/read_only.rs +++ b/src/array/read_only.rs @@ -345,7 +345,6 @@ impl ReadOnlyArray { } } - // #[async_trait] impl AsyncTeamFrom<(Vec, Distribution)> for ReadOnlyArray { async fn team_from(input: (Vec, Distribution), team: &Arc) -> Self { @@ -354,7 +353,6 @@ impl AsyncTeamFrom<(Vec, Distribution)> for ReadOnlyArray } } - #[async_trait] impl AsyncFrom> for ReadOnlyArray { async fn async_from(array: UnsafeArray) -> Self { diff --git a/src/array/unsafe/iteration/distributed.rs b/src/array/unsafe/iteration/distributed.rs index 6ac4c458..5a4e9be8 100644 --- a/src/array/unsafe/iteration/distributed.rs +++ b/src/array/unsafe/iteration/distributed.rs @@ -4,9 +4,9 @@ use crate::array::iterator::private::Sealed; use crate::array::iterator::Schedule; use crate::array::r#unsafe::{UnsafeArray, UnsafeArrayInner}; use crate::array::{ArrayOps, AsyncTeamFrom, Distribution, InnerArray}; +use crate::lamellar_env::LamellarEnv; use crate::memregion::Dist; use crate::LamellarTeam; -use crate::lamellar_env::LamellarEnv; use core::marker::PhantomData; use futures_util::Future; diff --git a/src/array/unsafe/iteration/local.rs b/src/array/unsafe/iteration/local.rs index bccd3b83..eb1a1732 100644 --- a/src/array/unsafe/iteration/local.rs +++ b/src/array/unsafe/iteration/local.rs @@ -1,12 +1,12 @@ use crate::active_messaging::SyncSend; use crate::array::iterator::local_iterator::*; use crate::array::iterator::private::*; +use crate::array::iterator::Schedule; use crate::array::r#unsafe::{UnsafeArray, UnsafeArrayInner}; use crate::array::{ArrayOps, AsyncTeamFrom, Distribution}; -use crate::LamellarTeam; -use crate::array::iterator::Schedule; -use crate::memregion::Dist; use crate::lamellar_env::LamellarEnv; +use crate::memregion::Dist; +use crate::LamellarTeam; use core::marker::PhantomData; use futures_util::Future; diff --git a/src/lamellar_team.rs b/src/lamellar_team.rs index 09938528..23c4b3d0 100644 --- a/src/lamellar_team.rs +++ b/src/lamellar_team.rs @@ -1065,7 +1065,7 @@ impl LamellarTeamRT { self.arch.team_iter().collect::>() } - pub(crate) fn team_pe_id(&self) -> Result { + pub(crate) fn team_pe_id(&self) -> Result { self.arch.team_pe(self.world_pe) } @@ -1356,7 +1356,7 @@ impl LamellarTeamRT { } //#[tracing::instrument(skip_all)] - pub(crate) fn print_arch(&self) { + pub(crate) fn print_arch(&self) { println!("-----mapping of team pe ids to parent pe ids-----"); let mut parent = format!(""); let mut team = format!(""); @@ -1710,7 
+1710,11 @@ impl LamellarTeamRT { } //#[tracing::instrument(skip_all)] - pub(crate) fn exec_am_pe(self: &Pin>, pe: usize, am: F) -> AmHandle + pub(crate) fn exec_am_pe( + self: &Pin>, + pe: usize, + am: F, + ) -> AmHandle where F: RemoteActiveMessage + LamellarAM + AmDist, { @@ -2120,7 +2124,10 @@ impl LamellarTeamRT { } //#[tracing::instrument(skip_all)] - pub(crate) fn exec_am_local(self: &Pin>, am: F) -> LocalAmHandle + pub(crate) fn exec_am_local( + self: &Pin>, + am: F, + ) -> LocalAmHandle where F: LamellarActiveMessage + LocalAM + 'static, { diff --git a/src/lamellar_world.rs b/src/lamellar_world.rs index 81c2866b..0db3554d 100644 --- a/src/lamellar_world.rs +++ b/src/lamellar_world.rs @@ -3,10 +3,8 @@ use crate::lamellae::{create_lamellae, Backend, Lamellae, LamellaeComm, Lamellae use crate::lamellar_arch::LamellarArch; use crate::lamellar_env::LamellarEnv; use crate::lamellar_team::{LamellarTeam, LamellarTeamRT}; -use crate::memregion::handle::{FallibleSharedMemoryRegionHandle,SharedMemoryRegionHandle}; -use crate::memregion::{ - one_sided::OneSidedMemoryRegion, Dist, RemoteMemoryRegion, -}; +use crate::memregion::handle::{FallibleSharedMemoryRegionHandle, SharedMemoryRegionHandle}; +use crate::memregion::{one_sided::OneSidedMemoryRegion, Dist, RemoteMemoryRegion}; use crate::scheduler::{create_scheduler, ExecutorType, LamellarTask}; use crate::{active_messaging::*, config}; // use log::trace; @@ -134,7 +132,10 @@ impl ActiveMessaging for LamellarWorld { impl RemoteMemoryRegion for LamellarWorld { //#[tracing::instrument(skip_all)] - fn try_alloc_shared_mem_region(&self, size: usize) -> FallibleSharedMemoryRegionHandle { + fn try_alloc_shared_mem_region( + &self, + size: usize, + ) -> FallibleSharedMemoryRegionHandle { self.team.try_alloc_shared_mem_region::(size) } @@ -152,10 +153,7 @@ impl RemoteMemoryRegion for LamellarWorld { } //#[tracing::instrument(skip_all)] - fn alloc_one_sided_mem_region( - &self, - size: usize, - ) -> OneSidedMemoryRegion { + fn alloc_one_sided_mem_region(&self, size: usize) -> OneSidedMemoryRegion { self.team.alloc_one_sided_mem_region::(size) } } diff --git a/src/lib.rs b/src/lib.rs index 179e65c4..e07ded3d 100755 --- a/src/lib.rs +++ b/src/lib.rs @@ -286,8 +286,8 @@ pub use crate::lamellar_task_group::{ }; pub use crate::lamellar_team::LamellarTeam; // //#[doc(hidden)] -pub use crate::lamellar_team::{ArcLamellarTeam}; -pub(crate) use crate::lamellar_team::{LamellarTeamRT}; +pub use crate::lamellar_team::ArcLamellarTeam; +pub(crate) use crate::lamellar_team::LamellarTeamRT; pub use crate::lamellar_world::*; pub use crate::scheduler::ExecutorType; diff --git a/src/memregion.rs b/src/memregion.rs index 9faa1f23..c86a4d00 100644 --- a/src/memregion.rs +++ b/src/memregion.rs @@ -1214,12 +1214,12 @@ pub trait RemoteMemoryRegion { #[doc(alias = "Collective")] /// Allocate a shared memory region from the asymmetric heap. /// There will be `size` number of `T` elements on each PE. - /// + /// /// Note: If there is not enough memory in the lamellar heap on the calling PE /// this call will trigger a "heap grow" operation (initiated and handled by the runtime), /// this behavior can be disabled by setting the env variable "LAMELLAR_HEAP_MODE=static", /// in which case this call will cause a panic if there is not enough memory. - /// + /// /// Alternatively, you can use the `try_alloc_shared_mem_region` method which returns /// a `Result` and allows you to handle the error case when there is not enough memory. 
/// @@ -1243,18 +1243,17 @@ pub trait RemoteMemoryRegion { size: usize, ) -> FallibleSharedMemoryRegionHandle; - #[doc(alias("One-sided", "onesided"))] /// Allocate a one-sided memory region from the internal lamellar heap. /// This region only exists on the calling PE, but the returned handle can be /// sent to other PEs allowing remote access to the region. /// There will be `size` number of `T` elements on the calling PE. - /// + /// /// Note: If there is not enough memory in the lamellar heap on the calling PE /// this call will trigger a "heap grow" operation (initiated and handled by the runtime), /// this behavior can be disabled by setting the env variable "LAMELLAR_HEAP_MODE=static", /// in which case this call will cause a panic if there is not enough memory. - /// + /// /// Alternatively, you can use the `try_alloc_one_sided_mem_region` method which returns /// a `Result` and allows you to handle the error case when there is not enough memory. /// diff --git a/src/memregion/handle.rs b/src/memregion/handle.rs index 4e7f4626..f7200393 100644 --- a/src/memregion/handle.rs +++ b/src/memregion/handle.rs @@ -7,7 +7,7 @@ use crate::scheduler::LamellarTask; use crate::warnings::RuntimeWarning; use crate::{Dist, LamellarTeamRT}; -use futures_util::{ Future}; +use futures_util::Future; use pin_project::{pin_project, pinned_drop}; #[must_use = " SharedMemoryRegion 'new' handles do nothing unless polled or awaited, or 'spawn()' or 'block()' are called"] @@ -114,8 +114,7 @@ pub struct SharedMemoryRegionHandle { pub(crate) team: Pin>, pub(crate) launched: bool, #[pin] - pub(crate) creation_future: - Pin> + Send>>, + pub(crate) creation_future: Pin> + Send>>, } #[pinned_drop] diff --git a/src/memregion/shared.rs b/src/memregion/shared.rs index ee84bb74..c7202ccc 100644 --- a/src/memregion/shared.rs +++ b/src/memregion/shared.rs @@ -105,7 +105,7 @@ impl SharedMemoryRegion { launched: false, creation_future: Box::pin(async move { team.async_barrier().await; - let mut mr_t = + let mut mr_t = MemoryRegion::::try_new(size, team.lamellae.clone(), alloc.clone()); while let Err(_e) = mr_t { async_std::task::yield_now().await; @@ -113,7 +113,10 @@ impl SharedMemoryRegion { mr_t = MemoryRegion::try_new(size, team.lamellae.clone(), alloc.clone()); } - let mr = unsafe { mr_t.expect("enough memory should have been allocated").to_base::() }; + let mr = unsafe { + mr_t.expect("enough memory should have been allocated") + .to_base::() + }; SharedMemoryRegion { mr: Darc::async_try_new_with_drop( team.clone(), diff --git a/tests/array/arithmetic_ops/div_test.rs b/tests/array/arithmetic_ops/div_test.rs index 66b28316..31807e9d 100644 --- a/tests/array/arithmetic_ops/div_test.rs +++ b/tests/array/arithmetic_ops/div_test.rs @@ -75,105 +75,103 @@ macro_rules! onesided_iter { }; } -macro_rules! div_test{ - ($array:ident, $t:ty, $len:expr, $dist:ident) =>{ - { - let world = lamellar::LamellarWorldBuilder::new().build(); - let num_pes = world.num_pes(); - let _my_pe = world.my_pe(); - let array_total_len = $len; - #[allow(unused_mut)] - let mut success = true; - let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).block().into(); //convert into abstract LamellarArray, distributed len is total_len +macro_rules! 
div_test { + ($array:ident, $t:ty, $len:expr, $dist:ident) => {{ + let world = lamellar::LamellarWorldBuilder::new().build(); + let num_pes = world.num_pes(); + let _my_pe = world.my_pe(); + let array_total_len = $len; + #[allow(unused_mut)] + let mut success = true; + let array: $array<$t> = $array::<$t>::new(world.team(), array_total_len, $dist) + .block() + .into(); //convert into abstract LamellarArray, distributed len is total_len - let max_updates = max_updates!($t,num_pes); - let max_val = 2u128.pow((max_updates*num_pes) as u32) as $t; - let one = 1 as $t; - let init_val = max_val as $t; - initialize_array!($array, array, init_val); - array.wait_all(); - array.barrier(); - // array.print(); - for idx in 0..array.len(){ - for _i in 0..(max_updates as usize){ - #[allow(unused_unsafe)] - let _ = unsafe{ array.div(idx,2 as $t).spawn()}; - } + let max_updates = max_updates!($t, num_pes); + let max_val = 2u128.pow((max_updates * num_pes) as u32) as $t; + let one = 1 as $t; + let init_val = max_val as $t; + initialize_array!($array, array, init_val); + array.wait_all(); + array.barrier(); + // array.print(); + for idx in 0..array.len() { + for _i in 0..(max_updates as usize) { + #[allow(unused_unsafe)] + let _ = unsafe { array.div(idx, 2 as $t).spawn() }; } - array.wait_all(); - array.barrier(); - // array.print(); - #[allow(unused_unsafe)] - for (i,elem) in unsafe {onesided_iter!($array,array).into_iter().enumerate()}{ - let val = *elem; - check_val!($array,val,one,success); - if !success{ - eprintln!("full {:?} {:?} {:?}",i,val,one); - } + } + array.wait_all(); + array.barrier(); + // array.print(); + #[allow(unused_unsafe)] + for (i, elem) in unsafe { onesided_iter!($array, array).into_iter().enumerate() } { + let val = *elem; + check_val!($array, val, one, success); + if !success { + eprintln!("full {:?} {:?} {:?}", i, val, one); } + } - array.barrier(); - initialize_array!($array, array, init_val); + array.barrier(); + initialize_array!($array, array, init_val); + let half_len = array_total_len / 2; + let start_i = half_len / 2; + let end_i = start_i + half_len; + let sub_array = array.sub_array(start_i..end_i); + sub_array.barrier(); + // // sub_array.print(); + for idx in 0..sub_array.len() { + for _i in 0..(max_updates as usize) { + #[allow(unused_unsafe)] + let _ = unsafe { sub_array.div(idx, 2 as $t).spawn() }; + } + } + sub_array.wait_all(); + sub_array.barrier(); + #[allow(unused_unsafe)] + for (i, elem) in unsafe { onesided_iter!($array, sub_array).into_iter().enumerate() } { + let val = *elem; + check_val!($array, val, one, success); + if !success { + eprintln!("half {:?} {:?} {:?}", i, val, one); + } + } + sub_array.barrier(); + initialize_array!($array, array, init_val); - let half_len = array_total_len/2; - let start_i = half_len/2; - let end_i = start_i + half_len; + let pe_len = array_total_len / num_pes; + for pe in 0..num_pes { + let len = std::cmp::max(pe_len / 2, 1); + let start_i = (pe * pe_len) + len / 2; + let end_i = start_i + len; let sub_array = array.sub_array(start_i..end_i); sub_array.barrier(); - // // sub_array.print(); - for idx in 0..sub_array.len(){ - for _i in 0..(max_updates as usize){ + for idx in 0..sub_array.len() { + for _i in 0..(max_updates as usize) { #[allow(unused_unsafe)] - let _ = unsafe{ sub_array.div(idx,2 as $t).spawn()}; + let _ = unsafe { sub_array.div(idx, 2 as $t).spawn() }; } } sub_array.wait_all(); sub_array.barrier(); #[allow(unused_unsafe)] - for (i,elem) in unsafe {onesided_iter!($array,sub_array).into_iter().enumerate()}{ + for 
(i, elem) in unsafe { onesided_iter!($array, sub_array).into_iter().enumerate() } { let val = *elem; - check_val!($array,val,one,success); - if !success{ - eprintln!("half {:?} {:?} {:?}",i,val,one); + check_val!($array, val, one, success); + if !success { + eprintln!("pe {:?} {:?} {:?}", i, val, one); } } sub_array.barrier(); initialize_array!($array, array, init_val); + } - - let pe_len = array_total_len/num_pes; - for pe in 0..num_pes{ - let len = std::cmp::max(pe_len/2,1); - let start_i = (pe*pe_len)+ len/2; - let end_i = start_i+len; - let sub_array = array.sub_array(start_i..end_i); - sub_array.barrier(); - for idx in 0..sub_array.len(){ - for _i in 0..(max_updates as usize){ - #[allow(unused_unsafe)] - let _ = unsafe{ sub_array.div(idx,2 as $t).spawn()}; - } - } - sub_array.wait_all(); - sub_array.barrier(); - #[allow(unused_unsafe)] - for (i,elem) in unsafe {onesided_iter!($array,sub_array).into_iter().enumerate()}{ - let val = *elem; - check_val!($array,val,one,success); - if !success{ - eprintln!("pe {:?} {:?} {:?}",i,val,one); - } - } - sub_array.barrier(); - initialize_array!($array, array, init_val); - } - - if !success{ - eprintln!("failed"); - } + if !success { + eprintln!("failed"); } - } + }}; } fn main() { diff --git a/tests/array/arithmetic_ops/fetch_div_test.rs b/tests/array/arithmetic_ops/fetch_div_test.rs index dd82db60..efbbfcef 100644 --- a/tests/array/arithmetic_ops/fetch_div_test.rs +++ b/tests/array/arithmetic_ops/fetch_div_test.rs @@ -105,77 +105,115 @@ macro_rules! onesided_iter { }; } -macro_rules! fetch_div_test{ - ($array:ident, $t:ty, $len:expr, $dist:ident) =>{ - { - let world = lamellar::LamellarWorldBuilder::new().build(); - let num_pes = world.num_pes(); - let _my_pe = world.my_pe(); - let array_total_len = $len; - #[allow(unused_mut)] - let mut success = true; - let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).block().into(); //convert into abstract LamellarArray, distributed len is total_len +macro_rules! fetch_div_test { + ($array:ident, $t:ty, $len:expr, $dist:ident) => {{ + let world = lamellar::LamellarWorldBuilder::new().build(); + let num_pes = world.num_pes(); + let _my_pe = world.my_pe(); + let array_total_len = $len; + #[allow(unused_mut)] + let mut success = true; + let array: $array<$t> = $array::<$t>::new(world.team(), array_total_len, $dist) + .block() + .into(); //convert into abstract LamellarArray, distributed len is total_len - let max_updates = max_updates!($t,num_pes); - let max_val = 2u128.pow((max_updates*num_pes) as u32) as $t; - let one = 1 as $t; - let init_val = max_val as $t; - initialize_array!($array, array, init_val); - array.wait_all(); - array.barrier(); - // array.print(); - for idx in 0..array.len(){ - let mut reqs = vec![]; - for _i in 0..(max_updates as usize){ - #[allow(unused_unsafe)] - reqs.push(unsafe{array.fetch_div(idx,2 as $t)}); - } - #[allow(unused_mut)] - let mut prevs: std::collections::HashSet = std::collections::HashSet::new(); - for req in reqs{ - let val = world.block_on(req) as u128; - if ! 
insert_prev!($array,val,prevs){ - eprintln!("full 1: {:?} {:?} {:?}",init_val,val,prevs); - success = false; - break; - } - } + let max_updates = max_updates!($t, num_pes); + let max_val = 2u128.pow((max_updates * num_pes) as u32) as $t; + let one = 1 as $t; + let init_val = max_val as $t; + initialize_array!($array, array, init_val); + array.wait_all(); + array.barrier(); + // array.print(); + for idx in 0..array.len() { + let mut reqs = vec![]; + for _i in 0..(max_updates as usize) { + #[allow(unused_unsafe)] + reqs.push(unsafe { array.fetch_div(idx, 2 as $t) }); } - array.wait_all(); - array.barrier(); - // array.print(); - #[allow(unused_unsafe)] - for (i,elem) in unsafe{onesided_iter!($array,array).into_iter().enumerate()}{ - let val = *elem; - check_val!($array,val,one,success); - if !success{ - eprintln!("{:?} {:?} {:?}",i,val,one); + #[allow(unused_mut)] + let mut prevs: std::collections::HashSet = std::collections::HashSet::new(); + for req in reqs { + let val = world.block_on(req) as u128; + if !insert_prev!($array, val, prevs) { + eprintln!("full 1: {:?} {:?} {:?}", init_val, val, prevs); + success = false; break; } } + } + array.wait_all(); + array.barrier(); + // array.print(); + #[allow(unused_unsafe)] + for (i, elem) in unsafe { onesided_iter!($array, array).into_iter().enumerate() } { + let val = *elem; + check_val!($array, val, one, success); + if !success { + eprintln!("{:?} {:?} {:?}", i, val, one); + break; + } + } - array.barrier(); - initialize_array!($array, array, init_val); + array.barrier(); + initialize_array!($array, array, init_val); + let half_len = array_total_len / 2; + let start_i = half_len / 2; + let end_i = start_i + half_len; + let sub_array = array.sub_array(start_i..end_i); + sub_array.barrier(); + // // sub_array.print(); + for idx in 0..sub_array.len() { + let mut reqs = vec![]; + for _i in 0..(max_updates as usize) { + #[allow(unused_unsafe)] + reqs.push(unsafe { sub_array.fetch_div(idx, 2 as $t) }); + } + #[allow(unused_mut)] + let mut prevs: std::collections::HashSet = std::collections::HashSet::new(); + for req in reqs { + let val = world.block_on(req) as u128; + if !insert_prev!($array, val, prevs) { + eprintln!("half 1: {:?} {:?}", val, prevs); + success = false; + break; + } + } + } + sub_array.wait_all(); + sub_array.barrier(); + #[allow(unused_unsafe)] + for (i, elem) in unsafe { onesided_iter!($array, sub_array).into_iter().enumerate() } { + let val = *elem; + check_val!($array, val, one, success); + if !success { + eprintln!("{:?} {:?} {:?}", i, val, one); + break; + } + } + sub_array.barrier(); + initialize_array!($array, array, init_val); - let half_len = array_total_len/2; - let start_i = half_len/2; - let end_i = start_i + half_len; + let pe_len = array_total_len / num_pes; + for pe in 0..num_pes { + let len = std::cmp::max(pe_len / 2, 1); + let start_i = (pe * pe_len) + len / 2; + let end_i = start_i + len; let sub_array = array.sub_array(start_i..end_i); sub_array.barrier(); - // // sub_array.print(); - for idx in 0..sub_array.len(){ + for idx in 0..sub_array.len() { let mut reqs = vec![]; - for _i in 0..(max_updates as usize){ + for _i in 0..(max_updates as usize) { #[allow(unused_unsafe)] - reqs.push(unsafe{sub_array.fetch_div(idx,2 as $t)}); + reqs.push(unsafe { sub_array.fetch_div(idx, 2 as $t) }); } #[allow(unused_mut)] let mut prevs: std::collections::HashSet = std::collections::HashSet::new(); - for req in reqs{ - let val = world.block_on(req) as u128; - if ! 
insert_prev!($array,val,prevs){ - eprintln!("half 1: {:?} {:?}",val,prevs); + for req in reqs { + let val = world.block_on(req) as u128; + if !insert_prev!($array, val, prevs) { + eprintln!("pe 1: {:?} {:?}", val, prevs); success = false; break; } @@ -184,62 +222,22 @@ macro_rules! fetch_div_test{ sub_array.wait_all(); sub_array.barrier(); #[allow(unused_unsafe)] - for (i,elem) in unsafe {onesided_iter!($array,sub_array).into_iter().enumerate()}{ + for (i, elem) in unsafe { onesided_iter!($array, sub_array).into_iter().enumerate() } { let val = *elem; - check_val!($array,val,one,success); - if !success{ - eprintln!("{:?} {:?} {:?}",i,val,one); + check_val!($array, val, one, success); + if !success { + eprintln!("{:?} {:?} {:?}", i, val, one); break; } } sub_array.barrier(); initialize_array!($array, array, init_val); + } - - let pe_len = array_total_len/num_pes; - for pe in 0..num_pes{ - let len = std::cmp::max(pe_len/2,1); - let start_i = (pe*pe_len)+ len/2; - let end_i = start_i+len; - let sub_array = array.sub_array(start_i..end_i); - sub_array.barrier(); - for idx in 0..sub_array.len(){ - let mut reqs = vec![]; - for _i in 0..(max_updates as usize){ - #[allow(unused_unsafe)] - reqs.push(unsafe{sub_array.fetch_div(idx,2 as $t)}); - } - #[allow(unused_mut)] - let mut prevs: std::collections::HashSet = std::collections::HashSet::new(); - for req in reqs{ - let val = world.block_on(req) as u128; - if ! insert_prev!($array,val,prevs){ - eprintln!("pe 1: {:?} {:?}",val,prevs); - success = false; - break; - } - } - } - sub_array.wait_all(); - sub_array.barrier(); - #[allow(unused_unsafe)] - for (i,elem) in unsafe {onesided_iter!($array,sub_array).into_iter().enumerate()}{ - let val = *elem; - check_val!($array,val,one,success); - if !success{ - eprintln!("{:?} {:?} {:?}",i,val,one); - break; - } - } - sub_array.barrier(); - initialize_array!($array, array, init_val); - } - - if !success{ - eprintln!("failed"); - } + if !success { + eprintln!("failed"); } - } + }}; } fn main() { diff --git a/tests/array/arithmetic_ops/fetch_mul_test.rs b/tests/array/arithmetic_ops/fetch_mul_test.rs index 76f901d9..f8449a62 100644 --- a/tests/array/arithmetic_ops/fetch_mul_test.rs +++ b/tests/array/arithmetic_ops/fetch_mul_test.rs @@ -100,75 +100,112 @@ macro_rules! onesided_iter { }; } -macro_rules! fetch_mul_test{ - ($array:ident, $t:ty, $len:expr, $dist:ident) =>{ - { - let world = lamellar::LamellarWorldBuilder::new().build(); - let num_pes = world.num_pes(); - let _my_pe = world.my_pe(); - let array_total_len = $len; - #[allow(unused_mut)] - let mut success = true; - let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).block().into(); //convert into abstract LamellarArray, distributed len is total_len +macro_rules! 
fetch_mul_test { + ($array:ident, $t:ty, $len:expr, $dist:ident) => {{ + let world = lamellar::LamellarWorldBuilder::new().build(); + let num_pes = world.num_pes(); + let _my_pe = world.my_pe(); + let array_total_len = $len; + #[allow(unused_mut)] + let mut success = true; + let array: $array<$t> = $array::<$t>::new(world.team(), array_total_len, $dist) + .block() + .into(); //convert into abstract LamellarArray, distributed len is total_len - let max_updates = max_updates!($t,num_pes); - let max_val = 2u128.pow((max_updates*num_pes) as u32) as $t; - let init_val = 1 as $t; - initialize_array!($array, array, init_val); - array.wait_all(); - array.barrier(); - // array.print(); - for idx in 0..array.len(){ - let mut reqs = vec![]; - for _i in 0..(max_updates as usize){ - #[allow(unused_unsafe)] - reqs.push(unsafe{array.fetch_mul(idx,2 as $t)}); - } - #[allow(unused_mut)] - let mut prevs: std::collections::HashSet = std::collections::HashSet::new(); - for req in reqs{ - let val = world.block_on(req) as u128; - if ! insert_prev!($array,val,prevs){ - eprintln!("full 1: {:?} {:?} {:?}",init_val,val,prevs); - success = false; - break; - } - } + let max_updates = max_updates!($t, num_pes); + let max_val = 2u128.pow((max_updates * num_pes) as u32) as $t; + let init_val = 1 as $t; + initialize_array!($array, array, init_val); + array.wait_all(); + array.barrier(); + // array.print(); + for idx in 0..array.len() { + let mut reqs = vec![]; + for _i in 0..(max_updates as usize) { + #[allow(unused_unsafe)] + reqs.push(unsafe { array.fetch_mul(idx, 2 as $t) }); } - array.wait_all(); - array.barrier(); - // array.print(); - #[allow(unused_unsafe)] - for (i,elem) in unsafe{onesided_iter!($array,array).into_iter().enumerate()}{ - let val = *elem; - check_val!($array,val,max_val,success); - if !success{ - eprintln!("{:?} {:?} {:?}",i,val,max_val); + #[allow(unused_mut)] + let mut prevs: std::collections::HashSet = std::collections::HashSet::new(); + for req in reqs { + let val = world.block_on(req) as u128; + if !insert_prev!($array, val, prevs) { + eprintln!("full 1: {:?} {:?} {:?}", init_val, val, prevs); + success = false; + break; } } + } + array.wait_all(); + array.barrier(); + // array.print(); + #[allow(unused_unsafe)] + for (i, elem) in unsafe { onesided_iter!($array, array).into_iter().enumerate() } { + let val = *elem; + check_val!($array, val, max_val, success); + if !success { + eprintln!("{:?} {:?} {:?}", i, val, max_val); + } + } - array.barrier(); - initialize_array!($array, array, init_val); + array.barrier(); + initialize_array!($array, array, init_val); + let half_len = array_total_len / 2; + let start_i = half_len / 2; + let end_i = start_i + half_len; + let sub_array = array.sub_array(start_i..end_i); + sub_array.barrier(); + // // sub_array.print(); + for idx in 0..sub_array.len() { + let mut reqs = vec![]; + for _i in 0..(max_updates as usize) { + #[allow(unused_unsafe)] + reqs.push(unsafe { sub_array.fetch_mul(idx, 2 as $t) }); + } + #[allow(unused_mut)] + let mut prevs: std::collections::HashSet = std::collections::HashSet::new(); + for req in reqs { + let val = world.block_on(req) as u128; + if !insert_prev!($array, val, prevs) { + eprintln!("half 1: {:?} {:?}", val, prevs); + success = false; + break; + } + } + } + sub_array.wait_all(); + sub_array.barrier(); + #[allow(unused_unsafe)] + for (i, elem) in unsafe { onesided_iter!($array, sub_array).into_iter().enumerate() } { + let val = *elem; + check_val!($array, val, max_val, success); + if !success { + eprintln!("{:?} {:?} {:?}", 
i, val, max_val); + } + } + sub_array.barrier(); + initialize_array!($array, array, init_val); - let half_len = array_total_len/2; - let start_i = half_len/2; - let end_i = start_i + half_len; + let pe_len = array_total_len / num_pes; + for pe in 0..num_pes { + let len = std::cmp::max(pe_len / 2, 1); + let start_i = (pe * pe_len) + len / 2; + let end_i = start_i + len; let sub_array = array.sub_array(start_i..end_i); sub_array.barrier(); - // // sub_array.print(); - for idx in 0..sub_array.len(){ + for idx in 0..sub_array.len() { let mut reqs = vec![]; - for _i in 0..(max_updates as usize){ + for _i in 0..(max_updates as usize) { #[allow(unused_unsafe)] - reqs.push(unsafe{sub_array.fetch_mul(idx,2 as $t)}); + reqs.push(unsafe { sub_array.fetch_mul(idx, 2 as $t) }); } #[allow(unused_mut)] - let mut prevs: std::collections::HashSet = std::collections::HashSet::new(); - for req in reqs{ - let val = world.block_on(req) as u128; - if ! insert_prev!($array,val,prevs){ - eprintln!("half 1: {:?} {:?}",val,prevs); + let mut prevs: std::collections::HashSet = std::collections::HashSet::new(); + for req in reqs { + let val = world.block_on(req) as u128; + if !insert_prev!($array, val, prevs) { + eprintln!("pe 1: {:?} {:?}", val, prevs); success = false; break; } @@ -177,59 +214,21 @@ macro_rules! fetch_mul_test{ sub_array.wait_all(); sub_array.barrier(); #[allow(unused_unsafe)] - for (i,elem) in unsafe{onesided_iter!($array,sub_array).into_iter().enumerate()}{ + for (i, elem) in unsafe { onesided_iter!($array, sub_array).into_iter().enumerate() } { let val = *elem; - check_val!($array,val,max_val,success); - if !success{ - eprintln!("{:?} {:?} {:?}",i,val,max_val); + check_val!($array, val, max_val, success); + if !success { + eprintln!("{:?} {:?} {:?}", i, val, max_val); } } sub_array.barrier(); initialize_array!($array, array, init_val); + } - let pe_len = array_total_len/num_pes; - for pe in 0..num_pes{ - let len = std::cmp::max(pe_len/2,1); - let start_i = (pe*pe_len)+ len/2; - let end_i = start_i+len; - let sub_array = array.sub_array(start_i..end_i); - sub_array.barrier(); - for idx in 0..sub_array.len(){ - let mut reqs = vec![]; - for _i in 0..(max_updates as usize){ - #[allow(unused_unsafe)] - reqs.push(unsafe{sub_array.fetch_mul(idx,2 as $t)}); - } - #[allow(unused_mut)] - let mut prevs: std::collections::HashSet = std::collections::HashSet::new(); - for req in reqs{ - let val = world.block_on(req) as u128; - if ! insert_prev!($array,val,prevs){ - eprintln!("pe 1: {:?} {:?}",val,prevs); - success = false; - break; - } - } - } - sub_array.wait_all(); - sub_array.barrier(); - #[allow(unused_unsafe)] - for (i,elem) in unsafe{onesided_iter!($array,sub_array).into_iter().enumerate()}{ - let val = *elem; - check_val!($array,val,max_val,success); - if !success{ - eprintln!("{:?} {:?} {:?}",i,val,max_val); - } - } - sub_array.barrier(); - initialize_array!($array, array, init_val); - } - - if !success{ - eprintln!("failed"); - } + if !success { + eprintln!("failed"); } - } + }}; } fn main() { diff --git a/tests/array/arithmetic_ops/fetch_sub_test.rs b/tests/array/arithmetic_ops/fetch_sub_test.rs index eec3d49a..9ea5b746 100644 --- a/tests/array/arithmetic_ops/fetch_sub_test.rs +++ b/tests/array/arithmetic_ops/fetch_sub_test.rs @@ -95,104 +95,174 @@ macro_rules! onesided_iter { }; } -macro_rules! 
fetch_sub_test{ - ($array:ident, $t:ty, $len:expr, $dist:ident) =>{ - { - let world = lamellar::LamellarWorldBuilder::new().build(); - let num_pes = world.num_pes(); - let _my_pe = world.my_pe(); - let array_total_len = $len; +macro_rules! fetch_sub_test { + ($array:ident, $t:ty, $len:expr, $dist:ident) => {{ + let world = lamellar::LamellarWorldBuilder::new().build(); + let num_pes = world.num_pes(); + let _my_pe = world.my_pe(); + let array_total_len = $len; - let mut rng = rand::thread_rng(); - let rand_idx = Uniform::from(0..array_total_len); - let mut success = true; - let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).block().into(); //convert into abstract LamellarArray, distributed len is total_len + let mut rng = rand::thread_rng(); + let rand_idx = Uniform::from(0..array_total_len); + let mut success = true; + let array: $array<$t> = $array::<$t>::new(world.team(), array_total_len, $dist) + .block() + .into(); //convert into abstract LamellarArray, distributed len is total_len - let pe_max_val: $t = 10 as $t; - let max_val = pe_max_val * num_pes as $t; - let init_val = max_val as $t; - #[allow(unused)] - let zero = 0 as $t; - initialize_array!($array, array, init_val); - array.wait_all(); - array.barrier(); - for idx in 0..array.len(){ - let mut reqs = vec![]; - for _i in 0..(pe_max_val as usize){ - #[allow(unused_unsafe)] - reqs.push(unsafe{array.fetch_sub(idx,1 as $t)}); - } - #[allow(unused_mut)] - let mut prevs: std::collections::HashSet = std::collections::HashSet::new(); - for req in reqs{ - let val = world.block_on(req) as u128; - if ! insert_prev!($array,val,prevs){ - eprintln!("full 1: {:?} {:?} {:?}",init_val,val,prevs); - success = false; - break; - } - } + let pe_max_val: $t = 10 as $t; + let max_val = pe_max_val * num_pes as $t; + let init_val = max_val as $t; + #[allow(unused)] + let zero = 0 as $t; + initialize_array!($array, array, init_val); + array.wait_all(); + array.barrier(); + for idx in 0..array.len() { + let mut reqs = vec![]; + for _i in 0..(pe_max_val as usize) { + #[allow(unused_unsafe)] + reqs.push(unsafe { array.fetch_sub(idx, 1 as $t) }); } - array.barrier(); - #[allow(unused_unsafe)] - for (i,elem) in unsafe{onesided_iter!($array,array).into_iter().enumerate()}{ - let val = *elem; - check_val!($array,val,zero,success); - if !success{ - eprintln!("{:?} {:?} {:?}",i,val,max_val); + #[allow(unused_mut)] + let mut prevs: std::collections::HashSet = std::collections::HashSet::new(); + for req in reqs { + let val = world.block_on(req) as u128; + if !insert_prev!($array, val, prevs) { + eprintln!("full 1: {:?} {:?} {:?}", init_val, val, prevs); + success = false; + break; } } - array.barrier(); - let num_updates=max_updates!($t,num_pes); - let tot_updates = (num_updates*num_pes) as $t; - initialize_array!($array, array, tot_updates); - array.wait_all(); - array.barrier(); - // let mut prev_vals = vec![tot_updates as $t;array.len()]; + } + array.barrier(); + #[allow(unused_unsafe)] + for (i, elem) in unsafe { onesided_iter!($array, array).into_iter().enumerate() } { + let val = *elem; + check_val!($array, val, zero, success); + if !success { + eprintln!("{:?} {:?} {:?}", i, val, max_val); + } + } + array.barrier(); + let num_updates = max_updates!($t, num_pes); + let tot_updates = (num_updates * num_pes) as $t; + initialize_array!($array, array, tot_updates); + array.wait_all(); + array.barrier(); + // let mut prev_vals = vec![tot_updates as $t;array.len()]; + + let mut reqs = vec![]; + // println!("2------------"); + for _i 
in 0..num_updates { + let idx = rand_idx.sample(&mut rng); + #[allow(unused_unsafe)] + reqs.push((unsafe { array.fetch_sub(idx, 1 as $t) }, idx)) + } + for (req, _idx) in reqs { + let _val = world.block_on(req); + } + array.barrier(); + #[allow(unused_unsafe)] + let sum = unsafe { + onesided_iter!($array, array) + .into_iter() + .fold(0, |acc, x| acc + *x as usize) + }; + let calced_sum = tot_updates as usize * (array.len() - 1); + check_val!($array, sum, calced_sum, success); + if !success { + eprintln!("{:?} {:?} {:?}", sum, calced_sum, (array.len() - 1)); + } + world.wait_all(); + world.barrier(); + initialize_array!($array, array, init_val); + + let half_len = array_total_len / 2; + let start_i = half_len / 2; + let end_i = start_i + half_len; + let rand_idx = Uniform::from(0..half_len); + let sub_array = array.sub_array(start_i..end_i); + sub_array.barrier(); + for idx in 0..sub_array.len() { let mut reqs = vec![]; - // println!("2------------"); - for _i in 0..num_updates{ - let idx = rand_idx.sample(&mut rng); + for _i in 0..(pe_max_val as usize) { #[allow(unused_unsafe)] - reqs.push((unsafe{array.fetch_sub(idx,1 as $t)},idx)) + reqs.push(unsafe { sub_array.fetch_sub(idx, 1 as $t) }); } - for (req,_idx) in reqs{ - let _val = world.block_on(req); + #[allow(unused_mut)] + let mut prevs: std::collections::HashSet = std::collections::HashSet::new(); + for req in reqs { + let val = world.block_on(req) as u128; + if !insert_prev!($array, val, prevs) { + eprintln!("half 1: {:?} {:?}", val, prevs); + success = false; + break; + } } - - array.barrier(); - #[allow(unused_unsafe)] - let sum = unsafe {onesided_iter!($array,array).into_iter().fold(0,|acc,x| acc+ *x as usize)}; - let calced_sum = tot_updates as usize * (array.len()-1); - check_val!($array,sum,calced_sum,success); - if !success{ - eprintln!("{:?} {:?} {:?}",sum,calced_sum,(array.len()-1)); + } + sub_array.barrier(); + #[allow(unused_unsafe)] + for (i, elem) in unsafe { onesided_iter!($array, sub_array).into_iter().enumerate() } { + let val = *elem; + check_val!($array, val, zero, success); + if !success { + eprintln!("{:?} {:?} {:?}", i, val, max_val); } - world.wait_all(); - world.barrier(); - initialize_array!($array, array, init_val); - - + } + sub_array.barrier(); + let num_updates = max_updates!($t, num_pes); + let tot_updates = (num_updates * num_pes) as $t; + initialize_array!($array, array, tot_updates); + sub_array.wait_all(); + sub_array.barrier(); + // let mut prev_vals = vec![tot_updates ;sub_array.len()]; + let mut reqs = vec![]; + // println!("2------------"); + for _i in 0..num_updates { + let idx = rand_idx.sample(&mut rng); + #[allow(unused_unsafe)] + reqs.push((unsafe { sub_array.fetch_sub(idx, 1 as $t) }, idx)) + } + for (req, _idx) in reqs { + let _val = world.block_on(req); + } + sub_array.barrier(); + #[allow(unused_unsafe)] + let sum = unsafe { + onesided_iter!($array, sub_array) + .into_iter() + .fold(0, |acc, x| acc + *x as usize) + }; + let calced_sum = tot_updates as usize * (sub_array.len() - 1); + check_val!($array, sum, calced_sum, success); + if !success { + eprintln!("{:?} {:?} {:?}", sum, calced_sum, (sub_array.len() - 1)); + } + sub_array.wait_all(); + sub_array.barrier(); + initialize_array!($array, array, init_val); - let half_len = array_total_len/2; - let start_i = half_len/2; - let end_i = start_i + half_len; - let rand_idx = Uniform::from(0..half_len); + let pe_len = array_total_len / num_pes; + for pe in 0..num_pes { + let len = std::cmp::max(pe_len / 2, 1); + let start_i = (pe * pe_len) + 
len / 2; + let end_i = start_i + len; + let rand_idx = Uniform::from(0..len); let sub_array = array.sub_array(start_i..end_i); sub_array.barrier(); - for idx in 0..sub_array.len(){ + for idx in 0..sub_array.len() { let mut reqs = vec![]; - for _i in 0..(pe_max_val as usize){ + for _i in 0..(pe_max_val as usize) { #[allow(unused_unsafe)] - reqs.push(unsafe{sub_array.fetch_sub(idx,1 as $t)}); + reqs.push(unsafe { sub_array.fetch_sub(idx, 1 as $t) }); } #[allow(unused_mut)] let mut prevs: std::collections::HashSet = std::collections::HashSet::new(); - for req in reqs{ - let val = world.block_on(req) as u128; - if ! insert_prev!($array,val,prevs){ - eprintln!("half 1: {:?} {:?}",val,prevs); + for req in reqs { + let val = world.block_on(req) as u128; + if !insert_prev!($array, val, prevs) { + eprintln!("pe 1: {:?} {:?}", val, prevs); success = false; break; } @@ -200,112 +270,51 @@ macro_rules! fetch_sub_test{ } sub_array.barrier(); #[allow(unused_unsafe)] - for (i,elem) in unsafe{onesided_iter!($array,sub_array).into_iter().enumerate()}{ + for (i, elem) in unsafe { onesided_iter!($array, sub_array).into_iter().enumerate() } { let val = *elem; - check_val!($array,val,zero,success); - if !success{ - eprintln!("{:?} {:?} {:?}",i,val,max_val); + check_val!($array, val, zero, success); + if !success { + eprintln!("{:?} {:?} {:?}", i, val, max_val); } } sub_array.barrier(); - let num_updates=max_updates!($t,num_pes); - let tot_updates = (num_updates*num_pes) as $t; + let num_updates = max_updates!($t, num_pes); + let tot_updates = (num_updates * num_pes) as $t; initialize_array!($array, array, tot_updates); sub_array.wait_all(); sub_array.barrier(); - // let mut prev_vals = vec![tot_updates ;sub_array.len()]; let mut reqs = vec![]; // println!("2------------"); - for _i in 0..num_updates{ + for _i in 0..num_updates { let idx = rand_idx.sample(&mut rng); #[allow(unused_unsafe)] - reqs.push((unsafe{sub_array.fetch_sub(idx,1 as $t)},idx)) + reqs.push((unsafe { sub_array.fetch_sub(idx, 1 as $t) }, idx)) } - for (req,_idx) in reqs{ - let _val = world.block_on(req); + for (req, _idx) in reqs { + let _val = world.block_on(req); } + sub_array.barrier(); #[allow(unused_unsafe)] - let sum = unsafe{onesided_iter!($array,sub_array).into_iter().fold(0,|acc,x| acc+ *x as usize)}; - let calced_sum = tot_updates as usize * (sub_array.len()-1); - check_val!($array,sum,calced_sum,success); - if !success{ - eprintln!("{:?} {:?} {:?}",sum,calced_sum,(sub_array.len()-1)); + let sum = unsafe { + onesided_iter!($array, sub_array) + .into_iter() + .fold(0, |acc, x| acc + *x as usize) + }; + let calced_sum = tot_updates as usize * (sub_array.len() - 1); + check_val!($array, sum, calced_sum, success); + if !success { + eprintln!("{:?} {:?} {:?}", sum, calced_sum, (sub_array.len() - 1)); } sub_array.wait_all(); sub_array.barrier(); initialize_array!($array, array, init_val); + } - - let pe_len = array_total_len/num_pes; - for pe in 0..num_pes{ - let len = std::cmp::max(pe_len/2,1); - let start_i = (pe*pe_len)+ len/2; - let end_i = start_i+len; - let rand_idx = Uniform::from(0..len); - let sub_array = array.sub_array(start_i..end_i); - sub_array.barrier(); - for idx in 0..sub_array.len(){ - let mut reqs = vec![]; - for _i in 0..(pe_max_val as usize){ - #[allow(unused_unsafe)] - reqs.push(unsafe{sub_array.fetch_sub(idx,1 as $t)}); - } - #[allow(unused_mut)] - let mut prevs: std::collections::HashSet = std::collections::HashSet::new(); - for req in reqs{ - let val = world.block_on(req) as u128; - if ! 
insert_prev!($array,val,prevs){ - eprintln!("pe 1: {:?} {:?}",val,prevs); - success = false; - break; - } - } - } - sub_array.barrier(); - #[allow(unused_unsafe)] - for (i,elem) in unsafe{onesided_iter!($array,sub_array).into_iter().enumerate()}{ - let val = *elem; - check_val!($array,val,zero,success); - if !success{ - eprintln!("{:?} {:?} {:?}",i,val,max_val); - } - } - sub_array.barrier(); - let num_updates=max_updates!($t,num_pes); - let tot_updates = (num_updates*num_pes) as $t; - initialize_array!($array, array, tot_updates); - sub_array.wait_all(); - sub_array.barrier(); - let mut reqs = vec![]; - // println!("2------------"); - for _i in 0..num_updates{ - let idx = rand_idx.sample(&mut rng); - #[allow(unused_unsafe)] - reqs.push((unsafe{sub_array.fetch_sub(idx,1 as $t)},idx)) - } - for (req,_idx) in reqs{ - let _val = world.block_on(req); - } - - sub_array.barrier(); - #[allow(unused_unsafe)] - let sum = unsafe{onesided_iter!($array,sub_array).into_iter().fold(0,|acc,x| acc+ *x as usize)}; - let calced_sum = tot_updates as usize * (sub_array.len()-1); - check_val!($array,sum,calced_sum,success); - if !success{ - eprintln!("{:?} {:?} {:?}",sum,calced_sum,(sub_array.len()-1)); - } - sub_array.wait_all(); - sub_array.barrier(); - initialize_array!($array, array, init_val); - } - - if !success{ - eprintln!("failed"); - } + if !success { + eprintln!("failed"); } - } + }}; } fn main() { diff --git a/tests/array/arithmetic_ops/mul_test.rs b/tests/array/arithmetic_ops/mul_test.rs index 76a7d57f..55817cee 100644 --- a/tests/array/arithmetic_ops/mul_test.rs +++ b/tests/array/arithmetic_ops/mul_test.rs @@ -83,104 +83,102 @@ macro_rules! onesided_iter { }; } -macro_rules! mul_test{ - ($array:ident, $t:ty, $len:expr, $dist:ident) =>{ - { - let world = lamellar::LamellarWorldBuilder::new().build(); - let num_pes = world.num_pes(); - let _my_pe = world.my_pe(); - let array_total_len = $len; - #[allow(unused_mut)] - let mut success = true; - let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).block().into(); //convert into abstract LamellarArray, distributed len is total_len +macro_rules! 
mul_test { + ($array:ident, $t:ty, $len:expr, $dist:ident) => {{ + let world = lamellar::LamellarWorldBuilder::new().build(); + let num_pes = world.num_pes(); + let _my_pe = world.my_pe(); + let array_total_len = $len; + #[allow(unused_mut)] + let mut success = true; + let array: $array<$t> = $array::<$t>::new(world.team(), array_total_len, $dist) + .block() + .into(); //convert into abstract LamellarArray, distributed len is total_len - let max_updates = max_updates!($t,num_pes); - let max_val = 2u128.pow((max_updates*num_pes) as u32) as $t; - let init_val = 1 as $t; - initialize_array!($array, array, init_val); - array.wait_all(); - array.barrier(); - // array.print(); - for idx in 0..array.len(){ - for _i in 0..(max_updates as usize){ - #[allow(unused_unsafe)] - let _ = unsafe{array.mul(idx,2 as $t).spawn()}; - } + let max_updates = max_updates!($t, num_pes); + let max_val = 2u128.pow((max_updates * num_pes) as u32) as $t; + let init_val = 1 as $t; + initialize_array!($array, array, init_val); + array.wait_all(); + array.barrier(); + // array.print(); + for idx in 0..array.len() { + for _i in 0..(max_updates as usize) { + #[allow(unused_unsafe)] + let _ = unsafe { array.mul(idx, 2 as $t).spawn() }; } - array.wait_all(); - array.barrier(); - // array.print(); - #[allow(unused_unsafe)] - for (i,elem) in unsafe{onesided_iter!($array,array).into_iter().enumerate()}{ - let val = *elem; - check_val!($array,val,max_val,success); - if !success{ - eprintln!("{:?} {:?} {:?}",i,val,max_val); - } + } + array.wait_all(); + array.barrier(); + // array.print(); + #[allow(unused_unsafe)] + for (i, elem) in unsafe { onesided_iter!($array, array).into_iter().enumerate() } { + let val = *elem; + check_val!($array, val, max_val, success); + if !success { + eprintln!("{:?} {:?} {:?}", i, val, max_val); } + } - array.barrier(); - initialize_array!($array, array, init_val); + array.barrier(); + initialize_array!($array, array, init_val); + let half_len = array_total_len / 2; + let start_i = half_len / 2; + let end_i = start_i + half_len; + let sub_array = array.sub_array(start_i..end_i); + sub_array.barrier(); + // // sub_array.print(); + for idx in 0..sub_array.len() { + for _i in 0..(max_updates as usize) { + #[allow(unused_unsafe)] + let _ = unsafe { sub_array.mul(idx, 2 as $t).spawn() }; + } + } + sub_array.wait_all(); + sub_array.barrier(); + #[allow(unused_unsafe)] + for (i, elem) in unsafe { onesided_iter!($array, sub_array).into_iter().enumerate() } { + let val = *elem; + check_val!($array, val, max_val, success); + if !success { + eprintln!("{:?} {:?} {:?}", i, val, max_val); + } + } + sub_array.barrier(); + initialize_array!($array, array, init_val); - let half_len = array_total_len/2; - let start_i = half_len/2; - let end_i = start_i + half_len; + let pe_len = array_total_len / num_pes; + for pe in 0..num_pes { + let len = std::cmp::max(pe_len / 2, 1); + let start_i = (pe * pe_len) + len / 2; + let end_i = start_i + len; let sub_array = array.sub_array(start_i..end_i); sub_array.barrier(); - // // sub_array.print(); - for idx in 0..sub_array.len(){ - for _i in 0..(max_updates as usize){ + for idx in 0..sub_array.len() { + for _i in 0..(max_updates as usize) { #[allow(unused_unsafe)] - let _ = unsafe{sub_array.mul(idx,2 as $t).spawn()}; + let _ = unsafe { sub_array.mul(idx, 2 as $t).spawn() }; } } sub_array.wait_all(); sub_array.barrier(); #[allow(unused_unsafe)] - for (i,elem) in unsafe{onesided_iter!($array,sub_array).into_iter().enumerate()}{ + for (i, elem) in unsafe { onesided_iter!($array, 
sub_array).into_iter().enumerate() } { let val = *elem; - check_val!($array,val,max_val,success); - if !success{ - eprintln!("{:?} {:?} {:?}",i,val,max_val); + check_val!($array, val, max_val, success); + if !success { + eprintln!("{:?} {:?} {:?}", i, val, max_val); } } sub_array.barrier(); initialize_array!($array, array, init_val); + } - - let pe_len = array_total_len/num_pes; - for pe in 0..num_pes{ - let len = std::cmp::max(pe_len/2,1); - let start_i = (pe*pe_len)+ len/2; - let end_i = start_i+len; - let sub_array = array.sub_array(start_i..end_i); - sub_array.barrier(); - for idx in 0..sub_array.len(){ - for _i in 0..(max_updates as usize){ - #[allow(unused_unsafe)] - let _ = unsafe{sub_array.mul(idx,2 as $t).spawn()}; - } - } - sub_array.wait_all(); - sub_array.barrier(); - #[allow(unused_unsafe)] - for (i,elem) in unsafe{onesided_iter!($array,sub_array).into_iter().enumerate()}{ - let val = *elem; - check_val!($array,val,max_val,success); - if !success{ - eprintln!("{:?} {:?} {:?}",i,val,max_val); - } - } - sub_array.barrier(); - initialize_array!($array, array, init_val); - } - - if !success{ - eprintln!("failed"); - } + if !success { + eprintln!("failed"); } - } + }}; } fn main() { diff --git a/tests/array/arithmetic_ops/sub_test.rs b/tests/array/arithmetic_ops/sub_test.rs index 84d89aed..64a85a55 100644 --- a/tests/array/arithmetic_ops/sub_test.rs +++ b/tests/array/arithmetic_ops/sub_test.rs @@ -79,176 +79,185 @@ macro_rules! onesided_iter { }; } -macro_rules! sub_test{ - ($array:ident, $t:ty, $len:expr, $dist:ident) =>{ - { - let world = lamellar::LamellarWorldBuilder::new().build(); - let num_pes = world.num_pes(); - let _my_pe = world.my_pe(); - let array_total_len = $len; +macro_rules! sub_test { + ($array:ident, $t:ty, $len:expr, $dist:ident) => {{ + let world = lamellar::LamellarWorldBuilder::new().build(); + let num_pes = world.num_pes(); + let _my_pe = world.my_pe(); + let array_total_len = $len; - let mut rng = rand::thread_rng(); - let rand_idx = Uniform::from(0..array_total_len); - #[allow(unused_mut)] - let mut success = true; - let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).block().into(); //convert into abstract LamellarArray, distributed len is total_len + let mut rng = rand::thread_rng(); + let rand_idx = Uniform::from(0..array_total_len); + #[allow(unused_mut)] + let mut success = true; + let array: $array<$t> = $array::<$t>::new(world.team(), array_total_len, $dist) + .block() + .into(); //convert into abstract LamellarArray, distributed len is total_len - let pe_max_val: $t = 100 as $t; - let max_val = pe_max_val * num_pes as $t; - let init_val = max_val as $t; - #[allow(unused)] - let zero = 0 as $t; - initialize_array!($array, array, init_val); - array.wait_all(); - array.barrier(); + let pe_max_val: $t = 100 as $t; + let max_val = pe_max_val * num_pes as $t; + let init_val = max_val as $t; + #[allow(unused)] + let zero = 0 as $t; + initialize_array!($array, array, init_val); + array.wait_all(); + array.barrier(); - for idx in 0..array.len(){ - for _i in 0..(pe_max_val as usize){ - #[allow(unused_unsafe)] - let _ = unsafe{array.sub(idx,1 as $t).spawn()}; - } + for idx in 0..array.len() { + for _i in 0..(pe_max_val as usize) { + #[allow(unused_unsafe)] + let _ = unsafe { array.sub(idx, 1 as $t).spawn() }; } - array.wait_all(); - array.barrier(); - #[allow(unused_unsafe)] - for (i,elem) in unsafe{onesided_iter!($array,array).into_iter().enumerate()}{ - let val = *elem; - check_val!($array,val,zero,success); - if !success{ - 
eprintln!("{:?} {:?} {:?}",i,val,max_val); - } + } + array.wait_all(); + array.barrier(); + #[allow(unused_unsafe)] + for (i, elem) in unsafe { onesided_iter!($array, array).into_iter().enumerate() } { + let val = *elem; + check_val!($array, val, zero, success); + if !success { + eprintln!("{:?} {:?} {:?}", i, val, max_val); } - array.barrier(); - let num_updates=max_updates!($t,num_pes); - let tot_updates = (num_updates*num_pes) as $t; - initialize_array!($array, array, tot_updates); - array.wait_all(); - array.barrier(); + } + array.barrier(); + let num_updates = max_updates!($t, num_pes); + let tot_updates = (num_updates * num_pes) as $t; + initialize_array!($array, array, tot_updates); + array.wait_all(); + array.barrier(); - for _i in 0..num_updates as usize{ - let idx = rand_idx.sample(&mut rng); + for _i in 0..num_updates as usize { + let idx = rand_idx.sample(&mut rng); + #[allow(unused_unsafe)] + let _ = unsafe { array.sub(idx, 1 as $t).spawn() }; + } + array.wait_all(); + array.barrier(); + #[allow(unused_unsafe)] + let sum = unsafe { + onesided_iter!($array, array) + .into_iter() + .fold(0, |acc, x| acc + *x as usize) + }; + let calced_sum = tot_updates as usize * (array.len() - 1); + check_val!($array, sum, calced_sum, success); + if !success { + eprintln!("{:?} {:?} {:?}", sum, calced_sum, (array.len() - 1)); + } + world.wait_all(); + world.barrier(); + initialize_array!($array, array, init_val); + + let half_len = array_total_len / 2; + let start_i = half_len / 2; + let end_i = start_i + half_len; + let rand_idx = Uniform::from(0..half_len); + let sub_array = array.sub_array(start_i..end_i); + sub_array.barrier(); + // sub_array.print(); + for idx in 0..sub_array.len() { + for _i in 0..(pe_max_val as usize) { #[allow(unused_unsafe)] - let _ = unsafe{array.sub(idx,1 as $t).spawn()}; + let _ = unsafe { sub_array.sub(idx, 1 as $t).spawn() }; } - array.wait_all(); - array.barrier(); - #[allow(unused_unsafe)] - let sum = unsafe{onesided_iter!($array,array).into_iter().fold(0,|acc,x| acc+ *x as usize)}; - let calced_sum = tot_updates as usize * (array.len()-1); - check_val!($array,sum,calced_sum,success); - if !success{ - eprintln!("{:?} {:?} {:?}",sum,calced_sum,(array.len()-1)); + } + sub_array.wait_all(); + sub_array.barrier(); + #[allow(unused_unsafe)] + for (i, elem) in unsafe { onesided_iter!($array, sub_array).into_iter().enumerate() } { + let val = *elem; + check_val!($array, val, zero, success); + if !success { + eprintln!("{:?} {:?} {:?}", i, val, max_val); } - world.wait_all(); - world.barrier(); - initialize_array!($array, array, init_val); - + } + sub_array.barrier(); + let num_updates = max_updates!($t, num_pes); + let tot_updates = (num_updates * num_pes) as $t; + initialize_array!($array, array, tot_updates); + sub_array.wait_all(); + sub_array.barrier(); + for _i in 0..num_updates as usize { + let idx = rand_idx.sample(&mut rng); + #[allow(unused_unsafe)] + let _ = unsafe { sub_array.sub(idx, 1 as $t).spawn() }; + } + sub_array.wait_all(); + sub_array.barrier(); + #[allow(unused_unsafe)] + let sum = unsafe { + onesided_iter!($array, sub_array) + .into_iter() + .fold(0, |acc, x| acc + *x as usize) + }; + let calced_sum = tot_updates as usize * (sub_array.len() - 1); + check_val!($array, sum, calced_sum, success); + if !success { + eprintln!("{:?} {:?} {:?}", sum, calced_sum, (sub_array.len() - 1)); + } + sub_array.wait_all(); + sub_array.barrier(); + initialize_array!($array, array, init_val); - let half_len = array_total_len/2; - let start_i = half_len/2; - let 
end_i = start_i + half_len; - let rand_idx = Uniform::from(0..half_len); + let pe_len = array_total_len / num_pes; + for pe in 0..num_pes { + let len = std::cmp::max(pe_len / 2, 1); + let start_i = (pe * pe_len) + len / 2; + let end_i = start_i + len; + let rand_idx = Uniform::from(0..len); let sub_array = array.sub_array(start_i..end_i); sub_array.barrier(); - // sub_array.print(); - for idx in 0..sub_array.len(){ - for _i in 0..(pe_max_val as usize){ + for idx in 0..sub_array.len() { + for _i in 0..(pe_max_val as usize) { #[allow(unused_unsafe)] - let _ = unsafe{sub_array.sub(idx,1 as $t).spawn()}; + let _ = unsafe { sub_array.sub(idx, 1 as $t).spawn() }; } } sub_array.wait_all(); sub_array.barrier(); - #[allow(unused_unsafe)] - for (i,elem) in unsafe{onesided_iter!($array,sub_array).into_iter().enumerate()}{ + #[allow(unused_unsafe)] + for (i, elem) in unsafe { onesided_iter!($array, sub_array).into_iter().enumerate() } { let val = *elem; - check_val!($array,val,zero,success); - if !success{ - eprintln!("{:?} {:?} {:?}",i,val,max_val); + check_val!($array, val, zero, success); + if !success { + eprintln!("{:?} {:?} {:?}", i, val, max_val); } } sub_array.barrier(); - let num_updates=max_updates!($t,num_pes); - let tot_updates = (num_updates*num_pes) as $t; + let num_updates = max_updates!($t, num_pes); + let tot_updates = (num_updates * num_pes) as $t; initialize_array!($array, array, tot_updates); sub_array.wait_all(); sub_array.barrier(); - for _i in 0..num_updates as usize{ + for _i in 0..num_updates as usize { let idx = rand_idx.sample(&mut rng); #[allow(unused_unsafe)] - let _ = unsafe{sub_array.sub(idx,1 as $t).spawn()}; + let _ = unsafe { sub_array.sub(idx, 1 as $t).spawn() }; } sub_array.wait_all(); sub_array.barrier(); - #[allow(unused_unsafe)] - let sum = unsafe {onesided_iter!($array,sub_array).into_iter().fold(0,|acc,x| acc+ *x as usize)}; - let calced_sum = tot_updates as usize * (sub_array.len()-1); - check_val!($array,sum,calced_sum,success); - if !success{ - eprintln!("{:?} {:?} {:?}",sum,calced_sum,(sub_array.len()-1)); + #[allow(unused_unsafe)] + let sum = unsafe { + onesided_iter!($array, sub_array) + .into_iter() + .fold(0, |acc, x| acc + *x as usize) + }; + let calced_sum = tot_updates as usize * (sub_array.len() - 1); + check_val!($array, sum, calced_sum, success); + if !success { + eprintln!("{:?} {:?} {:?}", sum, calced_sum, (sub_array.len() - 1)); } sub_array.wait_all(); sub_array.barrier(); - initialize_array!($array, array, init_val); - - - let pe_len = array_total_len/num_pes; - for pe in 0..num_pes{ - let len = std::cmp::max(pe_len/2,1); - let start_i = (pe*pe_len)+ len/2; - let end_i = start_i+len; - let rand_idx = Uniform::from(0..len); - let sub_array = array.sub_array(start_i..end_i); - sub_array.barrier(); - for idx in 0..sub_array.len(){ - for _i in 0..(pe_max_val as usize){ - #[allow(unused_unsafe)] - let _ = unsafe{sub_array.sub(idx,1 as $t).spawn()}; - } - } - sub_array.wait_all(); - sub_array.barrier(); - #[allow(unused_unsafe)] - for (i,elem) in unsafe{onesided_iter!($array,sub_array).into_iter().enumerate()}{ - let val = *elem; - check_val!($array,val,zero,success); - if !success{ - eprintln!("{:?} {:?} {:?}",i,val,max_val); - } - } - sub_array.barrier(); - let num_updates=max_updates!($t,num_pes); - let tot_updates = (num_updates*num_pes) as $t; - initialize_array!($array, array, tot_updates); - sub_array.wait_all(); - sub_array.barrier(); - - for _i in 0..num_updates as usize{ - let idx = rand_idx.sample(&mut rng); - #[allow(unused_unsafe)] - 
let _ = unsafe{sub_array.sub(idx,1 as $t).spawn()}; - } - sub_array.wait_all(); - sub_array.barrier(); - #[allow(unused_unsafe)] - let sum = unsafe{onesided_iter!($array,sub_array).into_iter().fold(0,|acc,x| acc+ *x as usize)}; - let calced_sum = tot_updates as usize * (sub_array.len()-1); - check_val!($array,sum,calced_sum,success); - if !success{ - eprintln!("{:?} {:?} {:?}",sum,calced_sum,(sub_array.len()-1)); - } - sub_array.wait_all(); - sub_array.barrier(); - initialize_array!($array, array, init_val); - } + initialize_array!($array, array, init_val); + } - if !success{ - eprintln!("failed"); - } + if !success { + eprintln!("failed"); } - } + }}; } fn main() { diff --git a/tests/array/atomic_ops/swap_test.rs b/tests/array/atomic_ops/swap_test.rs index f1139fb0..e055ed59 100644 --- a/tests/array/atomic_ops/swap_test.rs +++ b/tests/array/atomic_ops/swap_test.rs @@ -55,80 +55,125 @@ macro_rules! check_val { }; } -macro_rules! swap{ - ($array:ident, $t:ty, $len:expr, $dist:ident) =>{ - { - let world = lamellar::LamellarWorldBuilder::new().build(); - let num_pes = world.num_pes(); - let my_pe = world.my_pe(); - let array_total_len = $len; - #[allow(unused_mut)] - let mut success = true; - let array: $array::<$t> = $array::<$t>::new(world.team(), array_total_len, $dist).block().into(); //convert into abstract LamellarArray, distributed len is total_len +macro_rules! swap { + ($array:ident, $t:ty, $len:expr, $dist:ident) => {{ + let world = lamellar::LamellarWorldBuilder::new().build(); + let num_pes = world.num_pes(); + let my_pe = world.my_pe(); + let array_total_len = $len; + #[allow(unused_mut)] + let mut success = true; + let array: $array<$t> = $array::<$t>::new(world.team(), array_total_len, $dist) + .block() + .into(); //convert into abstract LamellarArray, distributed len is total_len - let init_val =(num_pes as $t); - initialize_array!($array, array, init_val); - array.wait_all(); - array.barrier(); + let init_val = (num_pes as $t); + initialize_array!($array, array, init_val); + array.wait_all(); + array.barrier(); - let mut reqs = vec![]; - for idx in 0..array.len(){ - if idx%num_pes == my_pe{ - #[allow(unused_unsafe)] - reqs.push((unsafe{array.swap(idx,my_pe as $t)},idx)); - } + let mut reqs = vec![]; + for idx in 0..array.len() { + if idx % num_pes == my_pe { + #[allow(unused_unsafe)] + reqs.push((unsafe { array.swap(idx, my_pe as $t) }, idx)); } - for (req,idx) in reqs{ - let val = world.block_on(req); - check_val!($array,val,init_val,success); - if !success{ - eprintln!("{:?} {:?} {:?}",idx,val,init_val); - } + } + for (req, idx) in reqs { + let val = world.block_on(req); + check_val!($array, val, init_val, success); + if !success { + eprintln!("{:?} {:?} {:?}", idx, val, init_val); } + } - array.wait_all(); - array.barrier(); + array.wait_all(); + array.barrier(); - let mut reqs = vec![]; - for idx in 0..array.len(){ + let mut reqs = vec![]; + for idx in 0..array.len() { + #[allow(unused_unsafe)] + reqs.push((unsafe { array.load(idx) }, idx)); + } + for (req, idx) in reqs { + let val = world.block_on(req); + let check_val = (idx % num_pes) as $t; + let val = val; + check_val!($array, val, check_val, success); + if !success { + eprintln!("{:?} {:?} {:?}", idx, val, check_val); + } + } + + array.barrier(); + initialize_array!($array, array, init_val); + array.wait_all(); + array.barrier(); + + let half_len = array_total_len / 2; + let start_i = half_len / 2; + let end_i = start_i + half_len; + let sub_array = array.sub_array(start_i..end_i); + sub_array.barrier(); + + let mut 
reqs = vec![]; + for idx in 0..sub_array.len() { + if idx % num_pes == my_pe { #[allow(unused_unsafe)] - reqs.push((unsafe{array.load(idx)},idx)); + reqs.push((unsafe { sub_array.swap(idx, my_pe as $t) }, idx)); } - for (req,idx) in reqs{ - let val = world.block_on(req); - let check_val = (idx%num_pes) as $t; - let val = val; - check_val!($array,val,check_val,success); - if !success{ - eprintln!("{:?} {:?} {:?}",idx,val,check_val); - } + } + for (req, idx) in reqs { + let val = world.block_on(req); + check_val!($array, val, init_val, success); + if !success { + eprintln!("{:?} {:?} {:?}", idx, val, init_val); } + } - array.barrier(); - initialize_array!($array, array, init_val); - array.wait_all(); - array.barrier(); + sub_array.wait_all(); + sub_array.barrier(); + let mut reqs = vec![]; + for idx in 0..sub_array.len() { + #[allow(unused_unsafe)] + reqs.push((unsafe { sub_array.load(idx) }, idx)); + } + for (req, idx) in reqs { + let val = world.block_on(req); + let check_val = (idx % num_pes) as $t; + let val = val; + check_val!($array, val, check_val, success); + if !success { + eprintln!("{:?} {:?} {:?}", idx, val, check_val); + } + } + sub_array.barrier(); + initialize_array!($array, array, init_val); + sub_array.wait_all(); + sub_array.barrier(); - let half_len = array_total_len/2; - let start_i = half_len/2; - let end_i = start_i + half_len; + let pe_len = array_total_len / num_pes; + for pe in 0..num_pes { + let len = std::cmp::max(pe_len / 2, 1); + let start_i = (pe * pe_len) + len / 2; + let end_i = start_i + len; let sub_array = array.sub_array(start_i..end_i); sub_array.barrier(); let mut reqs = vec![]; - for idx in 0..sub_array.len(){ - if idx%num_pes == my_pe{ + for idx in 0..sub_array.len() { + if idx % num_pes == my_pe { #[allow(unused_unsafe)] - reqs.push((unsafe{sub_array.swap(idx,my_pe as $t)},idx)); + reqs.push((unsafe { sub_array.swap(idx, my_pe as $t) }, idx)); } } - for (req,idx) in reqs{ - let val = world.block_on(req); - check_val!($array,val,init_val,success); - if !success{ - eprintln!("{:?} {:?} {:?}",idx,val,init_val); + for (req, idx) in reqs { + let val = world.block_on(req); + check_val!($array, val, init_val, success); + if !success { + eprintln!("{:?} {:?} {:?}", idx, val, init_val); } } @@ -136,17 +181,17 @@ macro_rules! swap{ sub_array.barrier(); let mut reqs = vec![]; - for idx in 0..sub_array.len(){ + for idx in 0..sub_array.len() { #[allow(unused_unsafe)] - reqs.push((unsafe{sub_array.load(idx)},idx)); + reqs.push((unsafe { sub_array.load(idx) }, idx)); } - for (req,idx) in reqs{ - let val = world.block_on(req); - let check_val = (idx%num_pes) as $t; + for (req, idx) in reqs { + let val = world.block_on(req); + let check_val = (idx % num_pes) as $t; let val = val; - check_val!($array,val,check_val,success); - if !success{ - eprintln!("{:?} {:?} {:?}",idx,val,check_val); + check_val!($array, val, check_val, success); + if !success { + eprintln!("{:?} {:?} {:?}", idx, val, check_val); } } @@ -154,62 +199,12 @@ macro_rules! 
swap{ initialize_array!($array, array, init_val); sub_array.wait_all(); sub_array.barrier(); + } - - - let pe_len = array_total_len/num_pes; - for pe in 0..num_pes{ - let len = std::cmp::max(pe_len/2,1); - let start_i = (pe*pe_len)+ len/2; - let end_i = start_i+len; - let sub_array = array.sub_array(start_i..end_i); - sub_array.barrier(); - - let mut reqs = vec![]; - for idx in 0..sub_array.len(){ - if idx%num_pes == my_pe{ - #[allow(unused_unsafe)] - reqs.push((unsafe{sub_array.swap(idx,my_pe as $t)},idx)); - } - } - for (req,idx) in reqs{ - let val = world.block_on(req); - check_val!($array,val,init_val,success); - if !success{ - eprintln!("{:?} {:?} {:?}",idx,val,init_val); - } - } - - sub_array.wait_all(); - sub_array.barrier(); - - let mut reqs = vec![]; - for idx in 0..sub_array.len(){ - - #[allow(unused_unsafe)] - reqs.push((unsafe{sub_array.load(idx)},idx)); - } - for (req,idx) in reqs{ - let val = world.block_on(req); - let check_val = (idx%num_pes) as $t; - let val = val; - check_val!($array,val,check_val,success); - if !success{ - eprintln!("{:?} {:?} {:?}",idx,val,check_val); - } - } - - sub_array.barrier(); - initialize_array!($array, array, init_val); - sub_array.wait_all(); - sub_array.barrier(); - } - - if !success{ - eprintln!("failed"); - } + if !success { + eprintln!("failed"); } - } + }}; } fn main() { From 19802e763965b3e688bb1ba4ed47bc61dcd22059 Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Wed, 13 Nov 2024 09:40:16 -0800 Subject: [PATCH 110/116] fix doc test --- src/array/operations/shift.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/array/operations/shift.rs b/src/array/operations/shift.rs index afe5e99e..78103d34 100644 --- a/src/array/operations/shift.rs +++ b/src/array/operations/shift.rs @@ -384,7 +384,7 @@ pub trait ShiftOps: private::LamellarArrayPrivate { /// let array = UnsafeArray::::new(&world,100,Distribution::Block).block(); /// /// let indices = vec![3,54,12,88,29,68]; -/// array.block_on(array.batch_fetch_shl(indices,2)); +/// unsafe {array.batch_fetch_shl(indices,2).block()}; ///``` pub trait UnsafeShiftOps: private::LamellarArrayPrivate { /// This call performs an in place left shift of `val` bits on the element specified by `index`. From 28552fdffa3deae7d366e2c961124ef173cc0681 Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Wed, 13 Nov 2024 22:10:54 -0800 Subject: [PATCH 111/116] remove some printlns --- src/lamellar_task_group.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/lamellar_task_group.rs b/src/lamellar_task_group.rs index 0fe653c5..7a2a2d18 100644 --- a/src/lamellar_task_group.rs +++ b/src/lamellar_task_group.rs @@ -1757,9 +1757,9 @@ pub struct TypedAmGroupResultIter<'a, T> { impl<'a, T> Iterator for TypedAmGroupResultIter<'a, T> { type Item = AmGroupResult<'a, T>; fn next(&mut self) -> Option { - if self.index % 10000 == 0 { - println!("TypedAmGroupResultIter index: {}", self.index); - } + // if self.index % 10000 == 0 { + // println!("TypedAmGroupResultIter index: {}", self.index); + // } if self.index < self.results.len() { let index = self.index; self.index += 1; From 819687ae4fbeecc87806f7434ac2028f8766415d Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Wed, 13 Nov 2024 22:47:59 -0800 Subject: [PATCH 112/116] prep for v0.7.0-rc.1 --- Cargo.toml | 4 +-- README.md | 69 +++++++++++++++++++++++++++++++++++-------------- impl/Cargo.toml | 2 +- src/lib.rs | 13 ++++------ 4 files changed, 58 insertions(+), 30 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 49f151a8..64312780 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "lamellar" -version = "0.7.0" +version = "0.7.0-rc.1" authors = ["Ryan D. Friese ", "Roberto Gioiosa ", "Joseph Cottam ","Greg Roek ","Erdal Mutlu "] edition = "2021" description = "Lamellar is an asynchronous tasking runtime for HPC systems developed in RUST." @@ -12,7 +12,7 @@ keywords = ["hpc","runtime","pgas","distributed","asynchronous"] categories = ["asynchronous","concurrency", "network-programming","science"] [dependencies] -lamellar-impl = { version = "0.7.0", path = "impl" } +lamellar-impl = { version = "0.7.0-rc.1", path = "impl" } #rofisys = { version ="0.3", optional = true } rofisys = {git = "https://github.com/pnnl/rofi-sys.git", branch = "master", optional = true} inventory = "0.3" diff --git a/README.md b/README.md index 780929bf..96ae763c 100644 --- a/README.md +++ b/README.md @@ -63,8 +63,26 @@ Additional information on using each of the lamellae backends can be found below # Environment Variables +Please see [env_var.rs] for a description of available environment variables. + +Commonly used variables include: + - `LAMELLAR_THREADS` - The number of worker threads used within a lamellar PE, defaults to [std::thread::available_parallelism] if available or else 4 + - `LAMELLAR_BACKEND` - the backend used during execution. Note that if a backend is explicitly set in the world builder, this variable is ignored. + - possible values + - `local` -- default (if `enable-local` feature is not active) + - `shmem` + - `rofi` -- only available with the `enable-rofi` feature in which case it is the default backend + - `LAMELLAR_EXECUTOR` - the executor used during execution. Note that if a executor is explicitly set in the world builder, this variable is ignored. + - possible values + - `lamellar` -- default, work stealing backend + - `async_std` -- alternative backend from async_std + - `tokio` -- only available with the `tokio-executor` feature in which case it is the default executor + + Examples -------- +All of the examples in the [documentation](https://docs.rs/lamellar/latest/lamellar) should also be valid Lamellar programs (please open an issue if you encounter an issue). 
+ Our repository also provides numerous examples highlighting various features of the runtime: Additionally, we are compiling a set of benchmarks (some with multiple implementations) that may be helpful to look at as well: @@ -114,12 +132,12 @@ fn main(){ let num_pes = world.num_pes(); let am = HelloWorld { my_pe: my_pe }; for pe in 0..num_pes{ - world.exec_am_pe(pe,am.clone()); // explicitly launch on each PE + world.exec_am_pe(pe,am.clone()).spawn(); // explicitly launch on each PE } world.wait_all(); // wait for all active messages to finish world.barrier(); // synchronize with other PEs let request = world.exec_am_all(am.clone()); //also possible to execute on every PE with a single call - world.block_on(request); //both exec_am_all and exec_am_pe return futures that can be used to wait for completion and access any returned result + request.block(); //both exec_am_all and exec_am_pe return futures that can be used to wait for completion and access any returned result } ``` @@ -131,9 +149,8 @@ use lamellar::array::prelude::*; fn main(){ let world = lamellar::LamellarWorldBuilder::new().build(); let my_pe = world.my_pe(); - let block_array = AtomicArray::<usize>::new(&world, 1000, Distribution::Block); //we also support Cyclic distribution. - block_array.dist_iter_mut().enumerate().for_each(move |(i,elem)| elem.store(i) ); //simultaneosuly initialize array accross all pes, each pe only updates its local data - block_array.wait_all(); + let block_array = AtomicArray::<usize>::new(&world, 1000, Distribution::Block).block(); //we also support Cyclic distribution. + block_array.dist_iter_mut().enumerate().for_each(move |(i,elem)| elem.store(i) ).block(); //simultaneously initialize array across all pes, each pe only updates its local data block_array.barrier(); if my_pe == 0{ for (i,elem) in block_onesided_iter!($array,array).into_iter().enumerate(){ //iterate through entire array on pe 0 (automatically transfering remote data) @@ -165,11 +182,11 @@ fn main(){ let mut world = lamellar::LamellarWorldBuilder::new().build(); let my_pe = world.my_pe(); let num_pes = world.num_pes(); - let cnt = Darc::new(&world, AtomicUsize::new()); + let cnt = Darc::new(&world, AtomicUsize::new(0)).block().expect("calling pe is in the world"); for pe in 0..num_pes{ - world.exec_am_pe(pe,DarcAm{cnt: cnt.clone()}); // explicitly launch on each PE + world.exec_am_pe(pe,DarcAm{cnt: cnt.clone()}).spawn(); // explicitly launch on each PE } - world.exec_am_all(am.clone()); //also possible to execute on every PE with a single call + world.exec_am_all(DarcAm{cnt: cnt.clone()}).spawn(); //also possible to execute on every PE with a single call cnt.fetch_add(1,Ordering::SeqCst); //this is valid as well! world.wait_all(); // wait for all active messages to finish world.barrier(); // synchronize with other PEs @@ -180,11 +197,11 @@ fn main(){ Lamellar is capable of running on single node workstations as well as distributed HPC systems. For a workstation, simply copy the following to the dependency section of you Cargo.toml file: -``` lamellar = "0.6.1" ``` +``` lamellar = "0.7.0-rc.1" ``` If planning to use within a distributed HPC system copy the following to your Cargo.toml file: -```lamellar = { version = "0.6.1", features = ["enable-rofi"]}``` +```lamellar = { version = "0.7.0-rc.1", features = ["enable-rofi"]}``` NOTE: as of Lamellar 0.6.1 It is no longer necessary to manually install Libfabric, the build process will now try to automatically build libfabric for you. 
If this process fails, it is still possible to pass in a manual libfabric installation via the OFI_DIR envrionment variable. @@ -212,8 +229,16 @@ There are a number of ways to run Lamellar applications, mostly dictated by the - `pmi2` library is required to grab info about the allocated nodes and helps set up initial handshakes +Repository Organization +----------------------- + +Generally the 'master' branch corresponds to the latest stable release at [https://crates.io/crates/lamellar] and [https://docs.rs/lamellar/latest/lamellar/]. +The 'dev' branch will contain the most recent 'working' features, where working means all the examples compile and execute properly (but the documentation may not yet be up-to-date). +All other branches are active feature branches and may or may not be in a working state. + NEWS ---- +* November 2024: Alpha release -- v0.7.1 * February 2023: Alpha release -- v0.6.1 * November 2023: Alpha release -- v0.6 * January 2023: Alpha release -- v0.5 @@ -274,6 +299,14 @@ Note: we do an explicit build instead of `cargo run --examples` as they are inte HISTORY ------- +- version 0.7.0 + - add support for integration with various async executor backends including tokio and async-std + - 'handle' based api, allowing for 'spawn()'ing, 'block()'ing, and 'await'ing remote operations. + - conversion from `Pin>` to concrete types for most remote operations. + - improved execution time warning framework for potential deadlock, unexecuted remote operations, blocking calls in async code, etc. + - can be completely disabled + - can panic instead of print warning + - various optimizations and bug fixes - version 0.6.1 - Clean up apis for lock based data structures - N-way dissemination barrier @@ -366,15 +399,13 @@ CONTACTS Current Team Members -Ryan Friese - ryan.friese@pnnl.gov -Roberto Gioiosa - roberto.gioiosa@pnnl.gov -Erdal Mutlu - erdal.mutlu@pnnl.gov -Joseph Cottam - joseph.cottam@pnnl.gov -Greg Roek - gregory.roek@pnnl.gov - -Past Team Members - -Mark Raugas - mark.raugas@pnnl.gov +Ryan Friese - ryan.friese@pnnl.gov +Roberto Gioiosa - roberto.gioiosa@pnnl.gov +Polykarpos Thomadakis - polykarpos.thomadakis@pnnl.gov +Erdal Mutlu - erdal.mutlu@pnnl.gov +Joseph Cottam - joseph.cottam@pnnl.gov +Greg Roek - gregory.roek@pnnl.gov +Mark Raugas - mark.raugas@pnnl.gov ## License diff --git a/impl/Cargo.toml b/impl/Cargo.toml index 35f66007..cc9339f2 100644 --- a/impl/Cargo.toml +++ b/impl/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "lamellar-impl" -version = "0.7.0" +version = "0.7.0-rc.1" authors = ["Ryan D. Friese ", "Roberto Gioiosa ", "Joseph Cottam ","Greg Roek ","Erdal Mutlu "] edition = "2021" description = "Lamellar is an asynchronous tasking runtime for HPC systems developed in RUST." diff --git a/src/lib.rs b/src/lib.rs index e07ded3d..2bf6fb67 100755 --- a/src/lib.rs +++ b/src/lib.rs @@ -183,17 +183,14 @@ //! Lamellar is capable of running on single node workstations as well as distributed HPC systems. //! For a workstation, simply copy the following to the dependency section of you Cargo.toml file: //! -//!``` lamellar = "0.5" ``` +//!``` lamellar = "0.7.0-rc.1" ``` //! -//! If planning to use within a distributed HPC system a few more steps may be necessary (this also works on single workstations): +//! If planning to use within a distributed HPC system copy the following to your Cargo.toml file: //! -//! 1. ensure Libfabric (with support for the verbs provider) is installed on your system -//! 2. 
set the OFI_DIR environment variable to the install location of Libfabric, this directory should contain both the following directories: -//! * lib -//! * include -//! 3. copy the following to your Cargo.toml file: +//! ``` lamellar = { version = "0.7.0-rc.1", features = ["enable-rofi"]}``` //! -//! ```lamellar = { version = "0.5", features = ["enable-rofi"]}``` +//! NOTE: as of Lamellar 0.6.1 It is no longer necessary to manually install Libfabric, the build process will now try to automatically build libfabric for you. +//! If this process fails, it is still possible to pass in a manual libfabric installation via the OFI_DIR envrionment variable. //! //! //! For both environments, build your application as normal From 0d9d14cf767fac500da76f3e05b65cb52a10a074 Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Thu, 14 Nov 2024 17:52:07 -0800 Subject: [PATCH 113/116] fix task group am_all completion detection --- .../active_message_examples/am_no_return.rs | 24 ++++++++++++++++++ src/lamellar_task_group.rs | 25 +++++++++++++++---- 2 files changed, 44 insertions(+), 5 deletions(-) diff --git a/examples/active_message_examples/am_no_return.rs b/examples/active_message_examples/am_no_return.rs index 27db3601..3ba102a7 100644 --- a/examples/active_message_examples/am_no_return.rs +++ b/examples/active_message_examples/am_no_return.rs @@ -95,6 +95,30 @@ fn main() { println!("Task Group---------------------------------------------------------------"); + let task_group = LamellarTaskGroup::new(world.clone()); + for i in 0..10 { + task_group + .exec_am_pe( + i % num_pes, + AmNoReturn { + my_pe: i, + test_var: 10 * (i as u16), + }, + ) + .block(); + task_group + .exec_am_all(AmNoReturn { + my_pe: i, + test_var: 10 * (i as u16), + }) + .block(); + } + // let res = world.block_on(am_group.exec()); + for r in res.iter() { + println!("PE[{:?}] return result: {:?}", my_pe, r); + } + println!("Typed Am Group---------------------------------------------------------------"); + // let mut am_group = typed_am_group!(AmNoReturn,world.clone()); // am_group.add_am_all(am.clone()); // am_group.add_am_pe(0,am.clone()); diff --git a/src/lamellar_task_group.rs b/src/lamellar_task_group.rs index 7a2a2d18..5dd72fc4 100644 --- a/src/lamellar_task_group.rs +++ b/src/lamellar_task_group.rs @@ -268,12 +268,26 @@ impl LamellarRequestAddResult for TaskGroupMultiAmHandleInner { fn add_result(&self, pe: usize, sub_id: usize, data: InternalResult) { let pe = self.arch.team_pe(pe).expect("pe does not exist on team"); let mut map = self.data.lock(); //.insert(pe, data); - map.entry(sub_id) - .or_insert_with(|| HashMap::new()) - .insert(pe, data); - if let Some(waker) = self.wakers.lock().remove(&sub_id) { - waker.wake(); + let reqs = map.entry(sub_id).or_insert_with(|| HashMap::new()); + reqs.insert(pe, data); + + if reqs.len() == self.arch.num_pes() { + if let Some(waker) = self.wakers.lock().remove(&sub_id) { + // println!("0. waker found for sub_id {}", sub_id); + waker.wake(); + } + // else { + // println!("0. no waker found for sub_id {}", sub_id); + // } + } else { + if let Some(waker) = self.wakers.lock().get(&sub_id) { + // println!("1. waker found for sub_id {}", sub_id); + waker.wake_by_ref(); + } + // else { + // println!("1. 
no waker found for sub_id {}", sub_id); + // } } } fn update_counters(&self, _sub_id: usize) { @@ -404,6 +418,7 @@ impl LamellarRequest for TaskGroupMultiAmHandle { if let Some(req) = data.get(&self.sub_id) { req.len() == self.inner.arch.num_pes() } else { + // println!("setting waker for sub_id {}", self.sub_id); self.inner.wakers.lock().insert(self.sub_id, waker.clone()); self.inner .wakers From b753f5e8eb00c01d4e7983230d1fde2bcb3b0d05 Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Thu, 14 Nov 2024 17:53:02 -0800 Subject: [PATCH 114/116] better wait_all non spawned handle detection --- .../batching/simple_batcher.rs | 4 + .../batching/team_am_batcher.rs | 4 + .../registered_active_message.rs | 10 +- src/lamellar_team.rs | 96 +++++++++++-------- 4 files changed, 73 insertions(+), 41 deletions(-) diff --git a/src/active_messaging/batching/simple_batcher.rs b/src/active_messaging/batching/simple_batcher.rs index 2be42094..4f2f791a 100644 --- a/src/active_messaging/batching/simple_batcher.rs +++ b/src/active_messaging/batching/simple_batcher.rs @@ -553,6 +553,8 @@ impl SimpleBatcher { // std::thread::current().id() // ); let ame = ame.clone(); + world.team.world_counters.inc_outstanding(1); + team.team.team_counters.inc_outstanding(1); self.executor.submit_task(async move { let am = match am .exec( @@ -571,6 +573,8 @@ impl SimpleBatcher { panic!("Should not be returning local data or AM from remote am"); } }; + world.team.world_counters.dec_outstanding(1); + team.team.team_counters.dec_outstanding(1); ame.process_msg(am, 0, false).await; }); } diff --git a/src/active_messaging/batching/team_am_batcher.rs b/src/active_messaging/batching/team_am_batcher.rs index c6b25481..6342955e 100644 --- a/src/active_messaging/batching/team_am_batcher.rs +++ b/src/active_messaging/batching/team_am_batcher.rs @@ -816,6 +816,8 @@ impl TeamAmBatcher { }; let ame = ame.clone(); + world.team.world_counters.inc_outstanding(1); + team.team.team_counters.inc_outstanding(1); self.executor.submit_task(async move { let am = match am .exec( @@ -834,6 +836,8 @@ impl TeamAmBatcher { panic!("Should not be returning local data or AM from remote am"); } }; + world.team.world_counters.dec_outstanding(1); + team.team.team_counters.dec_outstanding(1); ame.process_msg(am, 0, false).await; }); } diff --git a/src/active_messaging/registered_active_message.rs b/src/active_messaging/registered_active_message.rs index 67962333..cf0ee9e2 100644 --- a/src/active_messaging/registered_active_message.rs +++ b/src/active_messaging/registered_active_message.rs @@ -402,6 +402,8 @@ impl RegisteredActiveMessages { team: Arc, ) { // println!("[{:?}] exec_local_am", std::thread::current().id()); + world.team.world_counters.inc_outstanding(1); + team.team.team_counters.inc_outstanding(1); match am .exec( req_data.team.world_pe, @@ -422,7 +424,7 @@ impl RegisteredActiveMessages { } LamellarReturn::LocalAm(am) => { // println!("[{:?}] local am am return", std::thread::current().id()); - self.exec_local_am(req_data, am.as_local(), world, team) + self.exec_local_am(req_data, am.as_local(), world.clone(), team.clone()) .await; } LamellarReturn::Unit => { @@ -433,6 +435,8 @@ impl RegisteredActiveMessages { panic!("should not be returning remote data or am from local am"); } } + world.team.world_counters.dec_outstanding(1); + team.team.team_counters.dec_outstanding(1); } //#[tracing::instrument(skip_all)] @@ -463,6 +467,8 @@ impl RegisteredActiveMessages { team_addr: team.team.remote_ptr_addr, }; + world.team.world_counters.inc_outstanding(1); 
+ team.team.team_counters.inc_outstanding(1); let am = match am .exec( team.team.world_pe, @@ -484,6 +490,8 @@ impl RegisteredActiveMessages { self.executor.submit_task(async move { ame.process_msg(am, 0, false).await; }); + world.team.world_counters.dec_outstanding(1); + team.team.team_counters.dec_outstanding(1); //compare against: // ame.process_msg(am, 0, true).await; } diff --git a/src/lamellar_team.rs b/src/lamellar_team.rs index 23c4b3d0..eb9d4a99 100644 --- a/src/lamellar_team.rs +++ b/src/lamellar_team.rs @@ -1429,53 +1429,69 @@ impl LamellarTeamRT { // self.team_counters.send_req_cnt.load(Ordering::SeqCst), // self.team_counters.outstanding_reqs.load(Ordering::SeqCst), // ); - while self.panic.load(Ordering::SeqCst) == 0 - && ((self.team_counters.outstanding_reqs.load(Ordering::SeqCst) > 0 - || orig_reqs != self.team_counters.send_req_cnt.load(Ordering::SeqCst) - || orig_launched != self.team_counters.launched_req_cnt.load(Ordering::SeqCst)) + let mut done = false; + while !done { + while self.panic.load(Ordering::SeqCst) == 0 + && ((self.team_counters.outstanding_reqs.load(Ordering::SeqCst) > 0 + || orig_reqs != self.team_counters.send_req_cnt.load(Ordering::SeqCst) + || orig_launched != self.team_counters.launched_req_cnt.load(Ordering::SeqCst)) + || (self.parent.is_none() + && (self.world_counters.outstanding_reqs.load(Ordering::SeqCst) > 0 + || world_orig_reqs + != self.world_counters.send_req_cnt.load(Ordering::SeqCst) + || world_orig_launched + != self.world_counters.launched_req_cnt.load(Ordering::SeqCst)))) + { + orig_reqs = self.team_counters.send_req_cnt.load(Ordering::SeqCst); + orig_launched = self.team_counters.launched_req_cnt.load(Ordering::SeqCst); + world_orig_reqs = self.world_counters.send_req_cnt.load(Ordering::SeqCst); + world_orig_launched = self.world_counters.launched_req_cnt.load(Ordering::SeqCst); + // std::thread::yield_now(); + // self.flush(); + if std::thread::current().id() != *crate::MAIN_THREAD { + self.scheduler.exec_task() + }; //mmight as well do useful work while we wait } + if temp_now.elapsed().as_secs_f64() > config().deadlock_timeout { + println!( + "in team wait_all mype: {:?} cnt: {:?} {:?}", + self.world_pe, + self.team_counters.send_req_cnt.load(Ordering::SeqCst), + self.team_counters.outstanding_reqs.load(Ordering::SeqCst), + ); + temp_now = Instant::now(); + } + } + if self.team_counters.send_req_cnt.load(Ordering::SeqCst) + != self.team_counters.launched_req_cnt.load(Ordering::SeqCst) || (self.parent.is_none() - && (self.world_counters.outstanding_reqs.load(Ordering::SeqCst) > 0 - || world_orig_reqs - != self.world_counters.send_req_cnt.load(Ordering::SeqCst) - || world_orig_launched - != self.world_counters.launched_req_cnt.load(Ordering::SeqCst)))) - { - orig_reqs = self.team_counters.send_req_cnt.load(Ordering::SeqCst); - orig_launched = self.team_counters.launched_req_cnt.load(Ordering::SeqCst); - world_orig_reqs = self.world_counters.send_req_cnt.load(Ordering::SeqCst); - world_orig_launched = self.world_counters.launched_req_cnt.load(Ordering::SeqCst); - // std::thread::yield_now(); - // self.flush(); - if std::thread::current().id() != *crate::MAIN_THREAD { - self.scheduler.exec_task() - }; //mmight as well do useful work while we wait } - if temp_now.elapsed().as_secs_f64() > config().deadlock_timeout { + && self.world_counters.send_req_cnt.load(Ordering::SeqCst) + != self.world_counters.launched_req_cnt.load(Ordering::SeqCst)) + { + if (self.team_counters.outstanding_reqs.load(Ordering::SeqCst) > 0 + || orig_reqs != 
self.team_counters.send_req_cnt.load(Ordering::SeqCst) + || orig_launched != self.team_counters.launched_req_cnt.load(Ordering::SeqCst)) + || (self.parent.is_none() + && (self.world_counters.outstanding_reqs.load(Ordering::SeqCst) > 0 + || world_orig_reqs + != self.world_counters.send_req_cnt.load(Ordering::SeqCst) + || world_orig_launched + != self.world_counters.launched_req_cnt.load(Ordering::SeqCst))) + { + continue; + } println!( - "in team wait_all mype: {:?} cnt: {:?} {:?}", + "in team wait_all mype: {:?} cnt: {:?} {:?} {:?}", self.world_pe, self.team_counters.send_req_cnt.load(Ordering::SeqCst), self.team_counters.outstanding_reqs.load(Ordering::SeqCst), + self.team_counters.launched_req_cnt.load(Ordering::SeqCst) ); - temp_now = Instant::now(); + RuntimeWarning::UnspawnedTask( + "`wait_all` before all tasks/active messages have been spawned", + ) + .print(); } - } - if self.team_counters.send_req_cnt.load(Ordering::SeqCst) - != self.team_counters.launched_req_cnt.load(Ordering::SeqCst) - || (self.parent.is_none() - && self.world_counters.send_req_cnt.load(Ordering::SeqCst) - != self.world_counters.launched_req_cnt.load(Ordering::SeqCst)) - { - println!( - "in team wait_all mype: {:?} cnt: {:?} {:?} {:?}", - self.world_pe, - self.team_counters.send_req_cnt.load(Ordering::SeqCst), - self.team_counters.outstanding_reqs.load(Ordering::SeqCst), - self.team_counters.launched_req_cnt.load(Ordering::SeqCst) - ); - RuntimeWarning::UnspawnedTask( - "`wait_all` before all tasks/active messages have been spawned", - ) - .print(); + done = true; } // println!( // "in team wait_all mype: {:?} cnt: {:?} {:?}", From 36fb3c9220ced1f089633171e2623508f9a8cc14 Mon Sep 17 00:00:00 2001 From: "Ryan D. Friese" Date: Thu, 14 Nov 2024 21:52:16 -0800 Subject: [PATCH 115/116] fix task group wait all --- src/lamellar_task_group.rs | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/lamellar_task_group.rs b/src/lamellar_task_group.rs index 5dd72fc4..328f02a3 100644 --- a/src/lamellar_task_group.rs +++ b/src/lamellar_task_group.rs @@ -276,7 +276,7 @@ impl LamellarRequestAddResult for TaskGroupMultiAmHandleInner { if let Some(waker) = self.wakers.lock().remove(&sub_id) { // println!("0. waker found for sub_id {}", sub_id); waker.wake(); - } + } // else { // println!("0. no waker found for sub_id {}", sub_id); // } @@ -284,8 +284,8 @@ impl LamellarRequestAddResult for TaskGroupMultiAmHandleInner { if let Some(waker) = self.wakers.lock().get(&sub_id) { // println!("1. waker found for sub_id {}", sub_id); waker.wake_by_ref(); - } - // else { + } + // else { // println!("1. no waker found for sub_id {}", sub_id); // } } @@ -415,9 +415,11 @@ impl LamellarRequest for TaskGroupMultiAmHandle { fn ready_or_set_waker(&mut self, waker: &Waker) -> bool { self.launch_am_if_needed(); let data = self.inner.data.lock(); + let mut ready = false; if let Some(req) = data.get(&self.sub_id) { - req.len() == self.inner.arch.num_pes() - } else { + ready = req.len() == self.inner.arch.num_pes(); + } + if !ready { // println!("setting waker for sub_id {}", self.sub_id); self.inner.wakers.lock().insert(self.sub_id, waker.clone()); self.inner @@ -432,8 +434,8 @@ impl LamellarRequest for TaskGroupMultiAmHandle { w.clone_from(waker); }) .or_insert(waker.clone()); - false } + ready } fn val(&self) -> Self::Output { From 22f25a415c5dd1d4a9beb7ddd74aa035a7f5e2f0 Mon Sep 17 00:00:00 2001 From: "Ryan D. 
Friese" Date: Thu, 14 Nov 2024 21:55:07 -0800 Subject: [PATCH 116/116] update rofi-sys version --- Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 64312780..33689179 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,8 +13,8 @@ categories = ["asynchronous","concurrency", "network-programming","science"] [dependencies] lamellar-impl = { version = "0.7.0-rc.1", path = "impl" } -#rofisys = { version ="0.3", optional = true } -rofisys = {git = "https://github.com/pnnl/rofi-sys.git", branch = "master", optional = true} +rofisys = { version ="0.3", optional = true } +#rofisys = {git = "https://github.com/pnnl/rofi-sys.git", branch = "master", optional = true} inventory = "0.3" serde = { version = "1.0.147", features = ["derive"] } serde_bytes = "0.11.7"